Compare commits

..

554 Commits

Author SHA1 Message Date
Johannes Gäßler
9e4475f5cf Fixed OpenCL offloading prints (#2082) 2023-07-05 08:58:05 +02:00
Nigel Bosch
7f0e9a775e embd-input: Fix input embedding example unsigned int seed (#2105) 2023-07-05 07:33:33 +08:00
Georgi Gerganov
b472f3fca5 readme : add link web chat PR 2023-07-04 22:25:22 +03:00
Georgi Gerganov
ed9a54e512 ggml : sync latest (new ops, macros, refactoring) (#2106)
- add ggml_argmax()
- add ggml_tanh()
- add ggml_elu()
- refactor ggml_conv_1d() and variants
- refactor ggml_conv_2d() and variants
- add helper macros to reduce code duplication in ggml.c
2023-07-04 21:54:11 +03:00
jwj7140
f257fd2550 Add an API example using server.cpp similar to OAI. (#2009)
* add api_like_OAI.py
* add evaluated token count to server
* add /v1/ endpoints binding
2023-07-04 21:06:12 +03:00
Tobias Lütke
7ee76e45af Simple webchat for server (#1998)
* expose simple web interface on root domain

* embed index and add --path for choosing static dir

* allow server to multithread

because web browsers send a lot of garbage requests we want the server
to multithread when serving 404s for favicon's etc. To avoid blowing up
llama we just take a mutex when it's invoked.


* let's try this with the xxd tool instead and see if msvc is happier with that

* enable server in Makefiles

* add /completion.js file to make it easy to use the server from js

* slightly nicer css

* rework state management into session, expose historyTemplate to settings

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-07-04 16:05:27 +02:00
Henri Vasserman
acc111caf9 Allow old Make to build server. (#2098)
Also make server build by default.

Tested with Make 3.82
2023-07-04 15:38:04 +03:00
ZhouYuChen
23c7c6fc91 Update Makefile: clean simple (#2097) 2023-07-04 14:15:16 +02:00
Erik Scholz
698efad5fb CI: make the brew update temporarily optional. (#2092)
until they decide to fix the brew installation in the macos runners.
see the open issues. eg https://github.com/actions/runner-images/pull/7710
2023-07-04 01:50:12 +02:00
Govlzkoy
14a2cc71f6 [ggml] fix index for ne03 value in ggml_cl_mul_f32 (#2088) 2023-07-04 07:50:00 +08:00
Henri Vasserman
1cf14ccef1 fix server crashes (#2076) 2023-07-04 00:05:23 +03:00
Howard Su
cc45a7feb8 Fix crash of test-tokenizer-0 under Debug build (#2064)
* Fix crash of test-tokenizer-0 under Debug build

* Change per comment
2023-07-03 20:43:55 +02:00
Howard Su
55dbb915cc [llama] No need to check file version when loading vocab score (#2079) 2023-07-03 19:58:58 +08:00
WangHaoranRobin
d7d2e6a0f0 server: add option to output probabilities for completion (#1962)
* server: add option to output probabilities for completion
* server: fix issue when handling probability output for incomplete tokens for multibyte character generation
* server: fix llama_sample_top_k order
* examples/common.h: put all bool variables in gpt_params together
2023-07-03 00:38:44 +03:00
Georgi Gerganov
46088f7231 ggml : fix build with OpenBLAS (close #2066) 2023-07-02 09:46:46 +03:00
Johannes Gäßler
0bc2cdfc87 Better CUDA synchronization logic (#2057) 2023-07-01 21:49:44 +02:00
Johannes Gäßler
befb3a3562 Test-based VRAM scratch size + context adjustment (#2056) 2023-07-01 21:47:26 +02:00
Daniel Drake
b213227067 cmake : don't force -mcpu=native on aarch64 (#2063)
It's currently not possible to cross-compile llama.cpp for aarch64
because CMakeLists.txt forces -mcpu=native for that target.

-mcpu=native doesn't make sense if your build host is not the
target architecture, and clang rejects it for that reason, aborting the
build. This can be easily reproduced using the current Android NDK to build
for aarch64 on an x86_64 host.

If there is not a specific CPU-tuning target for aarch64 then -mcpu
should be omitted completely. I think that makes sense, there is not
enough variance in the aarch64 instruction set to warrant a fixed -mcpu
optimization at this point. And if someone is building natively and wishes
to enable any possible optimizations for the host device, then there is
already the LLAMA_NATIVE option available.

Fixes #495.
2023-07-01 21:31:44 +03:00
Aaron Miller
2f8cd979ec metal : release buffers when freeing metal context (#2062) 2023-07-01 21:14:59 +03:00
Judd
471aab6e4c convert : add support of baichuan-7b (#2055)
Co-authored-by: Judd <foldl@boxvest.com>
2023-07-01 20:00:25 +03:00
Georgi Gerganov
463f2f4c4f llama : fix return value of llama_load_session_file_internal (#2022) 2023-07-01 19:05:09 +03:00
Rand Xie
cb44dbc7de llama : catch llama_load_session_file_internal exceptions (#2022)
* convert checks in llama_load_session_file to throw and handle them

* make llama_load_session_file_internal static

* address feedbacks to avoid using exceptions
2023-07-01 19:02:58 +03:00
Georgi Gerganov
79f634a19d embd-input : fix returning ptr to temporary 2023-07-01 18:46:00 +03:00
Georgi Gerganov
04606a1599 train : fix compile warning 2023-07-01 18:45:44 +03:00
Qingyou Meng
b1ca8f36a9 ggml : disable GGML_TASK_INIT and GGML_TASK_FINALIZE by default (#1995)
Will not be scheduled unless explicitly enabled.
2023-07-01 18:42:43 +03:00
Howard Su
b8c8dda75f Use unsigned for random seed (#2006)
* Use unsigned for random seed. Keep -1 as the value to use a time based seed.

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-29 06:15:15 -07:00
LostRuins
96a712ca1b Porting the improved K-Quant CUDA kernels to OpenCL (#1966)
* Added broken new q4k quant

* xx + ib0

* Fix q2_k fast kernel

* Use preprocessor for QK_K

* Add q6_k fast matmul kernel

* ported q3k speedup successfully

* ported q2k and q5k speedups

* remove old dot kernels and template

* fixed global const struct types

* fixing address spaces

* fixed string too long CI issue

---------

Co-authored-by: 0cc4m <picard12@live.de>
2023-06-29 05:56:43 +02:00
m3ndax
d3494bb86b llama : replacing auto &kv with const auto &kv (#2041)
* Replacing auto &kv with const auto &kv

* Create codacy.yml

* Delete codacy.yml
2023-06-28 21:39:08 +03:00
Salvador E. Tropea
5b351e94d0 cuda : remove nchannels_x argument from mul_mat_vec_nc_f16_f32 (#2028)
- Not used
2023-06-28 20:27:31 +03:00
Salvador E. Tropea
6432aabb6d cuda : fix missing const qualifier in casts (#2027) 2023-06-28 20:26:26 +03:00
Howard Su
b922bc351b llama : remove shards weight file support (#2000)
* Remove multiple shards

* Remove multiple file loaders

* Remove llama_load_tensor_shard class

* Simplify load logic

* Remove dead code guess_n_parts function

* Remove vocab_only from constructor of llama_model_loader

* Remove alignment_prevents_mmap which is not more needed.

* Remove useless check
2023-06-28 20:13:02 +03:00
Johannes Gäßler
7f9753fa12 CUDA GPU acceleration for LoRAs + f16 models (#1970) 2023-06-28 18:35:54 +02:00
ningshanwutuobang
cfa0750bc9 llama : support input embeddings directly (#1910)
* add interface for float input

* fixed inpL shape and type

* add examples of input floats

* add test example for embd input

* fixed sampling

* add free for context

* fixed add end condition for generating

* add examples for llava.py

* add READMD for llava.py

* add READMD for llava.py

* add example of PandaGPT

* refactor the interface and fixed the styles

* add cmake build for embd-input

* add cmake build for embd-input

* Add MiniGPT-4 example

* change the order of the args of llama_eval_internal

* fix ci error
2023-06-28 18:53:37 +03:00
Erik Scholz
9d23589d63 fix pthreads setaffinity usage on android (#2020) 2023-06-27 19:06:33 +02:00
Howard Su
0be54f75a6 baby-llama : fix build after ggml_rope change (#2016) 2023-06-27 08:07:13 +03:00
Georgi Gerganov
181e8d9755 llama : fix rope usage after ChatGLM change 2023-06-27 00:37:33 +03:00
Georgi Gerganov
d9779021bd ggml : add support for ChatGLM RoPE 2023-06-27 00:06:51 +03:00
Roman Parykin
d38e451578 readme : add Scala 3 bindings repo (#2010) 2023-06-26 22:47:59 +03:00
David Yang
eaa6ca5a61 ggml : increase max tensor name + clean up compiler warnings in train-text (#1988)
* Clean up compiler warnings in train-text

Some brackets to disambiguate order of operations

* Increase GGML_MAX_NAME

Avoiding strncpy danger in train-text-from-scratch and reducing potential future name length issues
2023-06-26 22:45:32 +03:00
Gustavo Rocha Dias
aa777abbb7 readme : LD_LIBRARY_PATH complement for some Android devices when building with CLBlast inside Termux (#2007)
* docs - Alternative way to build at Android, with CLBlast.

* doc - LD_LIBRARY_PATH complement for some Android devices when building with CLBlast inside Termux.

* doc- fix typo
2023-06-26 22:34:45 +03:00
Georgi Gerganov
c824d2e368 ggml : avoid conv 2d kernel round up 2023-06-26 21:03:59 +03:00
zrm
b853d45601 ggml : add NUMA support (#1556)
* detect NUMA systems and pin work threads to nodes (linux)

* disable mmap prefetch/readahead for NUMA systems

* avoid sending finalize op to thread pool if it does nothing

* silence robot

* fix args

* make --numa a param

* recommendation that n_nodes evenly divide n_threads did not warrant such aggressive enforcement

* lower synchronization overhead

* statically allocate

* move numa state to g_state

* add description for --numa

* ggml : minor style changes

* ggml : minor style + try fix sanitizer build

* llama : allow to initialize backend with NUMA support

* llama : avoid ggml include in llama-util.h

* ggml : style / formatting

* ggml : fix handling of ops with n_threads > n_tasks > 1

* server : utilize numa parameter

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-26 20:57:59 +03:00
Georgi Gerganov
9225baef71 k-quants : fix indentation 2023-06-26 20:10:52 +03:00
katsu560
a84ab1da8d tests : fix quantize perf (#1990)
* fix test quantize perf

* avoid the global state
2023-06-26 19:47:02 +03:00
katsu560
5743ca8092 k-quants : add AVX support to dot functions (#1916)
* k_quants : add AVX support

* k_quants : apply review comments
2023-06-26 19:46:07 +03:00
Georgi Gerganov
412c60e473 readme : add link to new k-quants for visibility 2023-06-26 19:45:09 +03:00
Kawrakow
6769e944c7 k-quants : support for super-block size of 64 (#2001)
* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2023-06-26 19:43:07 +03:00
Howard Su
cbebf61ca7 Fix assert when free invalid cuda pointer (#2005)
Fix assert via initializing extra structure always.
CUDA error 1 at C:\GPT\llama.cpp\ggml-cuda.cu:2536: invalid argument
2023-06-26 23:15:47 +08:00
Georgi Gerganov
447ccbe8c3 readme : add new roadmap + manifesto 2023-06-25 16:08:12 +03:00
Georgi Gerganov
bd34cdde38 ggml : sync latest ggml (custom operators) 2023-06-25 14:25:08 +03:00
anon998
c2a08f87b8 fix server sampling: top k sampler first (#1977)
Co-authored-by: anon <anon@example.org>
2023-06-25 10:48:36 +02:00
Georgi Gerganov
66a2555ba6 readme : add Azure CI discussion link 2023-06-25 09:07:03 +03:00
sjinzh
e65ca7e14a zig : upgrade build system support (#1981)
* upgrade zig build system support

* zig : add new line at the end of the file

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-25 08:45:44 +03:00
Robyn
5ec8dd5a3c #1869 Fix null reference errors when training from scratch with CUDA (#1907)
* #1869 Fix null reference errors when training from scratch with CUDA build

Calling ggml_compute_forward when node->src0 was null was causing train-text-from-scratch.exe to terminate unexpectedly.

* ggml : do not dereference src0 if NULL

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-24 20:10:29 +02:00
Georgi Gerganov
65bdd52a86 tests : sync test-grad0 from ggml 2023-06-24 19:40:18 +03:00
Rowan Hart
fdd1860911 flake : fix ggml-metal.metal path and run nixfmt (#1974) 2023-06-24 14:07:08 +03:00
AN Long
c943d823c1 convert : fix invalid params in write_vocab_only (#1975) 2023-06-24 14:02:06 +03:00
slaren
f2c754e1c3 ggml : improve ggml_graph_dump_dot, add ggml_format_name (#1978)
* Improve ggml_graph_dump_dot, add ggml_format_name

* add more automatic names to view ops

* fix name of copies
2023-06-24 13:57:18 +03:00
Georgi Gerganov
11da1a85cd readme : fix whitespaces 2023-06-24 13:38:18 +03:00
Alberto
235b610d65 readme : fixed termux instructions (#1973) 2023-06-24 13:32:13 +03:00
Alex Renda
b061ba9e2a llama : fix top-p sampling to match the canonical definition (#1953)
* Fix top-p sampling to match the standard definition (smallest set that has probability mass at least p, not largest set with probability mass less than p)

* top-p: correct gt to gte

* add test for correct top-p behavior
2023-06-24 13:15:01 +03:00
Didzis Gosko
527b6fba1d llama : make model stateless and context stateful (llama_state) (#1797)
* llama : make model stateless and context stateful

* llama : minor cleanup

* llama : update internal API declaration

* Apply suggestions from code review

fix style

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Missing model memory release

* Fix style

* Add deprecated warning for public API function llama_init_from_file

* Update public API use cases: move away from deprecated llama_init_from_file

* Deprecate public API function llama_apply_lora_from_file

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-24 11:47:58 +03:00
eiery
d7b7484f74 Add OpenLLaMA instructions to the README (#1954)
* add openllama to readme
2023-06-23 10:38:01 +02:00
Erik Scholz
7487137227 rework convert.py to read hyper-parameters from config.json (#1958)
* Read hyper-parameters from HuggingFace-transformer config.json, if they exist, and fall back to guessing, like before otherwise.
  This allows converting open_llama 3B and other non-standard model designs.
2023-06-22 14:20:47 +02:00
Johannes Gäßler
bbca06e269 cmake: revert CUDA arch default to 52, 61 if f16 (#1959) 2023-06-21 23:49:25 +02:00
Rahul Vivek Nair
fb98254f99 Fix typo in README.md (#1961) 2023-06-21 23:48:43 +02:00
Georgi Gerganov
049aa16b8c readme : add link to p1 2023-06-20 19:05:54 +03:00
Xiake Sun
2322ec223a Fix typo (#1949) 2023-06-20 15:42:40 +03:00
Ettore Di Giacinto
aacdbd4056 llama : fix params struct slignment (#1936)
* Workaround struct misalignment during value-copy

Signed-off-by: mudler <mudler@localai.io>

* Move booleans at the bottom of the structure

Signed-off-by: mudler <mudler@localai.io>

* Add comment

Signed-off-by: mudler <mudler@localai.io>

---------

Signed-off-by: mudler <mudler@localai.io>
2023-06-20 04:24:39 +03:00
Henri Vasserman
20568fe60f [Fix] Reenable server embedding endpoint (#1937)
* Add back embedding feature

* Update README
2023-06-20 01:12:39 +03:00
Georgi Gerganov
18b35625c3 ggml : fix bug in LBFGS optimizer (found by ggml tests) 2023-06-19 20:43:30 +03:00
l3utterfly
ba4e85a833 llama : use aligned memory during ggml_init call from loading saved sessions (#1934)
* fixed issue: memory is not guaranteed to be aligned properly during ggml_init call from loading saved sessions

* - removed commented out old code from fix
- updated another instance of same issue below original
2023-06-19 18:20:06 +03:00
Georgi Gerganov
23fc5c219a cmake : fix trailing whitespaces 2023-06-19 18:18:34 +03:00
Kawrakow
cb40dfca69 llama : only use Q6_K for output weights if tensor size is multiple of 256 (#1932)
* Only use Q6_K for output weights if tensor size is multiple of 256

* Fixed copy/paste mistake

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2023-06-19 18:17:03 +03:00
Kawrakow
ca7c3f4da5 cuda : faster k-quants on older GPUs (#1930)
* k_quants: hopefully much faster Q4_K on older GPUs

On the GTX-1660 that I have available to represent
"old GPUs", token prediction drops from 65.5 ms/tok
to 41.5 ms/tok!

* k_quants: hopefully much faster Q3_K on older GPUs

On the GTX-1660 that I have available to represent
"old GPUs", token prediction drops from 60.3 ms/tok
to 41.0 ms/tok!

* k_quants: faster Q2_K on older GPUs

It looks like I didn't need to change anything
compared to what we already had, so this is just
adding clarifying comments. But I now measure
36.3 ms/tok on the GTX-1660, instead fo the
47.2 ms/tok that I have written in the faster
k-quants PR.

* k_quants: faster Q5_K on older GPUs

68.5 ms/tok -> 62.0 ms/tok on GTX-1660.
For some reason the same access pattern that leads
to such resounding success for Q2_K to Q4_K did not
work at all for Q5_K.

It is also more difficult to measure because for Q5_K_S
we only have 32 layers on the GTX-1660, so output, tok embeddings
and kv cache are done on the CPU.

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2023-06-19 18:14:09 +03:00
Georgi Gerganov
b97ca431db ggml : sync latest ggml repo (#1924)
* ggml : sync latest ggml repo

* ggml : remove unused comments

* ggml : asserts
2023-06-19 18:12:33 +03:00
Howard Su
1e3abfcef0 cmake : fix build shared ggml when CUDA is enabled (#1929)
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-19 18:10:37 +03:00
Johannes Gäßler
16b9cd1939 Convert vector to f16 for dequantize mul mat vec (#1913)
* Convert vector to f16 for dmmv

* compile option

* Added compilation option description to README

* Changed cmake CUDA_ARCHITECTURES from "OFF" to "native"
2023-06-19 10:23:56 +02:00
Johannes Gäßler
b24c3049d9 Added tokens per second to info prints (#1928) 2023-06-18 17:41:26 +02:00
Johannes Gäßler
0ede372a51 Fixed incorrectly applying RMS norm twice (#1925) 2023-06-18 16:07:09 +02:00
l3utterfly
8596af4277 ggml : fix bug in ggml_compute_forward_add_q_f32 (#1918) 2023-06-18 14:19:16 +03:00
Mike
e1886cf4fe readme : update Android build instructions (#1922)
Add steps for using termux on android devices to prevent common errors.
2023-06-18 11:28:26 +03:00
Kawrakow
8ab8ba62eb llama : prevent usage of k-quants when tensor size is not a multiple of 256 (#1921)
* Fix examples/metal

* k-quants: prevent usage when tensor size is not divisible by 256

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2023-06-18 11:13:43 +03:00
Kawrakow
90cc59d6ab examples : fix examples/metal (#1920)
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2023-06-18 10:52:10 +03:00
Georgi Gerganov
ce2c7d72e2 metal : handle buffers larger than device's maxBufferLength (#1826)
* metal : handle buffers larger than device's maxBufferLength

* metal : print more verbose device info + handle errors

* metal : fix prints for overlapping views

* metal : minimize view overlap to try to utilize device memory better
2023-06-18 09:09:47 +03:00
Howard Su
57cd69460f cmake : add CUDA_ARCHITECTURES to new target ggml_static (#1917) 2023-06-18 07:29:47 +03:00
Georgi Gerganov
b2416493ab make : do not print help for simple example 2023-06-17 20:55:03 +03:00
Georgi Gerganov
4f9c43e3bd minor : warning fixes 2023-06-17 20:24:11 +03:00
Johannes Gäßler
2c9380dd2f Only one CUDA stream per device for async compute (#1898) 2023-06-17 19:15:02 +02:00
Georgi Gerganov
051e1b0e6a llama : fix kv_cache n init (close #1903) 2023-06-17 19:31:20 +03:00
DaniAndTheWeb
86c7571864 make : update for latest Arch (#1701)
With the upcoming change to the openblas package in arch the Makefile workaround is no longer needed.
2023-06-17 19:17:22 +03:00
Howard Su
3d59ec5935 ggml : fix warnings under MSVC (#1908) 2023-06-17 18:46:15 +03:00
Aaron Miller
0711a5f6dc metal : add norm, cpy f16->f16, alibi kernels (#1823) 2023-06-17 17:37:49 +03:00
Faez Shakil
fc45a81bc6 exposed modules so that they can be invoked by nix run github:ggerganov/llama.cpp#server etc (#1863) 2023-06-17 14:13:05 +02:00
Randall Fitzgerald
794db3e7b9 Server Example Refactor and Improvements (#1570)
A major rewrite for the server example.

Note that if you have built something on the previous server API, it will probably be incompatible.
Check out the examples for how a typical chat app could work.

This took a lot of effort, there are 24 PR's closed in the submitter's repo alone, over 160 commits and a lot of comments and testing.

Summary of the changes:

- adds missing generation parameters: tfs_z, typical_p, repeat_last_n, repeat_penalty, presence_penalty, frequency_penalty, mirostat, penalize_nl, seed, ignore_eos
- applies missing top k sampler
- removes interactive mode/terminal-like behavior, removes exclude parameter
- moves threads and batch size to server command-line parameters
- adds LoRA loading and matches command line parameters with main example
- fixes stopping on EOS token and with the specified token amount with n_predict 
- adds server timeouts, host, and port settings
- adds expanded generation complete response; adds generation settings, stop reason, prompt truncated, model used, and final text
- sets defaults for unspecified parameters between requests
- removes /next-token endpoint and as_loop parameter, adds stream parameter and server-sent events for streaming
- adds CORS headers to responses
- adds request logging, exception printing and optional verbose logging
- adds better stopping words handling when matching multiple tokens and while streaming, or when it finishes on a partial stop string
- adds printing an error when it can't bind to the host/port specified
- fixes multi-byte character handling and replaces invalid UTF-8 characters on responses
- prints timing and build info on startup
- adds logit bias to request parameters
- removes embedding mode
- updates documentation; adds streaming Node.js and Bash examples
- fixes code formatting
- sets server threads to 1 since the current global state doesn't work well with simultaneous requests
- adds truncation of the input prompt and better context reset
- removes token limit from the input prompt
- significantly simplified the logic and removed a lot of variables

---------

Co-authored-by: anon998 <131767832+anon998@users.noreply.github.com>
Co-authored-by: Henri Vasserman <henv@hot.ee>
Co-authored-by: Felix Hellmann <privat@cirk2.de>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Lesaun Harvey <Lesaun@gmail.com>
2023-06-17 14:53:04 +03:00
Jiří Podivín
5ddf7ea1fb hooks : setting up flake8 and pre-commit hooks (#1681)
Small, non-functional changes were made to non-compliant files.
These include breaking up long lines, whitespace sanitation and
unused import removal.

Maximum line length in python files was set to a generous 125 chars,
in order to minimize number of changes needed in scripts and general
annoyance. The "txt" prompts directory is excluded from the checks
as it may contain oddly formatted files and strings for a good reason.

Signed-off-by: Jiri Podivin <jpodivin@gmail.com>
2023-06-17 13:32:48 +03:00
Gustavo Rocha Dias
bac19927c3 readme : alternative way to build for Android with CLBlast. (#1828) 2023-06-17 12:01:06 +03:00
Kerfuffle
b4c6f46f17 Allow cmake to build ggml as a library (#1896)
* Allow cmake to build ggml as a library

* A ggml_static library will be created

* When BUILD_SHARED_LIBS is enabled, ggml_shared will also be built
2023-06-17 01:49:42 -06:00
David Yang
92f20d9942 train : get raw text instead of page with html (#1905)
We probably want to train using just the text of Shakespeare instead of the html of the page displaying his work.
2023-06-17 09:51:54 +03:00
0cc4m
d411968e99 opencl : support k-quants (#1836)
* Porting q2_k kernel to OpenCL

* Set global and local sizes for kernel calls for dequantizing k-quants

* Added q6_k kernel

* Fix q4_k opencl struct order

* Replace uchar with uint8_t

* Finish dequant kernels

* Added OpenCL DMMV kernels

* Fix q2_k, improve code

* Fix q3_k

* Shorten switch statements

* Improve code formatting

---------

Co-authored-by: Concedo <39025047+LostRuins@users.noreply.github.com>
2023-06-16 21:59:49 +03:00
SuperUserNameMan
b41b4cad6f examples : add "simple" (#1840)
* Create `simple.cpp`

* minimalist example `CMakeLists.txt`

* Update Makefile for minimalist example

* remove 273: Trailing whitespace

* removed trailing white spaces simple.cpp

* typo and comments simple.cpp

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-16 21:58:09 +03:00
Zenix
13fe9d2d84 cmake : add auto detection of BLAS_INCLUDE_DIRS (#1886) 2023-06-16 21:53:04 +03:00
Johannes Gäßler
ac3b886953 llama : fix embd when offloading non-repeating layers (#1891) 2023-06-16 21:25:51 +03:00
FrankHB
5b9ccaf104 Fixed possible macro redefinition (#1892)
MinGW libstdc++ may define `NOMINMAX` unconditionally. This fixes the case when it is already defined.
2023-06-16 21:25:01 +03:00
Borislav Stanimirov
9cbf50c041 build : fix and ignore MSVC warnings (#1889) 2023-06-16 21:23:53 +03:00
Kawrakow
3d01122610 CUDA : faster k-quant dot kernels (#1862)
* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2023-06-16 20:08:44 +03:00
Borislav Stanimirov
602c748863 gitignore : add several entries specific to Visual Studio (#1888) 2023-06-16 09:58:11 +03:00
Johannes Gäßler
a09f9195be Fixed CUDA runtime version check (#1879) 2023-06-15 21:49:08 +02:00
Georgi Gerganov
bed9275617 cmake : remove whitespaces 2023-06-15 21:56:50 +03:00
yangli2
c36e81da62 examples : add chat-vicuna.sh (#1854)
Co-authored-by: Yang Li <yangliyl@google.com>
2023-06-15 21:05:53 +03:00
Igor Okulist
3559433fec cmake : set include path for OpenBlas (#1830) 2023-06-15 20:51:26 +03:00
Frederik Vogel
69b34a0e80 swift : Package compile breaks due to ggml-metal.metal (#1831)
* Ignore metal file in spm

* Add ggml.h to spm public Headers

---------

Co-authored-by: Vogel Frederik <vogel.frederik@linecorp.com>
2023-06-15 20:47:04 +03:00
daboe01
cf267d1c71 make : add train-text-from-scratch (#1850)
* make finetuning example accessible

* fixed: targed was in wrong line

* fixed: name of executable was wrong

* fixed: naming of binary

* fixed: model path was wrong

* fixed clean target

* Update examples/train-text-from-scratch/README.md

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-15 20:42:48 +03:00
Srinivas Billa
9dda13e5e1 readme : server compile flag (#1874)
Explicitly include the server make instructions for C++ noobsl like me ;)
2023-06-15 20:36:38 +03:00
sandyiscool
37e257c48e make : clean *.so files (#1857) 2023-06-15 20:36:06 +03:00
Howard Su
64cc19b4fe Fix the validation of main device (#1872) 2023-06-15 19:29:59 +02:00
Georgi Gerganov
4bfcc855ab metal : parallel command buffer encoding (#1860)
* metal : parallel command buffer encoding

* metal : determine number of command buffers based on gf->n_threads
2023-06-15 20:29:48 +03:00
Johannes Gäßler
6b8312e797 Better error when using both LoRA + GPU layers (#1861) 2023-06-15 19:06:46 +02:00
Johannes Gäßler
254a7a7a5f CUDA full GPU acceleration, KV cache in VRAM (#1827)
* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
2023-06-14 19:47:19 +02:00
0xspringtime
9254920265 baby-llama : fix operator!= (#1821)
* Update baby-llama.cpp

Seems to be an error in the implementation of the operator!= function. It attempts to compare the this pointer (a llama_hparams_lora object) with the other pointer (a llama_hparams object) using memcmp. This can lead to incorrect results because the sizes of the objects being compared (sizeof(llama_hparams) and sizeof(llama_hparams_lora)) are different, should now be able to compare two llama_hparams_lora objects for inequality.

* Update baby-llama.cpp

* Update baby-llama.cpp
2023-06-13 22:37:54 +03:00
xaedes
e32089b2c2 train : improved training-from-scratch example (#1652)
* add python wrapper

https://gist.github.com/abetlen/2b90e5f153f6efd00931d098de5c73ce

* fix decoding error. adds errors=ignore parameter

* add python bindings for functions to get and set the whole llama state
(rng, logits, embedding and kv_cache)

* update python bindings

* add text generating baby-llama from scratch example

* fix race condition bug in ggml_compute_forward_diag_mask_f32

* implement ggml_soft_max_back for more performant backward pass of soft_max

avoids creating big intermediate matrices of size n_embd x n_embd for llama layers and n_vocab x n_vocab for cross entropy loss

* improve softmax backward pass

go from quadratic runtime to linear runtime by simplifying the formulas

* fix race condition bug in non-inplace ggml_compute_forward_diag_mask_f32

memcpy needs to be synchronized across threads to avoid race conditions.
=> do it in INIT phase

* fix bug in ggml_compute_forward_soft_max_back_f32 on DEBUG build

* improve performance of mul_mat backward pass

avoid transpose by using mul_mat with swapped arguments

* avoid printing too much newlines in baby-llama-text

* activate threading in baby-llama-text

* add ggml_out_prod and use it for mul_mat backward pass for improved performance

performance stats report improvement from 37 seconds to 16 seconds runtime during my training tests

* better weight initialization improves training convergence at start

* better weight initialization improves training convergence at start

* improve ggml_out_prod performance

- change iteration order (>15s -> 10s runtime)
- parallelize over one more dimension: over dst matrix rows (10s -> <5s runtime)

* add llama sampler, shuffle samples and constrain sampling to tokens occurring in train data

* fix get_samples call, add model tensor names, increase model size, start training samples after newline

* save train trained model to checkpoint and load model to be trained from checkpoint

* use inplace functions where possible

* initialize rng with srand

* use different arguments for input and output checkpoint

* ggml fixes to support backward pass on inplace operations

* remove duplicate include

* fix cross entropy loss

- add target probabilities for each sample which is then used in cross entropy loss

* print used memory before and after optimization

* sample with non-greedy sampling parameters at the end of training

* add cmake target for baby-llama-text

* add ggml_add1_inplace to header

* enable gradient propagation for inplace add1 and scale operations

those functions backward passes don't need the original src0, so they also work when forward is inplace

* implement AdamW in ggml_opt_adam by adding weight decay parameter (default 0.001f)

also add a schedule parameter (default 1.0f) that can be used to scale alpha and decay according to learning schedule.
setting the decay parameter to zero disables AdamW resulting in normal Adam optimizer.

since the difference between Adam and AdamW is minimal it is not implemented as another optimizer, but integrated into the existing Adam optimizer.

* use inplace operations in cross_entropy_loss

* fix random weight initialization scale

* add missing default parameters for adam optimizer

* add ggml_opt_context, so that we can properly resume training

otherwise the optimizer states, tracking statistics about the error function and its derivates,
will reset to zero each time ggml_opt is called, hindering convergence on resumed training.

now the optimizer context and all its memory is stored in a separate struct.

* fix bug in llama_sample_token_mirostat_v2

when all candidates are filtered out through mu threshold, the following soft_max operation will fail.
so keep at least one.

* add forward function without using cache, for more performant training

during training on whole samples no cache is required.
removing the cache and simplifying the remaining code results in performance and memory usage improvement.

* print suppressed newline tokens as string "\n"

printing too much actual newlines is suppressed to avoid flooding the console.

* store optimizer state in training checkpoint and add learning schedule

persistent optimizer state allows to resume training without resetting the optimizer
learning schedule consists of linear warmup ramp followed by cosine decay with restarts

* remove unused functions

* fix bug in get_samples which corrupted training targets

* save checkpoint only when it was trained

* simplify code

* remove trailing whitespace

* simplify backward pass for SQRT

* replace inefficient repeat backward pass with dedicated repeat_back operation

* add ggml_cross_entropy_loss with backward pass for faster training

cross entropy loss can also be implemented using softmax and log, but as dedicated operation it is faster and especially avoids unnecessary memory overhead.

* add tests for cross_entropy_loss backward pass

finite differences regularly results in estimated gradient of zero, despite the backward pass giving non zero gradient.
_probably_ the finite differences fails due to numerical issues

* use ggml_cross_entropy_loss in text training example

* remove trailing whitespace

* slightly improve how cross entropy loss is compute

btw: directly implemented cross entropy loss seems to have way lower magnitudes than when implemented with softmax and log.
probably the input to log gets closer to zero due to float numerics.
maybe the multiplication by (1.0-eps)/sum is more accurate..

* add llama_get_vocab to get the vocabulary as output parameters

* set default model.type for unknown models with few layers

* add export of training checkpoint to llama compatible model file

* get vocabulary for exporting training checkpoint to llama compatible model file

* implement backward pass of flash attention

* bugfixes for backward pass of flash attention

* test flash attention backward pass

need to set loose error bounds to pass.
the finitie differences are close to numeric limits and often return quite different values than the backward pass.
reducing eps further lets the gradients vanish completely.
likewise setting eps to big results in wronger values.
the softmax in the middle of the function is probably the most responsible for the numeric issues using finite differences.

* add option to train with flash attention and move options to the top of the main function

training from scratch also works with flash attention
training convergence and generation results after fix number of iterations are worse than when not using flash attention.
maybe there still lingers a bug in the flash attention backward pass?
but training works, just with slower convergence.

flash attention is still worth to use, because it requires way less memory and is faster with high n_ctx

* add train_params and command line option parser

* remove unnecessary comments

* add train params to specify memory size

* remove python bindings

* rename baby-llama-text to train-text-from-scratch

* replace auto parameters in lambda function

* add #include <climits>

* add explicit cast to fix compile error

"error: non-constant-expression cannot be narrowed from type 'int64_t' (aka 'long long') to 'uint32_t' (aka 'unsigned int') in initializer list [-Wc++11-narrowing]"

* remove trailing whitespace

* add ggml_opt_resume_g which accepts forward and backward cgraphs

* fix formulas in comments

* bug fix for ggml_compute_forward_get_rows_back_f32

the result should be set to zero, not to whatever data is in opt0

* improve training memory usage with scratch buffers

instead of relying on the automatic backward pass, we manually create the graph for the backward pass.
it turns out that all backward pass operations need only temporary memory which can be reused after each layer.

will compute backward pass for ALL model parameters

* add option to use scratch buffers in training or not

make it configurable because currently training with scratch buffers implies flash attention and optimization over all parameters.

* ci : disable temporary

* store view offset and permute axes in opt[0] instead of storing it in padding

use memcpy to store offset, because offset is of type size_t.
when storing it as int32_t offset would have to be smaller than 2^31 which is not necessarily true.

* minor : fix compile warnings + minor style changes

* fix bug in threaded indices calculation of ggml_compute_forward_flash_attn_back_f32

* store view offset like in master branch

* bug fix in forward_batch_wo_cache_flash_attn_train

* scratch buffer bug fixes in forward_batch_wo_cache_flash_attn_train

data of permute and reshape is the same as their input.
if we want to preserve the output of permute/reshape, we also need to preserve their inputs.

replace reshape(src0, src1) with reshape_nd calls so that we don't need src1.

replace (temporary) t03 with ggml_repeat(ctx0, layer.attention_norm, t02).
in the future we could also use the new broadcasting ggml_mul to avoid these repeat calls.
for this we need backward pass of broadcasting ggml_mul.

* remove unnecessary scratch buffer 0

buf 0 is persistent memory, so we can just disable scratch for this by using buf -1

* avoid creating unnecessary grad tensors

previously we need to create grads for model parameters, so that expand(..) correctly populates cgraph->leafs & cgraph->grads
this wasted memory, because unnecessary grad for each op were automatically created:
the automatically generated grad was unnecessary because we later manually set the grad (e.g. t35->grad = expand(gb, ...) ).
this discarded the automatically generated grad resulting in wasted memory.

improved this by changing expand(..) to not use ggml_build_forward_expand.
expand set cgraph->nodes but not the leafs.
cgraph->leafs & cgraph->grads are set in another pass after the last expand call.

* print used training seed

* zero initialize gfbuf and gbbuf

* ci : re-enable workflows + add README for training

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-13 22:04:40 +03:00
Georgi Gerganov
2347e45e7b llama : do a warm-up eval at start for better timings (#1824) 2023-06-13 20:20:07 +03:00
Kerfuffle
74d4cfa343 Allow "quantizing" to f16 and f32 (#1787)
* Allow "quantizing" to f16 and f32

Fix an issue where quantizing didn't respect LLAMA_NO_K_QUANTS

Add brief help to the list of quantization types in the quantize tool

Ignore case for quantization type arguments in the quantize tool
2023-06-13 04:23:23 -06:00
Kawrakow
74a6d922f1 Metal implementation for all k_quants (#1807)
* metal : improve q4_K

28.3 -> 26.0 ms/token by avoiding a branch in the
calculation of the scales.

* metal : small improvement for Q4_K

* metal : still optimizing Q4_K

This commit pushes it down to 25.3 ms / token.

The crazy idea of using 6 bits for the scales is really costly on
Metal: if I remove the bit fiddling necessary to make the block
scales, time goes almost to the Q4_0 23 ms/token.

Before pushing the k-quants upstream I had a Q4_K variant that
had used 8-bit scales. It wasn't more accurate, used 0.125 bits more per weight,
was running slightly slower on the CPU (due to the larger model size
and being memory bound there), and the difference was entirely
negligible under CUDA. So, I decided to publish the version with 6-bit
scales. Perhaps I should re-consider and change to 8-bit scales?

* metal : some more optimizations

Q2_K: 25.4 ms/token
Q6_K: 27.3 ms/token
Q4_0: 22.8 ms/token
Q4_1: 23.1 ms/token

* metal : Q3_K support

Something is not quite right yet.

* metal : Q5_K support

Initial version achieves 31.2 ms/token, 210 GB/s

* metal : still not able to figure out why q3_K does not work

* Minor

* metal : yet another failed attempt to make q3_K work

* metal : optimize Q5_K

31.2 ms -> 27.8 ms.
250 GB/s.

* metal : q3_K still not working

Adding a heavily commented q3_K metal kernel to explain
my obviously faulty logic. Perhaps someone could spot the issue?

* metal : q3_K finally working

Not optimized at all.

What was the issue? The scales are not 4-bytes aligned,
and I was accessing them with a uint32_t pointer.
When I tried that on CUDA, I got an error (illegal memory access)
and added a memcpy to a local array of 3 uint32_t's.
But on Metal it told me there is no memcpy, so I tried
accessing directly. There is no error, just garbage results.
At some point I did try accessing the scales with an uint16_t
pointer (the scales are for sure 2-byte aligned), but was
still getting garbage. I guess, there must have been another bug.

No access to scales is via a uint16_t pointer and, after starting
from scratch from the C dequantize function, it finally works.

* metal : Q3_K 1st optimization pass

* metal : Q3_K second optimization pass - 29.6 ms/token

* metal : Q3_K cleanup

* metal : fixed accidentally broken Q2_K

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2023-06-12 22:39:21 +03:00
slaren
e4caa8da59 ci : run when changing only the CUDA sources (#1800) 2023-06-12 20:12:47 +03:00
Howard Su
58970a4c39 Leverage mmap for offloading tensors to GPU (#1597)
* Rebase to latest

* Show progress

* Add assert to make sure we only allocate temp buffer for non-CPU backend tensor

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
2023-06-12 14:44:16 +02:00
Kawrakow
8c0a10e64d metal : fix failure to load model (#1817)
The number of buffers in the ggml context was left unitialized.
This leads to sporadic failures to load the model on
startup. It is actually strange that the failure occurred so
infrequantly.

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2023-06-12 14:31:36 +03:00
Kerfuffle
fa84c4b3e8 Fix issue where interactive mode crashes when input exceeds ctx size (#1789)
* Fix issue where interactive mode in the main example crashes when input exceeds ctx size

* Ensure the context size is at least 8 tokens in the main example.

Closes #1768
2023-06-11 08:19:17 -06:00
Kyle Liang
12b063f0ec Fixed WSL cuda's OOM error (#1594)
* In the function , add the cuda error bypass.

* remove excessive codes and prints

---------

Co-authored-by: liang <liangmanlai@126.com>
2023-06-11 15:20:52 +02:00
Ryan Landay
31d2b5f4a4 Update SHA256SUMS with current hashes for models quantized using q4_0 (#1798) 2023-06-11 12:38:53 +03:00
Georgi Gerganov
4de0334f5c cmake : fix Metal build (close #1791) 2023-06-10 22:56:53 +03:00
Artyom Lebedev
3f1223155a k-quants : GCC12 compilation fix (#1792) 2023-06-10 22:51:36 +03:00
Andrei
303f5809f1 metal : fix issue with ggml-metal.metal path. Closes #1769 (#1782)
* Fix issue with ggml-metal.metal path

* Add ggml-metal.metal as a resource for llama target

* Update flake.nix metal kernel substitution
2023-06-10 17:47:34 +03:00
Aisuko
059e99066d doc : fix wrong address of BLIS.md (#1772)
Signed-off-by: Aisuko <urakiny@gmail.com>
2023-06-10 17:08:11 +03:00
Georgi Gerganov
17c10acfb4 ggml : force no_alloc == false when creating opt tensors (close #1699)
This is needed to make operators like ggml_view() be able to store their
parameters in the ggml context's memory and not get discarded when
no_alloc is true
2023-06-10 12:08:15 +03:00
Kawrakow
e9b66ee982 metal : add Q4_1 implementation (#1785)
23.3 ms / token, so just ~1% slower than q4_0.
Achieves 290 GB/s memory throughput.

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2023-06-10 11:28:11 +03:00
Kerfuffle
4f0154b0ba llama : support requantizing models instead of only allowing quantization from 16/32bit (#1691)
* Add support for quantizing already quantized models

* Threaded dequantizing and f16 to f32 conversion

* Clean up thread blocks with spares calculation a bit

* Use std::runtime_error exceptions.
2023-06-10 10:59:17 +03:00
Xingchen Song(宋星辰)
ef3171d162 ggml : workaround for missing _mm256_setr_m128i in GCC < 8 (#1638) 2023-06-10 10:49:40 +03:00
rankaiyx
555275a693 make : add SSSE3 compilation use case (#1659) 2023-06-10 09:41:59 +03:00
Robert Sung-wook Shin
98ed165574 OpenCL: Add release memory (#1741)
* Add opencl release memory

* Rename function name
2023-06-09 18:24:40 +02:00
Johannes Gäßler
ae9663f188 Windows nvcc workaround (#1753)
Fix gibberish output on Windows when using CUDA
2023-06-09 13:58:15 +02:00
Georgi Gerganov
b33dee282f metal : fix build "tanhf" -> "tanh" 2023-06-09 11:11:04 +03:00
AT
92f44ff7f7 metal : add GELU implementation (#1770)
Co-authored-by: Adam Treat <adam@nomic.ai>
2023-06-09 11:00:51 +03:00
Kawrakow
245fc3c37d metal : faster q4_0 (#1775)
* metal : 8% faster q4_0

Avoid copying into local uchar4 anf float4.

* metal : 17% faster Q4_0

Use 64 threads in a thread group.

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2023-06-09 10:39:59 +03:00
Kawrakow
72ff5282bf metal : add Q2_K implementation (#1762)
* metal : add Q2_K implementation

27.1 ms / token on M2 Max 30-core GPU, so about the
same speed as Q4_0. Memory throughput is ~156 GB/s.

The access pattern used in the Q2_K
CUDA implementation resulted in significantly lower
performance (~31 ms/token).

* Fixing merge conflicts

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2023-06-08 22:28:21 +03:00
Georgi Gerganov
0bf7cf1b29 Revert "ggml : load data into int8x16x4_t using vld4q_s8 on arm64 (#1738)"
This reverts commit 8432d4d9f7.
2023-06-08 20:48:14 +03:00
le.chang
8432d4d9f7 ggml : load data into int8x16x4_t using vld4q_s8 on arm64 (#1738) 2023-06-08 19:47:56 +03:00
Kawrakow
0f291e1f65 metal : Q6_K implementation (#1752)
* Metal implementation for Q4_K

Very slow for now:
42 ms / token, Q4_0 runs in 28 ms/token on my
30-core M2 Max GPU.

* Optimizing Q4_K on metal

The first token always takes longer, I guess because
the metal kernel is being jit-compiled.
So, using n = 128 to measure time.

At this point Q4_K takes 29.5 ms / token
compared to 27.2 ms / token for Q4_0.
Quite a bit better than the initial attempt,
but still not good enough.

* Optimizing q4_K metal dot some more

For n = 256 it is now 28.1 ms/token compared to
27 ms/token for q4_0.

* Fix after merge with master

* Metal implementation for Q6_K

Similar to the CUDA implementation.
No idea if this is the optimum for Metal, but the few
alternative variants I tried all had a lower performance.

We get 36.5 ms / token on M2 Max with 30 GPU cores.
This corresponds to ~200 GB/second throughput.

* clang-tidy : add config back

* Much better Q6_K implementation for metal

28.3 ms / token for 7B. Subtracting ~9 ms that is spent in
other compute graph operations, we are left with ~19 ms
for the matrix multiplications. The model is ~5.5 GB,
so we are getting 1000 / 19 * 5.5 = 290 GB/s!

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2023-06-08 19:46:22 +03:00
qingfengfenga
8fc8179919 Add llama.cpp docker support for non-latin languages (#1673)
* Modify Dockerfile default character set to improve compatibility (#1673)
2023-06-08 00:58:53 -07:00
Steven Roussey
b50b570ed9 ggml : fix fprintf warnings (#1720) 2023-06-08 10:12:28 +03:00
Georgi Gerganov
53aba3f393 clang-tidy : restore dot file from accidental deletion 2023-06-08 10:09:08 +03:00
Kawrakow
4161bdc04d metal : add Q4_K implementation (#1733)
* Metal implementation for Q4_K

Very slow for now:
42 ms / token, Q4_0 runs in 28 ms/token on my
30-core M2 Max GPU.

* Optimizing Q4_K on metal

The first token always takes longer, I guess because
the metal kernel is being jit-compiled.
So, using n = 128 to measure time.

At this point Q4_K takes 29.5 ms / token
compared to 27.2 ms / token for Q4_0.
Quite a bit better than the initial attempt,
but still not good enough.

* Optimizing q4_K metal dot some more

For n = 256 it is now 28.1 ms/token compared to
27 ms/token for q4_0.

* Fix after merge with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2023-06-08 10:08:23 +03:00
johnson442
0035858273 k-quants : add missing compile definition to CMakeLists (#1748) 2023-06-08 10:02:48 +03:00
Georgi Gerganov
5c64a0952e k-quants : allow to optionally disable at compile time (#1734)
* k-quants : put behind optional compile flag LLAMA_K_QUANTS

* build : enable k-quants by default
2023-06-07 10:59:52 +03:00
jacobi petrucciani
5b57a5b726 flake : update to support metal on m1/m2 (#1724) 2023-06-07 07:15:31 +03:00
Georgi Gerganov
4dc62c545d readme : add June roadmap 2023-06-07 07:15:08 +03:00
Willy Tarreau
35a84916fb main: add the possibility to open the prompt cache read-only (#1640)
The prompt cache constitutes a nice speed up when using the same prompt
prefix across multiple evaluations, but when using it, it will also be
updated, which is not always desirable. One use case is to have a large
prompt containing some context and usage rules, and a second part
containing variable data of the problem being studied. In this case it's
desirable to be able to save the first part once, and to always reuse it
as-is without updating it with the second part.

The new argument --prompt-cache-ro enables this read-only mode on the
prompt cache. The prompt's contents that match the cache are loaded
from the cache but the rest is not modified. This allowed to reduce a
total analysis time from 112s to 49.7s here, without having to backup
and restore a copy of the prompt, which takes significant time at 500
MB.

Signed-off-by: Willy Tarreau <w@1wt.eu>
2023-06-06 22:10:17 -04:00
Georgi Gerganov
2d7bf110ed llama : fix vram_scratch var 2023-06-06 22:54:39 +03:00
Georgi Gerganov
2a4e41a086 llama : fix compile warnings 2023-06-06 22:41:53 +03:00
Johannes Gäßler
17366df842 Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)
* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
2023-06-06 21:33:23 +02:00
Georgi Gerganov
44f906e853 metal : add f16 support 2023-06-06 20:21:56 +03:00
LostRuins
d5b111f53d Clblast fixes + enhancements to save VRAM and offload more layers (#1675)
* Use events instead of clFinish, where possible

* OpenCL: Don't load gpu layers into RAM, add mul_f32 kernel

* Reduce queueing overhead for contiguous tensors by using single mul kernel call

* Adapt to #1612 cl_mem malloc changes

* Reduce code duplication between cuda and opencl branches

* Improve implementation

* Clblast fixes + enhancements to save VRAM:

1. Change all Clblast buffers to CL_MEM_READ_WRITE, as the pool malloc currently doesn't properly handle them.
2. When recycling buffers in pool malloc, always assign the SMALLEST available buffer that fits, instead of the FIRST available buffer
3. When failing to recycle a buffer in pool malloc (all too small), instead recycle the largest available free buffer by resizing it.

* change max value size_t to use limits

* removed flags from the CL pool malloc, apply code tidying suggestions.
2023-06-06 19:00:01 +02:00
Georgi Gerganov
2d43387daf ggml : fix builds, add ggml-quants-k.o (close #1712, close #1710) 2023-06-06 10:18:03 +03:00
Georgi Gerganov
7ad7750c5c gitignore : add .clang-tidy 2023-06-06 09:55:25 +03:00
Georgi Gerganov
7a74dee6b4 llama : temporary disable Q6_K output quantization (#1711) 2023-06-06 09:39:38 +03:00
Spencer Sutton
590250f7a9 metal : add checks for buffer size (#1706)
Co-authored-by: Spencer Sutton <Spencer.Sutton@precisely.com>
2023-06-06 06:28:17 +03:00
Yuval Peled
f4c55d3bd7 docs : add performance troubleshoot + example benchmark documentation (#1674)
* test anchor link

* test table

* add benchmarks

* Add performance troubleshoot & benchmark

* add benchmarks

* remove unneeded line

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 23:32:36 +03:00
Foul-Tarnished
f1465624c2 readme : fix typo (#1700)
Fix a typo in a command in README.md
2023-06-05 23:28:37 +03:00
mgroeber9110
c2df36d60d llama : consistently catch and throw only exceptions deriving from std::exception (#1599)
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 23:24:29 +03:00
kiltyj
9d0693bce3 metal : use shared buffers between CPU and GPU (#1696)
* Use MTLDevice.newBufferWithBytesNoCopy to share buffers between CPU and GPU

* Page-align buffers used by Metal

* Remove trailing whitespace

* Only import unistd.h for Metal builds

* metal : remove unnecessary copies

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 23:24:04 +03:00
grahameth
efe0507632 ggml : fix internal overflow in ggml_time_us on Windows (#1702)
Co-authored-by: grahameth <->
2023-06-05 23:11:49 +03:00
Georgi Gerganov
e7fe66e670 ci : disable auto tidy (#1705) 2023-06-05 23:05:05 +03:00
Kawrakow
99009e72f8 ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 22:56:18 +03:00
Henri Vasserman
5220a991a5 Increase 3B scratch buffers. (#1698)
The 128 MB was too optimistic.
Too bad it is not dynamically computed.
2023-06-05 13:43:08 +03:00
Georgi Gerganov
d1f563a743 llama : fix Metal KV cache sync (close #1695) 2023-06-05 10:19:03 +03:00
Georgi Gerganov
827f5eda91 readme : update hot topics 2023-06-04 23:38:19 +03:00
Georgi Gerganov
ecb217db4f llama : Metal inference (#1642)
* mtl : export the LLaMA computation graph

* ci : disable temporary

* mtl : adapt the MNIST example as starter

* mtl : no need for mtl-export tool, add cli arg for main instead

* mtl : export just a small part of the graph for now to make it easier

* mtl : move MSL code into separate file for easy editing

* mtl : initial get_rows_q4_0 kernel

* mtl : confirmed get_rows_q4_0 is working correctly

* mtl : add rms_norm kernel + confirm working

* mtl : add mul kernel + confirm working

* mtl : initial mul_mat Q4 kernel (wrong results)

* mtl : mul_mat fixes (still wrong)

* mtl : another mul_mat Q4 (still does not work)

* mtl : working mul_mat q4

* ggml : fix handling of "view" ops in ggml_graph_import()

* mtl : add rope kernel

* mtl : add reshape and transpose handling

* ggml : store offset as opt arg for ggml_view_xd() operators

* mtl : add cpy kernel + handle view ops

* mtl : confirm f16 x f32 attention mul mat

* mtl : add scale kernel

* mtl : add diag_mask_inf kernel

* mtl : fix soft_max kernel

* ggml : update ggml_nbytes() to handle non-contiguous tensors

* mtl : verify V tensor contents

* mtl : add f32 -> f32 cpy kernel

* mtl : add silu kernel

* mtl : add non-broadcast mul kernel

* mtl : full GPU inference of the computation graph

* mtl : optimize rms_norm and soft_max kernels

* mtl : add f16 mat x f32 vec multiplication kernel

* mtl : fix bug in f16 x f32 mul mat + speed-up computation

* mtl : faster mul_mat_q4_0_f32 kernel

* mtl : fix kernel signature + roll inner loop

* mtl : more threads for rms_norm + better timing

* mtl : remove printfs from inner loop

* mtl : simplify implementation

* mtl : add save/load vocab to ggml file

* mtl : plug Metal inference into llama.cpp (very quick-n-dirty)

* mtl : make it work with main example

Lots of hacks but at least now it generates text

* mtl : preparing for merge

* mtl : clean-up ggml mtl interface + suport scratch / inplace

* mtl : remove temp / debug code

* metal : final refactoring and simplification

* Revert "ci : disable temporary"

This reverts commit 98c267fc77.

* metal : add comments

* metal : clean-up stuff, fix typos

* readme : add Metal instructions

* readme : add example for main
2023-06-04 23:34:30 +03:00
0cc4m
dcb2ed4826 OpenCL: Fix duplication of layers in VRAM and RAM, add GPU mul kernel (#1653)
* Use events instead of clFinish, where possible

* OpenCL: Don't load gpu layers into RAM, add mul_f32 kernel

* Reduce queueing overhead for contiguous tensors by using single mul kernel call

* Adapt to #1612 cl_mem malloc changes

* Reduce code duplication between cuda and opencl branches

* Improve implementation
2023-06-04 08:12:05 +02:00
Henri Vasserman
d8bd0013e8 Add info about CUDA_VISIBLE_DEVICES (#1682) 2023-06-03 16:35:20 +03:00
Jiří Podivín
b5c85468a3 Docker: change to calling convert.py (#1641)
Deprecation disclaimer was added to convert-pth-to-ggml.py
2023-06-03 15:11:53 +03:00
Evan Jones
136476e898 Fix prompt cache saving and chat-persistent rollover (#1678)
* Fix prompt cache saving and chat-persistent rollover (fixes #1670)

* clang-tidy

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

---------

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
2023-06-03 07:28:45 -04:00
Henri Vasserman
ffb06a345e OpenLLaMA 3B support (#1588)
This adds support to llama.cpp to load the model.

Currently missing are changes that are required from convert.py to convert the model correctly. It needs some changes to start reading the JSON configuration for HF models instead of deriving the values by guessing.

Co-authored-by: FNsi <125447286+FNsi@users.noreply.github.com>
2023-05-30 21:24:22 +03:00
Georgi Gerganov
7552ac5863 ggml : sync cgraph import / export API 2023-05-29 19:31:44 +03:00
Georgi Gerganov
5d1830b99d ggml : fix bug in ggml_alibi 2023-05-29 19:30:49 +03:00
DannyDaemonic
248367605e Work around for recalculating logits in cached prompts (Fixes #1585) (#1609)
* Work around for recalculating logits in cached prompts
2023-05-29 05:13:40 -07:00
Jiří Podivín
0e730dd23b Adding git in container package dependencies (#1621)
Git added to build packages for version information in docker image

Signed-off-by: Jiri Podivin <jpodivin@gmail.com>
2023-05-28 21:45:50 -07:00
Johannes Gäßler
3b126f654f LLAMA_DEBUG adds debug symbols (#1617) 2023-05-28 21:01:02 +02:00
Kerfuffle
1b78ed2081 Only show -ngl option when relevant + other doc/arg handling updates (#1625)
1. Add a `LLAMA_SUPPORTS_GPU_OFFLOAD` define to `llama.h` (defined when compiled with CLBlast or cuBLAS)
2. Update the argument handling in the common example code to only show the `-ngl`, `--n-gpu-layers` option when GPU offload is possible.
3. Add an entry for the `-ngl`, `--n-gpu-layers` option to the `main` and `server` examples documentation
4. Update `main` and `server` examples documentation to use the new style dash separator argument format
5. Update the `server` example to use dash separators for its arguments and adds `-ngl` to `--help` (only shown when compiled with appropriate support). It will still support `--memory_f32` and `--ctx_size` for compatibility.
6. Add a warning discouraging use of `--memory-f32` for the `main` and `server` examples `--help` text as well as documentation. Rationale: https://github.com/ggerganov/llama.cpp/discussions/1593#discussioncomment-6004356
2023-05-28 11:48:57 -06:00
Vladimir Zorin
337aea1139 examples : add --alias option to gpt_params to set use friendly model name (#1614) 2023-05-28 20:14:24 +03:00
Howard Su
bb051d9723 opencl : no need to allocate cl_mem on heap (#1612) 2023-05-28 20:13:36 +03:00
Howard Su
ca74884f66 opencl : use strstr to check if fp16 supported (#1611)
* Use strstr to check if fp16 supported

* Ensure ext_buffer is null terminated
2023-05-28 20:09:56 +03:00
apcameron
a6704643b6 ggml : add support for the RISCV architecture (#1616) 2023-05-27 23:03:25 +03:00
Kerfuffle
0df7d63e5b Include server in releases + other build system cleanups (#1610)
Set `LLAMA_BUILD_SERVER` in workflow so the `server` example gets build. This currently only applies to Windows builds because it seems like only Windows binary artifacts are included in releases.

Add `server` example target to `Makefile` (still uses `LLAMA_BUILD_SERVER` define and does not build by default)

Fix issue where `vdot` binary wasn't removed when running `make clean`.

Fix compile warnings in `server` example.

Add `.hpp` files to trigger workflow (the server example has one).
2023-05-27 11:04:14 -06:00
Henri Vasserman
97c9b77c4f Add documentation about CLBlast (#1604)
Installing, compiling and using.
2023-05-27 18:47:55 +03:00
Henri Vasserman
0ecb1bbbeb [CI] Fix openblas (#1613)
* Fix OpenBLAS build

* Fix `LLAMA_BLAS_VENDOR` CMake variable that should be a string and not a boolean.
2023-05-27 17:24:06 +03:00
Georgi Gerganov
93618031c7 ggml : add ggml_tensor_overhead() 2023-05-27 16:19:56 +03:00
Henri Vasserman
83c54e6da5 [CI] CLBlast: Fix directory name (#1606) 2023-05-27 14:18:25 +02:00
Georgi Gerganov
bdbda1b17a ggml : sync ggml core (minor additions, e.g. ggml_get_tensor_by_name()) 2023-05-27 12:23:16 +03:00
Kerfuffle
66874d4fbc Some improvements to loading the session with --prompt-cache (#1550)
Improvements to loading the session with `--prompt-cache` in the `main` example.

1. Fix an issue where the `--seed` parameter was ignored when loading a cached prompt.
2. When loading a cached prompt, you previously had to specify the saved prompt (or a prefix of it) again. This pull changes that behavior to default to the prompt that was cached if a prompt wasn't specified by the user.
2023-05-25 20:18:01 -06:00
Johannes Gäßler
1fcdcc28b1 cuda : performance optimizations (#1530)
* xor hack

* block y dim

* loop unrolling

* Fixed cmake LLAMA_CUDA_BY option

* Removed hipblas compatibility code

* Define GGML_CUDA_DMMV_BLOCK_Y if not defined

* Fewer iters, more ops per iter

* Renamed DMMV X/Y compilation options
2023-05-26 00:07:29 +03:00
Henri Vasserman
ac7876ac20 Update CLBlast to 1.6.0 (#1580)
* Update CLBlast to 1.6.0
2023-05-24 10:30:09 +03:00
Evan Jones
c31bbe934b readme : add docs for chat-persistent.sh (#1568)
* readme : add docs for chat-persistent.sh

* Update README.md
2023-05-24 09:24:01 +03:00
Senemu
1359b6aba5 chat-persistent.sh : use bracket expressions in grep (#1564) 2023-05-24 09:16:22 +03:00
Maarten ter Huurne
7d873811f3 Fix handling of "invalid property" when creating OpenCL command queue (#1565)
The `clCreateCommandQueue()` function will return the code
`CL_INVALID_QUEUE_PROPERTIES` when passed unsupported properties,
not `CL_INVALID_PROPERTY` as the original code was checking for.
2023-05-23 19:01:15 +03:00
0cc4m
2e6cd4b025 OpenCL Token Generation Acceleration (#1459)
* Move back to C++ for OpenCL

* Refactor OpenCL code to work more like the CUDA code, add missing functions

* Deduplicate dequant kernels

* Add OpenCL compile options

* Use compile args for preprocessing constants

* Restore default platform + device selection by id behavior

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Henri Vasserman <henv@hot.ee>
2023-05-23 00:33:24 +03:00
Steward Garcia
7e4ea5beff examples : add server example with REST API (#1443)
* Added httplib support

* Added readme for server example

* fixed some bugs

* Fix the build error on Macbook

* changed json11 to nlohmann-json

* removed some whitespaces

* remove trailing whitespace

* added support custom prompts and more functions

* some corrections and added as cmake option
2023-05-21 20:51:18 +03:00
Stefan Sydow
7780e4f479 make : .PHONY clean (#1553) 2023-05-21 17:03:44 +03:00
Georgi Gerganov
265db9834e ggml : output 3d sizes in ggml_graph_dump_dot() 2023-05-21 11:56:23 +03:00
Georgi Gerganov
fab49c685e ggml : update WASM SIMD 2023-05-20 20:00:41 +03:00
Zenix
b8ee340abe feature : support blis and other blas implementation (#1536)
* feature: add blis support

* feature: allow all BLA_VENDOR to be assigned in cmake arguments. align with whisper.cpp pr 927

* fix: version detection for BLA_SIZEOF_INTEGER, recover min version of cmake

* Fix typo in INTEGER

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Fix: blas changes on ci

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-05-20 17:58:31 +03:00
Henri Vasserman
9ecb30f959 OpenCL: Fixes for older devices. (#1435)
* Remove `constant`

* Rewrite platform and device selection

* Fix Q8_0
2023-05-20 17:57:39 +03:00
Juuso Alasuutari
29cf5596fe llama : define magic numbers as integer constants (#1518) (#1520)
The underlying representation of multibyte character literals is
implementation-defined. This could, at least in principle, cause
cross-build data export/import issues independent of endianness.

Define magic numbers as integer literals to be on the safe side.

Signed-off-by: Juuso Alasuutari <juuso.alasuutari@gmail.com>
2023-05-20 15:58:15 +03:00
Georgi Gerganov
3de84b2606 ggml : add ggml_clamp() (#1539)
* ggml : add ggml_clamp()

* ggml : indentation
2023-05-20 15:34:45 +03:00
Johannes Gäßler
affc76edfd cuda : loading models directly into VRAM, norm calculation on GPU, broadcasting for ggml_mul (#1483)
* Broadcasting for ggml_mul

* CUDA kernel for ggml_mul, norms in VRAM

* GPU weights not in RAM, direct loading with cuFile

* fixup! GPU weights not in RAM, direct loading with cuFile

* fixup! GPU weights not in RAM, direct loading with cuFile

* define default model path once, sync path with readme (#1366)

* ~7% faster Q5_1 AVX2 code (#1477)

* convert.py: Support models which are stored in a single pytorch_model.bin (#1469)

* Support models in a single pytorch_model.bin

* Remove spurious line with typo

* benchmark-matmul: Print the average of the test results (#1490)

* Remove unused n_parts parameter (#1509)

* Fixes #1511 lambda issue for w64devkit (mingw) (#1513)

* Fix for w64devkit and mingw

* make kv_f16 the default for api users (#1517)

* minor : fix compile warnings

* readme : adds WizardLM to the list of supported models (#1485)

* main : make reverse prompt option act as a stop token in non-interactive mode (#1032)

* Make reverse prompt option act as a stop token in non-interactive scenarios

* Making requested review changes

* Update gpt_params_parse and fix a merge error

* Revert "Update gpt_params_parse and fix a merge error"

This reverts commit 2bb2ff1748.

* Update gpt_params_parse and fix a merge error take 2

* examples : add persistent chat (#1495)

* examples : add persistent chat

* examples : fix whitespace

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* tests : add missing header

* ggml : use F16 instead of F32 in Q4_0, Q4_1, Q8_0 (#1508)

* ggml : use F16 instead of F32 in Q4_0, Q4_1 and Q8_0

* llama : bump LLAMA_FILE_VERSION to 3

* cuda : update Q4 and Q8 dequantize kernels

* ggml : fix AVX dot products

* readme : update performance table + hot topics

* ggml : fix scalar implementation of Q4_1 dot

* llama : fix compile warnings in llama_set_state_data()

* llama : fix name shadowing and C4146 (#1526)

* Fix name shadowing and C4146

* Fix if macros not using defined when required

* Update llama-util.h

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* Update llama-util.h

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* Code style

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Fix for mingw (#1462)

* llama : add llama_init_backend() API (close #1527)

* feature : add blis and other BLAS implementation support (#1502)

* feature: add blis support

* feature: allow all BLA_VENDOR to be assigned in cmake arguments. align with whisper.cpp pr 927

* fix: version detection for BLA_SIZEOF_INTEGER, recover min version of cmake

* Fix typo in INTEGER

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Revert "feature : add blis and other BLAS implementation support (#1502)"

This reverts commit 07e9ace0f9.

* GPU weights not in RAM, direct loading with cuFile

* llama : code style fixes + progress print fix

* ggml : ggml_mul better broadcast support

* cmake : workarounds for cufile when CMake version < 3.25

* gg rebase fixup

* Loop in llama.cpp, fixed progress callback

* Attempt clang-tidy fix

* llama : fix vram size computation

* Add forgotten fclose()

---------

Co-authored-by: András Salamon <ott2@users.noreply.github.com>
Co-authored-by: Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
Co-authored-by: Tom Jobbins <784313+TheBloke@users.noreply.github.com>
Co-authored-by: rankaiyx <rankaiyx@rankaiyx.com>
Co-authored-by: Stephan Walter <stephan@walter.name>
Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>
Co-authored-by: Erik Scholz <Green-Sky@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: David Kennedy <dakennedyd@gmail.com>
Co-authored-by: Jason McCartney <jmac@theroot.org>
Co-authored-by: Evan Jones <evan.q.jones@gmail.com>
Co-authored-by: Maxime <672982+maximegmd@users.noreply.github.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Zenix <zenixls2@gmail.com>
2023-05-20 15:19:28 +03:00
Georgi Gerganov
ea600071cb Revert "feature : add blis and other BLAS implementation support (#1502)"
This reverts commit 07e9ace0f9.
2023-05-20 12:03:48 +03:00
Zenix
07e9ace0f9 feature : add blis and other BLAS implementation support (#1502)
* feature: add blis support

* feature: allow all BLA_VENDOR to be assigned in cmake arguments. align with whisper.cpp pr 927

* fix: version detection for BLA_SIZEOF_INTEGER, recover min version of cmake

* Fix typo in INTEGER

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-05-20 12:02:48 +03:00
Georgi Gerganov
ec2e10c444 llama : add llama_init_backend() API (close #1527) 2023-05-20 11:06:37 +03:00
DannyDaemonic
d2c59b8ba4 Fix for mingw (#1462) 2023-05-20 00:40:02 -07:00
Maxime
503db28849 llama : fix name shadowing and C4146 (#1526)
* Fix name shadowing and C4146

* Fix if macros not using defined when required

* Update llama-util.h

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* Update llama-util.h

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* Code style

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-05-20 10:22:37 +03:00
Georgi Gerganov
8a203f9fa1 llama : fix compile warnings in llama_set_state_data() 2023-05-20 10:14:43 +03:00
Georgi Gerganov
4fd3e29297 ggml : fix scalar implementation of Q4_1 dot 2023-05-20 10:13:19 +03:00
Georgi Gerganov
2d5db48371 ggml : use F16 instead of F32 in Q4_0, Q4_1, Q8_0 (#1508)
* ggml : use F16 instead of F32 in Q4_0, Q4_1 and Q8_0

* llama : bump LLAMA_FILE_VERSION to 3

* cuda : update Q4 and Q8 dequantize kernels

* ggml : fix AVX dot products

* readme : update performance table + hot topics
2023-05-19 22:17:18 +03:00
Georgi Gerganov
6986c7835a tests : add missing header 2023-05-19 21:17:28 +03:00
Evan Jones
943e6081cc examples : add persistent chat (#1495)
* examples : add persistent chat

* examples : fix whitespace

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-05-19 20:39:51 +03:00
Jason McCartney
7694b52b9a main : make reverse prompt option act as a stop token in non-interactive mode (#1032)
* Make reverse prompt option act as a stop token in non-interactive scenarios

* Making requested review changes

* Update gpt_params_parse and fix a merge error

* Revert "Update gpt_params_parse and fix a merge error"

This reverts commit 2bb2ff1748.

* Update gpt_params_parse and fix a merge error take 2
2023-05-19 20:24:59 +03:00
David Kennedy
79e3efb0e9 readme : adds WizardLM to the list of supported models (#1485) 2023-05-19 20:16:30 +03:00
Georgi Gerganov
4b7e245adf minor : fix compile warnings 2023-05-19 20:14:51 +03:00
Erik Scholz
5ea4339273 make kv_f16 the default for api users (#1517) 2023-05-18 19:31:01 +02:00
DannyDaemonic
ee9654138a Fixes #1511 lambda issue for w64devkit (mingw) (#1513)
* Fix for w64devkit and mingw
2023-05-18 19:30:40 +02:00
Stephan Walter
dc271c52ed Remove unused n_parts parameter (#1509) 2023-05-17 22:12:01 +00:00
rankaiyx
c238b5873a benchmark-matmul: Print the average of the test results (#1490) 2023-05-17 16:47:58 +02:00
Tom Jobbins
2b2646931b convert.py: Support models which are stored in a single pytorch_model.bin (#1469)
* Support models in a single pytorch_model.bin

* Remove spurious line with typo
2023-05-17 00:04:35 +02:00
Ilya Kurdyukov
42627421ec ~7% faster Q5_1 AVX2 code (#1477) 2023-05-16 18:36:47 +00:00
András Salamon
9560655409 define default model path once, sync path with readme (#1366) 2023-05-16 17:46:34 +02:00
sandyiscool
2a5ee023ad Add alternate include path for openblas (#1476)
In some linux distributions (fedora, for example), the include path for openblas is located at '/usr/local/include'
2023-05-16 10:30:15 +02:00
zrm
63d20469b8 fix get_num_physical_cores() (#1436)
* fix get_num_physical_cores()
had been broken on complex topologies because "cpu cores" in /proc/cpuinfo is per-"physical id"

* Add spaces to maintain consistent formatting

---------

Co-authored-by: slaren <ddevesa@gmail.com>
2023-05-15 04:25:42 +02:00
slaren
b5c9295eef benchmark-matmul: fix clang-tidy issues, report results in GFLOPS (#1458)
* benchmark-matmul: fix command line parsing, replace macros with functions, report results in GFLOPS
2023-05-14 22:46:00 +02:00
Johannes Gäßler
eb363627fd cuda : deduplicated dequantization code (#1453) 2023-05-14 21:53:23 +03:00
xaedes
79b2d5b69d ggml : alternative fix for race condition bug in non-inplace ggml_compute_forward_diag_mask_f32 (#1454)
* fix race condition bug in non-inplace ggml_compute_forward_diag_mask_f32

memcpy needs to be synchronized across threads to avoid race conditions.
=> do it in INIT phase

* remove trailing whitespace

* Update ggml.c

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-05-14 18:55:02 +03:00
Georgi Gerganov
13c351ad72 ggml : various fixes (#1450)
- `ggml_rope()`
- `ggml_diag_mask_inf()` multi-threaded
- compatibility with scratch buffers
2023-05-14 18:22:50 +03:00
katsu560
60f8c361ca ggml : add AVX support based on AVX2 code (#1430) 2023-05-14 10:03:51 +00:00
Georgi Gerganov
601a033475 ggml : add GGML_QNT_VERSION to track quantization format changes
https://github.com/ggerganov/ggml/issues/150#issuecomment-1546625668
2023-05-14 10:20:19 +03:00
Georgi Gerganov
08737ef720 cuda : fix convert function (#1412) 2023-05-13 17:40:58 +03:00
Georgi Gerganov
bda4d7c215 make : fix PERF build with cuBLAS 2023-05-13 17:25:09 +03:00
Georgi Gerganov
5a5aeb1e91 llama : fix unused warning 2023-05-13 16:55:14 +03:00
Georgi Gerganov
66841fdb0e ggml : multi-thread mul and diag_mask ops (#1428) 2023-05-13 16:48:03 +03:00
Johannes Gäßler
905d87b70a ggml : GPU-accelerated token generation (#1412)
* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-05-13 16:38:36 +03:00
xaedes
f954edda93 ggml : implement backward pass for llama + small training-llama-from-scratch example (#1360)
* implement 8 of 14 missing backward pass operations used by llama

- GGML_OP_ADD_AT
- GGML_OP_CPY
- GGML_OP_MUL_MAT (src0.grad)
- GGML_OP_PERMUTE
- GGML_OP_RESHAPE
- GGML_OP_SCALE
- GGML_OP_TRANSPOSE
- GGML_OP_VIEW

implement additional ggml operation GGML_OP_ADD_AT, which is necessary for backward pass of GGML_OP_VIEW.

this operation adds src1 to src0 with data offset, i.e. to view(src0, ..., offset).
the values are return in a tensor size of src0. values outside of [data+offset:data+offset+nbytes(src1)] are just the original values from src0.

still missing backward passes for llama:

- GGML_OP_DIAG_MASK_INF
- GGML_OP_GET_ROWS
- GGML_OP_RMS_NORM
- GGML_OP_ROPE
- GGML_OP_SILU
- GGML_OP_SOFT_MAX

* implement 5 of 6 missing backward pass operations used by llama

- GGML_OP_DIAG_MASK_INF
- GGML_OP_GET_ROWS
- GGML_OP_RMS_NORM
- GGML_OP_SILU
- GGML_OP_SOFT_MAX

add necessary ggml operations GGML_OP_ADD1, GGML_OP_SILU_BACK, GGML_OP_RMS_NORM_BACK, GGML_OP_DIAG_MASK_ZERO, and GGML_OP_ROPE_BACK

GGML_OP_ADD1 is necessary to add a scalar value in the backward pass of GGML_OP_SOFT_MAX
GGML_OP_ADD1 could also be replaced by using GGML_OP_ADD and GGML_OP_REPEAT, but the performance would be worse. additionally GGML_OP_REPEAT will return unexpected value when the the input to GGML_OP_SOFT_MAX contains only a single scalar. in this case GGML_OP_REPEAT will not return the value that should be repeated (src1) but the value which shape the result should take (src0). So in this case it can not replace GGML_OP_ADD1.

GGML_OP_SILU_BACK, GGML_OP_RMS_NORM_BACK and GGML_OP_ROPE_BACK are necessary for backward pass of GGML_OP_SILU, GGML_OP_RMS_NORM and GGML_OP_ROPE. The backward pass for these functions cannot be easily composed of existing operations. Since the backward pass builds a computation graph we need operations forward pass implementations of the the required backward passes. Sounds a bit confusing at first, I know...

GGML_OP_DIAG_MASK_ZERO is necessary for backward pass of GGML_OP_DIAG_MASK_INF.

Some operations where previously inplace-only. for backward pass there needs to be non-inplace variants.
staying consistent with other operations that have non-inplace and inplace variants, the operations are changed to non-inplace and
functions with "_inplace" are added which are inplace.
in llama we need to call the inplace variants so that it is implemented as before.
for llama backward pass we need to use the non-inplace variants.

still not completely implemented backward passes for llama:

- GGML_OP_ROPE: needs forward pass for GGML_OP_ROPE_BACK
- GGML_OP_GET_ROWS: only necessary for tokenizer

* norm & rms_norm can not be threaded:

after investigation rms norm for quite some time I come to the conclusion that neither norm, nor rms_norm can be threaded, because we need mean over all items, not just of the slices each thread sees.

* remove already resolved TODO

* implement backward pass of ggml_rope and ggml_rope_back

* implement backward pass for ggml_get_rows and for new operation ggml_get_rows_back

* add test-grad0.c

* use GGML_PRINT_DEBUG for debug messages which will otherwise flood the console

* test both gradients of mul_mat

* disable graph dot export as it floods console

* bug fixes for silu_back

* successfully test silu backward

* bug fix for scale backward pass

use sum instead of mean for gradient of scalar scale parameter

* successfully test scale backward

* improve performance of sum backward pass

use add1(x,y) instead of add(x,repeat(y,x))

* improve performance of sqr backward pass

use scale(x,y) instead of mul(x,repeat(y,x))

* successfully test rope backward

* bug fix for cpy backward pass

* successfully test cpy backward

* bug fix for reshape backward pass

* successfully test reshape backward

* add test-opt.c

this uses ggml_opt to train a,b for minimal e=sum(sqr(c - a*b)) for random initial a,b,c

* correctly implement softmax backward pass using new operation ggml_diag

ggml_diag constructs diagonal matrices with entries.
ggml_diag(shape[a,1,c,d]) -> shape[a,a,c,d]

* successfully test soft_max backward

* align shape annotations

* add shape annotations for llama

* de-duplicate ggml_forward_dup code taking care of contiguous tensors of same type.

with this we can duplicate tensor of any typ as long as they are contiguous.

* fix ggml_compute_forward_dup_same_cont for when nelements < nthreads

when more threads are used than elements exist ie1 was less than ie0, resulting in invalid negative byte count argument in memcpy

* bug fix for add_at forward

required for view backward pass

src0 values must be copied to dst, because during addition we don't touch all dst elements in contrast to the normal add function.

* successfully test view backward

* minor code format improvement

* fix ggml_forward_add functions to work correctly with transposed tensors

uses the same logic as in ggml_compute_forward_add_q_f32, but make it consistent across all ggml_compute_forward_add_... functions.
this also slightly changes the mem access pattern of the different threads to works as in ggml_compute_forward_add_q_f32.

* fix ggml_forward_add1 functions to work correctly with transposed tensors

uses the same logic as in ggml_compute_forward_add1_q_f32, but make it consistent across all ggml_compute_forward_add1_... functions.
this also slightly changes the mem access pattern of the different threads to works as in ggml_compute_forward_add1_q_f32.

* test-grad0.c : add print_elements to help with debugging

* successfully test permute backward

* some minor test-grad0 fixes

* fix sub, mul and div functions to work correctly with transposed tensors

uses the same logic as in add

* implement ggml_cont backward pass

* successfully test transpose backward and permute for all permutations

also test sub, mul and div up to max n_dims

* test-grad0.c add TODO for view_2d and view_3d

add_at (required for view backward pass) is a bit tricky for n_dims > 1.

* fix comments

* successfully test diag_mask_inf and diag_mask_zero backward

* test-grad0 : fix test for div

nargs and ndims was swapped, corrupting the stack

* fix diag_mask to work with non-inplace input

* move dup call into the actual add_at functions

* fix get rows backward pass

* successfully test get_rows backward

* fix view backward pass

add nb parameters to add_at like in view.
together with offset they define how to view dst and src0 during the add_at operation.

* successfully test backward pass of view_1d, view_2d and view_3d

* fix backward pass for rms_norm

I would have used formulas from other frameworks, but they differed so I could not decide which is correct.
Instead it was derived here in comment using manual forward-backward automatic differention of rms_norm and simplification.

* successfully test backward pass of rms_norm

some tests may fail when gradients are large.
could not find a satisfying configuration to check for abs error and relative error that passes all tests while still actually testing the results with tight enough error bounds.
when looking at the values the "failed" tests look actually ok. for example:

rms_norm: ndims=2, i=0, k=2, x0=0.000153, xm=0.000053, xp=0.000253, f0=0.278594, f1=0.086213, g0=961.905457, g1=966.064941, eps=0.000100, error_abs=4.159485, error_rel=0.004324

it is due to the test logic in check_gradients that they fail.

* add todos for llama backward pass

- implementation for ADD1 backward pass should probably use sum instead of mean (but this backward pass is not required)
- repeat is not yet tested and looks like it only works for single element src0 inputs.

* add operation ggml_sum_rows

ggml_sum_rows(shape[a,b,c,d]) -> shape[1,b,c,d]

* add missing GGML_OP_SUM_ROWS

* fix backward pass for repeat

requires ggml_sum_rows

* successfully test backward pass of repeat

* update quantization types in switch-case of add_at and add1

* add baby-llama example training a very small llama model from scratch to output a sinusoidal wave.

had to increase maximum number of optimization parameters to train from scratch.

* fix softmax in baby-llama example

* switching from training with adam to lbfgs produces much better results in the baby-llama example

* train with two examples, creating new tensors each time..

* fix bug when using ggml_opt to optimize params in one context and use a renewable context for eval and opt

when not keeping gradients of model parameters they are overwritten by tensors created by opt, which may be invalid after opt context is renewed.
so we need to keep the original gradients and make dups for opt

* train on multiple examples, generate & print tokens with trained model afterwards

ctx0 for evaluation and optimization is renewed for each sample

* add ggml_reshape_1d, ggml_reshape_4d and ggml_view_4d

* fix soft_max backward pass for input->ne[1] != 1

* add ggml_log operation necessary for cross entropy loss

* add test for ggml_log gradients

* implement backward pass for ggml_sum_rows, necessary for cross entropy loss

* implement ggml_repeat support for rank > 2 tensors

* add test for ggml_sum_rows gradients

* fix training get_example_targets

predict the next token, not the current token!

* add square_error_loss and cross_entropy_loss functions

* optimize loss over multiple samples

this increases computation graph, need parallel batched forward for more efficiency.

* fix backward pass for add_at and change arguments to have same order as in view

* add ggml_set(ctx, a, b) to set b in view of a and return modified a

necessary to set values into kv_self cache and properly propagate the gradients

* fix kv_self gradients for training

use ggml_set instead of ggml_cpy to set kv_self cache with properly propagating gradients

* replace inplace operations for training with copying operations to allow gradient propagation

* add GGML_ASSERT to catch ggml_rope and back value errors

* add trainable lora-only model with all big matrices C split into A,B with A*B=C

this is not a lora-finetune, but the whole model changed to have only low-rank "lora" matrices.

training this instead of the normal model resulted in much worse results though...

* vastly improve training results

instead of logit targets 0 and 1 use -1 and +1.

* shorten code using a variable

* change name of GGML_OP_ADD_AT to GGML_OP_ACC

* smaller default values for baby llama model parameters

* update static assert of GGML_OP_COUNT

* remove shape annotations in llama_eval_internal

* revert disabling of threading for rms_norm and norm

* rename print functions in baby-llama example

* fix call to ggml_set_name

* add missing include for strcmp, etc

* remove trailing whitespace

* reduce number of test-grad0 iterations

avoid exceeding timeout of automated tests

* remove busy loop that was used as sleep for slower sinus wave generation

* disable slow tests grad0 and opt to avoid exceeding timeouts

* c++ in baby-llama example

use c++ includes instead of c includes
use std::min, std::max instead of MIN, MAX macros

* c++ in baby-llama example

use c++ includes instead of c includes
use std::min, std::max instead of MIN, MAX macros

* ggml : fix compiler warnings + cosmetic changes

* ggml : fix nullptr derefs in GGML_OP_CONT and GGML_OP_RESHAPE back

* swap arguments to vDSP_vdiv call

documentation for vDSP_vdiv states: "Note that B comes before A!"

* swap arguments to vDSP_vdiv call

documentation for vDSP_vdiv states: "Note that B comes before A!"

* ggml : swap vDSP_vsub args as per documentation

* add parallel batched forward function for baby-llama training

* cleanup code for batched training

* remove trailing whitespace

* minor : fix compiler warnings + indentation style

* ggml : fix null ptr deref in backward pass

* ggml : remove Q4_2 remnants

* ggml : fix clang-tidy warnings

* baby-llama : couple of clang-tidy warnings

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-05-13 15:56:40 +03:00
Georgi Gerganov
f048af0230 ggml : sync alibi fix from ggml repo 2023-05-13 11:54:33 +03:00
3ooabkhxtn
ac0cd259d5 Adding SSE instructions to ggml_vec_dot_q4_0_q8_0 (#1413) 2023-05-13 08:43:33 +00:00
Georgi Gerganov
0cd22e190a llama : fix various warnings 2023-05-13 11:23:15 +03:00
Rinne
6456a4eb9f embedding : remove unused code (#1426) 2023-05-13 10:24:20 +03:00
Georgi Gerganov
cdd5350892 readme : update Q4_0 perplexities
I think these were affected by the removal of the `round` during quantization
2023-05-13 09:12:44 +03:00
Georgi Gerganov
738ace394a llama : free ggml context in set / copy state data (close #1425) 2023-05-13 09:08:52 +03:00
Henri Vasserman
699b1ad7fe opencl : fix kernels for the new formats (#1422)
* Fix OpenCL kernels for the new formats

* Fix Q5_0 alignment issues.
2023-05-13 09:01:15 +03:00
Georgi Gerganov
fb62f92433 llama : fix --mtest option (close #1414) 2023-05-12 21:44:20 +03:00
Johannes Gäßler
773ee249fb CLI args use - instead of _, backwards compatible (#1416) 2023-05-12 14:34:55 +00:00
slaren
553fd4d4b5 Add clang-tidy reviews to CI (#1407) 2023-05-12 15:40:53 +02:00
Rinne
089b1c93ba readme : add C#/.NET bindings repo (#1409) 2023-05-12 08:39:40 +03:00
Georgi Gerganov
b9fd7eee57 ggml : remove bit shuffling (#1405)
* ggml : remove Q4_0 bit shufling (ARM NEON)

* ggml : remove Q4_1 bit shuffling (ARM NEON + reference)

* ggml : nibbles_from_floats() + bytes_from_nibbles() (ARM NEON)

* ggml : remove Q4_2 bit shuffling (WIP, BROKEN)

* ggml : remove Q5_0 bit shuffling (ARM NEON)

* ggml : 2x faster scalar implementations

* ggml : remove Q5_1 bit shuffling (ARM NEON + scalar)

* ggml : simplify scalar dot

* ggml : remove WASM SIMD bit shuffling + remove vzip for ARM 32-bit

* ggml : fix Q4_1 quantization

* ggml : update cuBLAS + normalize variable names

* ggml : remove Q4_2 mode

* ggml : minor formatting

* ggml : fix Q5_0 quantization

* scripts : add script for measuring the time per token

* AVX implementations (#1370)

* ggml : uniform 5th bit extraction

* llama : produce error upon loading old model files

* llama : fix model magic/version write

* ggml : speed-up Q5_0 + Q5_1 at 4 threads

* ggml : preserve old Q4 and Q5 formats

* ggml : simplify Q8_1 - no need for low / high sums anymore

* ggml : fix Q8_0 and Q8_1 rounding

* Revert "AVX implementations (#1370)"

This reverts commit 948d124837.

* ggml : fix AVX2 implementation

* sha : update hashes for 7B and 13B

* readme : update timings + remove warning banner

* llama : update v2 PR number to 1405

* ggml : fix WASM comments

* ggml : back to original bit order

* readme : add note that Q4 and Q5 have been changed

* llama : fix return for unknown version

---------

Co-authored-by: Stephan Walter <stephan@walter.name>
2023-05-12 00:23:08 +03:00
CRD716
b608b55a3e prompts : model agnostic DAN (#1304)
* add model-agnostic dan prompt

* quick readme update

* save a token

* Revert "quick readme update"

This reverts commit 8dc342c069.
2023-05-11 18:10:19 +03:00
Evan Jones
cf348a60e0 main : add option to save full output to session (#1338)
* main : add option to save full output to session

* split behavior into --session and --prompt-cache

* restore original implementation with new names

* PR comments

* move the check for incompatible parameters to gpt_params_parse

* Fix whitespace

Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>

---------

Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>
2023-05-10 11:37:14 -04:00
DannyDaemonic
e6a46b0ed1 Locale fix for Windows (#1379) 2023-05-09 19:53:28 +02:00
Sami Farin
9f8dbc4787 use pause asm insn in busyloop to run the CPU (13600K) 10 °C cooler (#1314)
* use pause asm insn in busyloop to run the CPU (13600K) 10 °C cooler

Tested with a 13B model.

* use _mm_pause() in busyloop

* use _mm_pause() in busyloop on x86_64 to reduce power consumption
2023-05-09 14:29:20 +02:00
DannyDaemonic
41654efea8 Interface improvements and --multiline-input (previously --author-mode) (#1040)
* Interface improvements
* Multiline input
* Track character width
* Works with all characters and control codes + Windows console fixes
2023-05-08 19:45:48 -07:00
Georgi Gerganov
56551bc11f readme : add notice about upcoming breaking change 2023-05-08 22:52:18 +03:00
AlpinDale
fe60904eef readme : add TOC and Pygmalion instructions (#1359) 2023-05-08 19:33:30 +03:00
Pavol Rusnak
003ba2fb43 llama : fix hparams shadow (#1367)
fixes #1363
2023-05-08 17:48:21 +03:00
Georgi Gerganov
f9a6364912 llama : require first token to be BOS (#1303)
* llama : require first token to be BOS

* scripts : add ppl-run-all.sh

* perplexity : add BOS for each chunk

* readme : update perplexity values after BOS fix

* perplexity : add clarifying comments
2023-05-08 17:41:54 +03:00
ubik2
95078cc554 convert: add ability to convert safetensors files (#1276)
* when loading a safetensors file, ignore the metadata header
* check for safetensors files first, and only use PyTorch versions when safetensors aren't available
2023-05-08 13:54:26 +02:00
Johannes Gäßler
1f48b0abcf Documented CUDA reproducibility, added warning (#1346) 2023-05-08 02:42:01 +02:00
Henri Vasserman
e1295513a4 CI: add Windows CLBlast and OpenBLAS builds (#1277)
* Add OpenCL and CLBlast support

* Add OpenBLAS support

* Remove testing from matrix

* change build name to 'clblast'
2023-05-07 13:20:09 +02:00
swittk
1b0fd45465 ggml : Allow usage of CLBlast alongside Accelerate.framework (#1336)
Minor edit in ggml.c which originally would prevent OpenCL from loading completely if GGML_USE_ACCELERATE was defined.
Minor speedup in prompt eval time.
2023-05-06 23:03:23 -04:00
Jed Fox
3924088512 Remove default arguments from sampling functions (#1343) 2023-05-06 17:01:47 -04:00
DaniAndTheWeb
173d0e6419 makefile: automatic Arch Linux detection (#1332)
This commit is a port of a detection method used in koboldcpp's Makefile in order to automatically set the -lcblas option on Arch Linux
2023-05-05 23:57:14 +02:00
Erik Scholz
a3b85b28da ci : add cublas to windows release (#1271) 2023-05-05 22:56:09 +02:00
Pavol Rusnak
921dcee00a readme: add missing info (#1324) 2023-05-05 16:43:36 +02:00
Ionoclast Laboratories
2d13786e91 Fix for OpenCL / clbast builds on macOS. (#1329) 2023-05-05 14:18:21 +02:00
Benjamin Lecaillon
a90e96b266 Convert.py @staticmethod (#1327)
* Line 698 has one #staticmethod and should not

otherwise throw error at unpickle.load() as not callable

* Update convert.py

---------

Co-authored-by: Ivan Stepanov <ivanstepanovftw@gmail.com>
2023-05-05 03:17:07 +03:00
slaren
94c5652fc0 quantize: make output filename optional, default to ggml-model-<ftype>.bin (#1301) 2023-05-05 00:58:56 +02:00
Ivan Stepanov
34d9f22f44 Wrap exceptions in std::exception to verbose output on exception. (#1316) 2023-05-04 18:56:27 +02:00
Ivan Stepanov
d3e8093e9b convert: support DT_BF16 tensors (#1309)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io>
2023-05-04 18:54:37 +02:00
44670
360cfe5bec readme : add OpenBuddy link (#1321) 2023-05-04 19:33:31 +03:00
44670
2edbdb0f99 main : add --in-suffix option (#1318)
* adding --in-suffix option

* print input suffix before generation
2023-05-04 18:41:12 +03:00
Ron Jailall
20fbf2a2a0 ggml : change immintrin.h to intrin.h for compatibility (#1307)
* change immintrin.h to intrin.h for compatibility

Building on windows11 arm throws an error on this line. Seems like using intrin.h covers x86 and and arm

* conditional def of intrin.h

* fix typo in ggml.c
2023-05-04 18:05:59 +03:00
DannyDaemonic
db1080876a Only escape prompts when used with -e (#1311) 2023-05-04 05:08:25 -07:00
DannyDaemonic
c65a7fbfa9 Update main's README.md with new features (#1296) 2023-05-04 03:02:59 -07:00
Tomas
f647ce040f fix #1224 reverse prompt and multi line (#1297)
* fix reverse prompt and multi line

* Code Formatting

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-05-04 03:02:30 -07:00
Georgi Gerganov
799fdc1b5d ggml : vectorize Q8_0 quantization
https://github.com/ggerganov/ggml/pull/127#issuecomment-1533648531
2023-05-03 23:24:20 +03:00
khimaros
6daa09d879 examples : read chat prompts from a template file (#1196) 2023-05-03 20:58:11 +03:00
Georgi Gerganov
bca9ad938a minor : fix whitespaces (#1302) 2023-05-03 20:09:42 +03:00
Georgi Gerganov
e2a937ca6a minor : fix trailing whitespaces 2023-05-03 18:43:23 +03:00
KASR
b0c71c7b6d scripts : platform independent script to verify sha256 checksums (#1203)
* python script to verify the checksum of the llama models

Added Python script for verifying SHA256 checksums of files in a directory, which can run on multiple platforms. Improved the formatting of the output results for better readability.

* Update README.md

update to the readme for improved readability and to explain the usage of the python checksum verification script

* update the verification script

I've extended the script based on suggestions by @prusnak

The script now checks the available RAM, is there is enough to check the file at once it will do so. If not the file is read in chunks.

* minor improvment

small change so that the available ram is checked and not the total ram

* remove the part of the code that reads the file at once if enough ram is available

based on suggestions from @prusnak i removed the part of the code that checks whether the user had enough ram to read the entire model at once. the file is now always read in chunks.

* Update verify-checksum-models.py

quick fix to pass the git check
2023-05-03 18:31:28 +03:00
CRD716
a8a2efdc81 examples : various prompt and example fixes (#1298)
* fix dan.txt

* miku prompt improvements

* use common characters
2023-05-03 18:26:47 +03:00
Evan Jones
e216aa0463 llama : only copy used KV cache in get / set state (#1272)
* llama : only copy used KV cache in get / set state

* switch to ggml for copying k, v

* avoid designated initializers
2023-05-02 22:26:13 -04:00
DannyDaemonic
2485d7a4d3 Process escape sequences given in prompts (#1173) 2023-05-02 18:46:20 -07:00
DannyDaemonic
13b0c68ed7 Handle signals properly on Windows (#1123) 2023-05-02 18:01:57 -07:00
DannyDaemonic
55bc5f0900 Call sh on build-info.sh (#1294) 2023-05-02 17:52:35 -07:00
kuvaus
9daff419f6 fix build-info.h for git submodules (#1289)
* make git build info work with submodules

---------

Co-authored-by: Green Sky <green@g-s.xyz>
2023-05-03 02:43:43 +02:00
slaren
bf4b22ffe4 fix missing parameters in llama_init_from_gpt_params (#1293) 2023-05-03 01:36:45 +02:00
Ron Evans
67c77799e0 examples : add llama_init_from_gpt_params() common function (#1290)
Signed-off-by: deadprogram <ron@hybridgroup.com>
2023-05-02 23:39:51 +03:00
Georgi Gerganov
0e6cbff1b7 llama : fix compile warnings 2023-05-02 23:09:08 +03:00
Georgi Gerganov
5d5817ca60 ggml : fix 32-bit ARM 2023-05-02 22:14:50 +03:00
Ron Evans
8c9be35ff9 examples : improve vertical alignment of a few variables (#1286)
Signed-off-by: deadprogram <ron@hybridgroup.com>
2023-05-02 20:53:52 +03:00
Marvin Gießing
cc0bb7235c ggml : fix ppc64le build error and make cmake detect Power processors (#1284)
* Fix ppc64le build issue

* Added support to detect ppc64* processors
2023-05-02 19:42:16 +03:00
Robert Brisita
2bb992f034 llama : allow 0 as a seed number. (#1275) 2023-05-02 19:23:44 +03:00
Ron Evans
e2cd506999 main : switch input_noecho to input_echo to remove negation (#979)
Signed-off-by: deadprogram <ron@hybridgroup.com>
2023-05-02 19:13:26 +03:00
slaren
2d099e5193 ggml: add names to tensors (#1268)
* ggml: add names to tensors

* minor improvements to dot file formatting
2023-05-02 16:03:00 +02:00
DannyDaemonic
f4cef87edf Add git-based build information for better issue tracking (#1232)
* Add git-based build information for better issue tracking

* macOS fix

* "build (hash)" and "CMAKE_SOURCE_DIR" changes

* Redo "CMAKE_CURRENT_SOURCE_DIR" and clearer build messages

* Fix conditional dependency on missing target

* Broke out build-info.cmake, added find_package fallback, and added build into to all examples, added dependencies to Makefile

* 4 space indenting for cmake, attempt to clean up my mess in Makefile

* Short hash, less fancy Makefile, and don't modify build-info.h if it wouldn't change it
2023-05-01 18:23:47 +02:00
slaren
58b367c2d7 cuBLAS: refactor and optimize f16 mat mul performance (#1259)
* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
2023-05-01 18:11:07 +02:00
xloem
ea3a0ad6b6 llama : update stubs for systems without mmap and mlock (#1266)
Co-authored-by: John Doe <john.doe@example.com>
2023-05-01 15:58:51 +03:00
Kerfuffle
2bdc09646d ggml : fix ggml_used_mem() (#1264) 2023-05-01 14:56:07 +03:00
Georgi Gerganov
70269cae37 llama : fix session load / save (#1263) 2023-05-01 14:54:59 +03:00
slaren
b925f1f1b0 cuBLAS: fall back to pageable memory if pinned alloc fails (#1233)
* cuBLAS: fall back to pageable memory if pinned alloc fails

* cuBLAS: do not use pinned memory if env variable GGML_CUDA_NO_PINNED is set
2023-05-01 13:32:22 +02:00
Alex Klinkhamer
90b19bd6ee llama : let context be const when accessing const data (#1261) 2023-05-01 10:24:20 +03:00
Georgi Gerganov
7ff0dcd320 ggml : fix UB (int << 31) 2023-04-30 22:28:51 +03:00
Pavol Rusnak
6f79699286 build: add armv{6,7,8} support to cmake (#1251)
- flags copied from Makefile
- updated comments in both CMakeLists.txt and Makefile to match reality
2023-04-30 20:48:38 +02:00
jon-chuang
a5d30b1f53 common : better default number of threads (#934)
* commit

* fix

* try-catch

* apply code review

* improve

* improve

* add macos headers

* done

* remove color

* fix windows

* minor

* fix

* Apply suggestions from code review

Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>

* remove

* minor

* minor

---------

Co-authored-by: jon-chuang <jon-chuang@users.noreply.github.com>
Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>
2023-04-30 21:41:35 +03:00
0cc4m
76a884920a ggml : add CLBlast q5_0, q5_1, q8_0 dequant kernels (#1225)
* Implement q5_0, q5_1 and q8_0

* Work around q5_0 OpenCL issue

* Fix q8_0 dequant kernel

* Move cl kernels into ggml-opencl.c

* Use two memcpy calls for q5_0 buffer transfer
2023-04-30 21:34:52 +03:00
Georgi Gerganov
6bc4400e67 ggml : add Q5 WASM SIMD + GGML_FTYPE 2023-04-30 19:07:43 +03:00
Stephan Walter
f0d70f147d Various fixes to mat_mul benchmark (#1253) 2023-04-30 12:32:37 +00:00
Georgi Gerganov
3e5aa8a1c4 ggml : fix labels for GGML_OP_ALIBI 2023-04-30 10:25:46 +03:00
Georgi Gerganov
c3ca7a5f05 ggml : fix 32-bit ARM NEON 2023-04-29 21:34:23 +03:00
Georgi Gerganov
e8c051611a ggml : use vzip instead of vuzp for consistency 2023-04-29 21:12:56 +03:00
Georgi Gerganov
0b5a935099 ggml : fix visibility and unused warnings 2023-04-29 19:28:36 +03:00
Georgi Gerganov
ec728e44d7 ggml : fix #if for f32_f32 mul_mat (CLBlast) (#1229) 2023-04-29 18:43:42 +03:00
Georgi Gerganov
214b6a3570 ggml : adjust mul_mat_f16 work memory (#1226)
* llama : minor - remove explicity int64_t cast

* ggml : reduce memory buffer for F16 mul_mat when not using cuBLAS

* ggml : add asserts to guard for incorrect wsize
2023-04-29 18:43:28 +03:00
Georgi Gerganov
305eb5afd5 build : fix reference to old llama_util.h 2023-04-29 13:53:12 +03:00
Georgi Gerganov
84ca9c2ecf examples : fix save-load-state + rename llama-util.h 2023-04-29 13:48:11 +03:00
Georgi Gerganov
334637e43e common : change default parameters to pre-#1126 (#1223) 2023-04-29 09:51:06 +03:00
Ivan Stepanov
dd7eff57d8 llama : new sampling algorithms (#1126)
* Sample interface, new samplers.

New samplers:
- locally typical sampling
- tail free sampling
- frequency and presence penalty
- mirostat

Ignore EOS fix: -inf should be used.

* mirostat

* Added --logit-bias and --no-penalize-nl, removed std::span

* Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)

Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)

* Save and load example adjust

* Tests

* Windows build fix

* Windows test fix
2023-04-29 08:34:41 +03:00
slaren
7fc50c051a cuBLAS: use host pinned memory and dequantize while copying (#1207)
* cuBLAS: dequantize simultaneously while copying memory

* cuBLAS: use host pinned memory

* cuBLAS: improve ggml_compute_forward_mul_mat_f16_f32 with pinned memory

* cuBLAS: also pin kv cache

* fix rebase
2023-04-29 02:04:18 +02:00
Henri Vasserman
b1ee8f59b4 cuBLAS: non-contiguous tensor support (#1215)
* Cuda: non-contiguous tensor support

* remove extra stuff

* rename

* fix error

* more fixes, now OpenBLAS and CLBlast build too

* now then?
2023-04-29 01:31:56 +02:00
Stephan Walter
36d19a603b Remove Q4_3 which is no better than Q5 (#1218) 2023-04-28 23:10:43 +00:00
Georgi Gerganov
7f15c5c477 readme : update hot topics 2023-04-28 21:32:52 +03:00
Georgi Gerganov
55390bcaf2 ggml : sync ggml (ggml_alibi) 2023-04-28 20:51:05 +03:00
CRD716
5fba3c016b examples : add Jeopardy example (#1168)
* Basic Setup

* Prevent Results.txt from coming up

* Prefixes, Line separators, etc

* editorcheck

* introduction to give more consistent results

* Basic graph thing

* Grading, ready for testing!

* Y'all ready to get funky?

* fix column removal stuff

* missed a few
2023-04-28 19:13:33 +03:00
Evan Jones
1481a9cf25 llama : add session file format and saved sessions in main (#1169) 2023-04-28 18:59:37 +03:00
Georgi Gerganov
11d902364b ggml : add helper debug printf in soft_max 2023-04-28 17:59:08 +03:00
0cc4m
7296c961d9 ggml : add CLBlast support (#1164)
* Allow use of OpenCL GPU-based BLAS using ClBlast instead of OpenBLAS for context processing

* Improve ClBlast implementation, avoid recreating buffers, remove redundant transfers

* Finish merge of ClBlast support

* Move CLBlast implementation to separate file

Add buffer reuse code (adapted from slaren's cuda implementation)

* Add q4_2 and q4_3 CLBlast support, improve code

* Double CLBlast speed by disabling OpenBLAS thread workaround

Co-authored-by: Concedo <39025047+LostRuins@users.noreply.github.com>
Co-authored-by: slaren <2141330+slaren@users.noreply.github.com>

* Fix device selection env variable names

* Fix cast in opencl kernels

* Add CLBlast to CMakeLists.txt

* Replace buffer pool with static buffers a, b, qb, c

Fix compile warnings

* Fix typos, use GGML_TYPE defines, improve code

* Improve btype dequant kernel selection code, add error if type is unsupported

* Improve code quality

* Move internal stuff out of header
* Use internal enums instead of CLBlast enums
* Remove leftover C++ includes and defines
* Make event use easier to read

Co-authored-by: Henri Vasserman <henv@hot.ee>

* Use c compiler for opencl files

* Simplify code, fix include

* First check error, then release event

* Make globals static, fix indentation

* Rename dequant kernels file to conform with other file names

* Fix import cl file name

---------

Co-authored-by: Concedo <39025047+LostRuins@users.noreply.github.com>
Co-authored-by: slaren <2141330+slaren@users.noreply.github.com>
Co-authored-by: Henri Vasserman <henv@hot.ee>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-04-28 17:57:16 +03:00
Folko-Ven
78ec543733 Correcting link to w64devkit (#1214)
Correcting link to w64devkit (change seeto to skeeto).
2023-04-28 16:22:48 +02:00
Johannes Gäßler
92a6e13a31 Add Manjaro CUDA include and lib dirs to Makefile (#1212) 2023-04-28 15:40:32 +02:00
Yann Follet
04aaae1d79 add avx2 for dot_q8_0_q8_0, 2x faster than scalar (#1211) 2023-04-28 11:59:48 +00:00
Stephan Walter
0b2da20538 ggml : slightly faster AVX2 implementation for Q5 (#1197) 2023-04-26 23:26:42 +03:00
Georgi Gerganov
f9be42add0 readme : add quantization info 2023-04-26 23:24:42 +03:00
Georgi Gerganov
574406dc7e ggml : add Q5_0 and Q5_1 quantization (#1187)
* ggml : add Q5_0 quantization (cuBLAS only)

* ggml : fix Q5_0 qh -> uint32_t

* ggml : fix q5_0 histogram stats

* ggml : q5_0 scalar dot product

* ggml : q5_0 ARM NEON dot

* ggml : q5_0 more efficient ARM NEON using uint64_t masks

* ggml : rename Q5_0 -> Q5_1

* ggml : adding Q5_0 mode

* quantize : add Q5_0 and Q5_1 to map

* ggml : AVX2 optimizations for Q5_0, Q5_1 (#1195)

---------

Co-authored-by: Stephan Walter <stephan@walter.name>
2023-04-26 23:14:13 +03:00
Ásgeir Bjarni Ingvarsson
87a6f846d3 Allow setting the rng seed after initialization. (#1184)
The llama_set_state_data function restores the rng state to what it
was at the time llama_copy_state_data was called. But users may want
to restore the state and proceed with a different seed.
2023-04-26 22:08:43 +02:00
DaniAndTheWeb
ea3ad7eb60 Updating build instructions to include BLAS support (#1183)
* Updated build information

First update to the build instructions to include BLAS.

* Update README.md

* Update information about BLAS

* Better BLAS explanation

Adding a clearer BLAS explanation and adding a link to download the CUDA toolkit.

* Better BLAS explanation

* BLAS for Mac

Specifying that BLAS is already supported on Macs using the Accelerate Framework.

* Clarify the effect of BLAS

* Windows Make instructions

Added the instructions to build with Make on Windows

* Fixing typo

* Fix trailing whitespace
2023-04-26 22:03:03 +02:00
Pavol Rusnak
859fee6dfb quantize : use map to assign quantization type from string (#1191)
instead of `int` (while `int` option still being supported)

This allows the following usage:

`./quantize ggml-model-f16.bin ggml-model-q4_0.bin q4_0`

instead of:

`./quantize ggml-model-f16.bin ggml-model-q4_0.bin 2`
2023-04-26 18:43:27 +02:00
Stephan Walter
4afcc37869 Update SHA256SUMS after quantization change (#1181)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io>
2023-04-25 23:41:56 +02:00
ostix360
667c501334 py : cast lora_alpha to int in convert-lora-to-ggml (#1170)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io>
2023-04-25 23:33:08 +02:00
Pavol Rusnak
bb98e77be7 nix: use convert.py instead of legacy wrapper convert-pth-to-ggml.py (#981) 2023-04-25 23:19:57 +02:00
Georgi Gerganov
7a32fcb3b2 ggml : add Q8_0 quantization format (rename the old one to Q8_1) (ARM NEON) (#1179)
* ggml : add Q8_0 quantization format (rename the old one to Q8_1)

* tests : fix test-quantize-fns

* ggml : finalize Q8_0 implementation

* ggml : use q4_0_q8_0 and q4_2_q8_0

* ggml : fix Q8_0 dot product bug (ARM)

* ggml : Q8_0 unroll x2

* ggml : fix bug - using wrong block type

* ggml : extend quantize_fns_t with "vec_dot_type"

* ggml : fix Q8_0 to use 255 values out of 256

* ggml : fix assert using wrong QK4_2 instead of QK4_3
2023-04-25 23:40:51 +03:00
unbounded
dd0eabc049 ggml : use full range for Q4_0 and Q4_2 quantization (#729)
* Use full range for q4_0 quantization

By keeping the sign of the highest magnitude, we can make sure the
highest value maps to -8, which is currently unused.
This is a bit of a freebie since it is fully backwards compatible with
the current format.

* Update quantize_row_q4_0 for AVX/AVX2

* Update quantize_row_q4_0 for WASM

Untested

* Update quantize_row_q4_0 for Arm NEON

* Update quantize_row_q4_0 for PowerPC

Untested

* Use full range for q4_2 quantization
2023-04-25 20:20:46 +03:00
xaedes
54bb60e268 ggml : fix bug in ggml_compute_forward_sum_f32 (#1162)
The sum over all rows is now computed instead of just the last row
2023-04-24 23:02:02 +02:00
Georgi Gerganov
8a0f8673ba ggml : export symbols (#1155) 2023-04-24 22:18:25 +03:00
xaedes
0c5692345d examples : add save_load_state example (#1150)
* add save_load_state example

* use <cstdio> instead of <iostream> and fprintf / printf instead of cout

* renamed save-load-state example files replacing underscores by dashes
2023-04-24 19:23:31 +03:00
Georgi Gerganov
957c8ae21d llama : increase scratch buffer size for 65B (ref #1152)
Temporary solution
2023-04-24 18:47:30 +03:00
mgroeber9110
9b0a4d4214 examples/main README improvements and some light refactoring (#1131) 2023-04-24 15:45:32 +00:00
Stephan Walter
2ec83428de Fix build for gcc 8 and test in CI (#1154) 2023-04-24 15:38:26 +00:00
slaren
e4cf982e0d Fix cuda compilation (#1128)
* Fix: Issue with CUBLAS compilation error due to missing -fPIC flag

---------

Co-authored-by: B1gM8c <89020353+B1gM8c@users.noreply.github.com>
2023-04-24 17:29:58 +02:00
Georgi Gerganov
c4fe84fb0d llama : refactor get / set state + remove redundant kv cache API (#1143) 2023-04-24 07:40:02 +03:00
slaren
1d78fecdab Fix LoRA acronym (#1145) 2023-04-23 23:03:44 +02:00
Georgi Gerganov
284685f169 scripts : add helper scripts to synch ggml repo 2023-04-23 19:57:09 +03:00
DannyDaemonic
edce63baa9 Added README.md for main with examples and explanations (#1139) 2023-04-23 15:37:02 +00:00
Georgi Gerganov
ec9cdb6752 ggml : do not print perf ops that have not been used at all 2023-04-23 18:32:52 +03:00
Georgi Gerganov
e4422e299c ggml : better PERF prints + support "LLAMA_PERF=1 make" 2023-04-23 18:15:39 +03:00
Stephan Walter
53c8434398 Improve AVX2 for vec_dot_q4_3_q8_0 (#1138) 2023-04-23 11:01:03 +00:00
Pavol Rusnak
c6524f46eb readme : update gpt4all instructions (#980) 2023-04-23 10:21:26 +02:00
Yishuo Wang
c9e2c26f41 A better packNibbles and mul_sum_i8_pairs_float implementation using AVX512 (#1119) 2023-04-23 07:57:05 +00:00
Georgi Gerganov
0e018fe008 ggml : fix Q4_3 cuBLAS 2023-04-22 16:32:07 +03:00
Stephan Walter
857308d1e8 ci : trigger CI for drafts, but not most PR actions (#1125) 2023-04-22 16:12:29 +03:00
Stephan Walter
c50b628810 Fix CI: ARM NEON, quantization unit tests, editorconfig (#1122) 2023-04-22 10:54:13 +00:00
unbounded
5f939498d5 ggml : unit test for quantization functions (#953)
* Unit test for quantization functions

Use the ggml_internal_get_quantize_fn function to loop through all
quantization formats and run a sanity check on the result.

Also add a microbenchmark that times these functions directly without
running the rest of the GGML graph.

* test-quantize-fns: CI fixes

Fix issues uncovered in CI
 - need to use sizes divisible by 32*8 for loop unrolling
 - use intrinsic header that should work on Mac

* test-quantize: remove

Per PR comment, subsumed by test-quantize-fns

* test-quantize: fix for q8_0 intermediates
2023-04-22 12:10:39 +03:00
wbpxre150
36b4f7e064 llama : print timings on ctrl+c exit (#1021)
* print timings on ctrl+c exit

* remove redundant free memory call.

* add global pointer to ctx.
2023-04-22 11:56:35 +03:00
eiery
10f19c1121 llama : have n_batch default to 512 (#1091)
* set default n_batch to 512 when using BLAS

* spacing

* alternate implementation of setting different n_batch for BLAS

* set n_batch to 512 for all cases
2023-04-22 11:27:05 +03:00
Howard Su
7e312f165c cmake : fix build under Windows when enable BUILD_SHARED_LIBS (#1100)
* Fix build under Windows when enable BUILD_SHARED_LIBS

* Make AVX512 test on Windows to build the shared libs
2023-04-22 11:18:20 +03:00
Georgi Gerganov
872c365a91 ggml : fix AVX build + update to new Q8_0 format 2023-04-22 11:08:12 +03:00
Georgi Gerganov
955ef9a5d5 ggml : alternative Q4_3 implementation using modified Q8_0 (#1109)
* ggml : prefer vzip to vuzp

This way we always use the same type of instruction across all quantizations

* ggml : alternative Q4_3 implementation using modified Q8_0

* ggml : fix Q4_3 scalar imlpementation

* ggml : slight improvement of Q4_3 - no need for loop unrolling

* ggml : fix AVX paths for Q8_0 quantization
2023-04-22 10:55:35 +03:00
Stephan Walter
c5aa5e5777 ggml : AVX2 optimization for vec_dot_q4_3_q8_0 and refactoring (#1099)
* AVX2 optimization for vec_dot_q4_3_q8_0 and refactoring

* finish AVX vectorization of quantize_row_q8_0

* Rename hsum_int_8 to hsum_i32_8
2023-04-22 10:37:05 +03:00
Clint Herron
e9a9cb0c54 examples : Improve Alpaca Default Repeat Penalty: Better Match Alpaca.cpp Experience (#1107)
* Moving parameters to separate lines for readability.

* Increasing repeate_penalty to 1.1 to make alpaca more usable by default.

* Adding trailing newline.
2023-04-22 09:54:33 +03:00
xaedes
b6e7f9b09e llama : add api for getting/setting the complete state: rng, logits, embedding and kv_cache (#1105)
* reserve correct size for logits

* add functions to get and set the whole llama state:

including rng, logits, embedding and kv_cache

* remove unused variables

* remove trailing whitespace

* fix comment
2023-04-22 09:21:32 +03:00
slaren
50cb666b8a Improve cuBLAS performance by using a memory pool (#1094)
* Improve cuBLAS performance by using a memory pool

* Move cuda specific definitions to ggml-cuda.h/cu

* Add CXX flags to nvcc

* Change memory pool synchronization mechanism to a spin lock
General code cleanup
2023-04-21 21:59:17 +02:00
apaz
25d7abbd1f llama : fixed rlimit error message (#888) 2023-04-21 21:48:06 +03:00
源文雨
018f2279f5 cmake : link threads publicly to ggml (#1042)
* fix: ld link test-tokenizer-0 error

```
cmake3 --build . --config Release
[  5%] Built target ggml
[ 16%] Built target llama
[ 22%] Linking CXX executable ../bin/test-tokenizer-0
../libllama.a(ggml.c.o):在函数‘ggml_graph_compute’中:
ggml.c:(.text+0xf2db):对‘pthread_create’未定义的引用
ggml.c:(.text+0xf9d4):对‘pthread_join’未定义的引用
collect2: error: ld returned 1 exit status
gmake[2]: *** [bin/test-tokenizer-0] 错误 1
gmake[1]: *** [tests/CMakeFiles/test-tokenizer-0.dir/all] 错误 2
gmake: *** [all] 错误 2
```

* Update CMakeLists.txt

* Update CMakeLists.txt

* Update CMakeLists.txt
2023-04-21 21:27:06 +03:00
Alex Klinkhamer
9411288271 main : evaluate tokens in batches after swapping context (#1014)
* examples : evaluate tokens in batches after swapping context

* Update examples/main/main.cpp

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-04-21 21:18:09 +03:00
xaedes
8687c1f258 llama : remember and restore kv cache data pointers (#1104)
because their value is stored in buf and overwritten by memcpy
2023-04-21 18:25:21 +03:00
Kawrakow
1bfc153e2f ggml : a faster version for Q4_1 x Q8_0 dot products (#1083)
* A faster version for Q4_1 x Q8_0 dot products

The idea nehind being that Q8_0 quantized
values get used many times in the matrix multiplications
where they are involved. In the current implementations,
when we are evaluating the dot products, we need to compute
the sum of the quants in the Q8_0 vector, so the same
operation is repeated many times. Here we pre-compute
the sum during Q8_0 quantization, store it in the
now modified block_q8_0 struct, and then reuse this
result in the subsequent dot products.

In a synthetic benchmark (just compute a bunch of dot
products), this change speeds up the Q4_1 * Q8_0 dot
product by 80%, making the performance identical to
Q4_0 * Q8_0.

In practical application, I see a ~15% gain in speed for
token prediction on M2, and ~5% gain on Ryzen 7950X.
The speed gain in the prompt evaluation is much bigger
(around 50%).

I have only done the change for the scalar version,
ARM_NEON, and AVX2, so we still need an AVX implementation.

* Cleaning up

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2023-04-21 18:18:26 +03:00
slaren
3d59769c3b Show perplexity ETA in hours and minutes (#1096) 2023-04-21 14:57:57 +02:00
Georgi Gerganov
d40fded93e llama : fix comment for "output.weight" tensor 2023-04-21 10:24:02 +03:00
Stephan Walter
2510c1831f Add ggml-model-*.bin checksums for 7B, 13B, 30B, 65B (#1088)
* Add ggml-model-*.bin checksums for 7B, 13B, 30B
* Add ggml-model-*.bin checksums for 65B

---------

Co-authored-by: Pavol Rusnak <pavol@rusnak.io>
2023-04-20 23:56:44 +02:00
Georgi Gerganov
12b5900dbc ggml : sync ggml (add GPT-NeoX RoPE implementation) 2023-04-20 23:32:59 +03:00
Georgi Gerganov
9ff334f3c9 ggml : fix bug in ggml_compute_forward_dup_f32() 2023-04-20 21:58:38 +03:00
slaren
2005469ea1 Add Q4_3 support to cuBLAS (#1086) 2023-04-20 20:49:53 +02:00
Georgi Gerganov
8a1756abdf ggml : do not break cuBLAS build (Q4_3 is not yet implemented) 2023-04-20 21:43:50 +03:00
Georgi Gerganov
66aab46079 ggml : fix Q4_3 quantization
Broke it during conflict resolution in last PR
2023-04-20 20:44:05 +03:00
Kawrakow
38de86a711 llama : multi-threaded quantization (#1075)
* Multi-threading quantization.

Not much gain for simple quantizations, bit it will be important
for quantizations that require more CPU cycles.

* Multi-threading for quantize-stats

It now does the job in ~14 seconds on my Mac for
Q4_0, Q4_1 and Q4_2. Single-threaded it was taking
more than 2 minutes after adding the more elaborate
version of Q4_2.

* Reviewer comments

* Avoiding compiler confusion

After changing chunk_size to const int as suggested by
@ggerganov, clang and GCC starting to warn me that I don't
need to capture it in the lambda. So, I removed it from the
capture list. But that makes the MSVC build fail. So,
making it a constexpr to make every compiler happy.

* Still fighting with lambda captures in MSVC

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-04-20 20:42:27 +03:00
Georgi Gerganov
e0305ead3a ggml : add Q4_3 quantization (#1082) 2023-04-20 20:35:53 +03:00
Ivan Komarov
6a9661ea5a ci : remove the LLAMA_ACCELERATE matrix dimension from Ubuntu builds in the CI (#1074)
[Accelerate](https://developer.apple.com/documentation/accelerate) is an Apple framework which can only be used on macOS, and the CMake build [ignores](https://github.com/ggerganov/llama.cpp/blob/master/CMakeLists.txt#L102) the `LLAMA_ACCELERATE` variable when run on non-Apple platforms. This implies setting `LLAMA_ACCELERATE` is a no-op on Ubuntu and can be removed.

This will reduce visual noise in CI check results (in addition to reducing the number of checks we have to run for every PR). Right now every sanitized build is duplicated twice for no good reason (e.g., we have `CI / ubuntu-latest-cmake-sanitizer (ADDRESS, Debug, ON)` and `CI / ubuntu-latest-cmake-sanitizer (ADDRESS, Debug, OFF)`).
2023-04-20 18:15:18 +03:00
源文雨
5addcb120c fix: LLAMA_CUBLAS=1 undefined reference 'shm_open' (#1080) 2023-04-20 15:28:43 +02:00
Stephan Walter
c8c2c52482 AVX2 optimization for vec_dot_q4_2_q8_0 (#1068) 2023-04-20 08:45:41 +02:00
slaren
02d6988121 Improve cuBLAS performance by dequantizing on the GPU (#1065) 2023-04-20 03:14:14 +02:00
CRD716
834695fe3a Minor: Readme fixed grammar, spelling, and misc updates (#1071) 2023-04-19 19:52:14 +00:00
Kawrakow
f7d05095b4 Q4_2 quantization with rmse-optimized scale and quants (#1062)
* Q4_2 quantization with rmse-optimized scale and quants

For quantize-stats we get
q4_2: rmse 0.00159301, maxerr 0.17480469, 95pct<0.0030, median<0.0012

For 7B perplexity with BLAS enabled we get 6.2038 after 655 chunks.

Quantization is slow (~90 seconds on my Mac for 7B) as not
multi-threaded as in PR #896.

* ggml : satisfy the sanitizer builds

Not sure why this makes them fail

* Better follow ggml conventions for function names

* Fixed type as per reviewer comment

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-04-19 20:20:14 +02:00
Georgi Gerganov
884e7d7a2b ggml : use 8-bit precision for Q4_1 intermediate results (#1047)
* ggml : use 8-bit precision for Q4_1 intermediate results (ARM)

* ggml : optimize ggml_vec_dot_q4_1_q8_0() via vmalq_n_f32

56 ms/token with Q4_1 !

* ggml : AVX2 implementation of ggml_vec_dot_q4_1_q8_0 (#1051)

* gitignore : ignore ppl-*.txt files

---------

Co-authored-by: slaren <2141330+slaren@users.noreply.github.com>
2023-04-19 20:10:08 +03:00
Georgi Gerganov
7cd5c4a3e9 readme : add warning about Q4_2 and Q4_3 2023-04-19 19:07:54 +03:00
Stephan Walter
f3d4edf504 ggml : Q4 cleanup - remove 4-bit dot product code (#1061)
* Q4 cleanup

* Remove unused AVX512 Q4_0 code
2023-04-19 19:06:37 +03:00
slaren
8944a13296 Add NVIDIA cuBLAS support (#1044) 2023-04-19 11:22:45 +02:00
slaren
6667401238 Multi-threaded ggml_cpy (#1035)
* Multi-threaded ggml_cpy

* Update ggml.c

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Also fix wdata offset in ggml_compute_forward_add_q_f32

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-04-19 00:53:24 +02:00
Georgi Gerganov
77a73403ca ggml : add new Q4_2 quantization (ARM only) (#1046)
* ggml : Q4_2 ARM

* ggml : add ggml_is_quantized()

* llama : update llama_type_name() with Q4_2 entry

* ggml : speed-up q4_2

- 4 threads: ~100ms -> ~90ms
- 8 threads:  ~55ms -> ~50ms

* ggml : optimize q4_2 using vmlaq_n_f32 + vmulq_n_f32
2023-04-18 23:54:57 +03:00
Georgi Gerganov
50a8a2af97 ggml : scratch that - vmlaq_n_f32 is always better
Had a background process that was messing with the timings
2023-04-18 23:11:23 +03:00
Georgi Gerganov
4caebf6d40 gitignore : vdot 2023-04-18 23:00:08 +03:00
Georgi Gerganov
dcdd65e296 ggml : optimize ggml_vec_dot_q4_0_q8_0() using vectorized accumulators 2023-04-18 22:59:17 +03:00
Kawrakow
5ecff35151 Adding a simple program to measure speed of dot products (#1041)
On my Mac, the direct Q4_1 product is marginally slower
(~69 vs ~55 us for Q4_0). The SIMD-ified ggml version
is now almost 2X slower (~121 us).

On a Ryzen 7950X CPU, the direct product for Q4_1 quantization
is faster than the AVX2 implementation (~60 vs ~62 us).

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2023-04-18 19:00:14 +00:00
Georgi Gerganov
7faa7460f0 readme : update hot topics about new LoRA functionality 2023-04-18 20:10:26 +03:00
Georgi Gerganov
5af8e32238 ci : do not run on drafts 2023-04-18 19:57:06 +03:00
Ivan Komarov
42747220b4 Do not close file after mmap (Windows version) (#1034) 2023-04-18 03:15:50 +02:00
Atsushi Tatsuma
e9298af389 readme : add Ruby bindings (#1029) 2023-04-17 22:34:35 +03:00
Cameron
4ad73137a1 add 4_0 to default outfile namestr dict (#1031)
this came up when trying to convert the gpt4all-lora-unfiltered-quantized.bin file
2023-04-17 20:26:23 +02:00
slaren
315a95a4d3 Add LoRA support (#820) 2023-04-17 17:28:55 +02:00
Arik Poznanski
efd05648c8 llama : well-defined static initialization of complex objects (#927)
* Replaced static initialization of complex objects with a initialization on first use. This prevents an undefined behavior on program run, for example, crash in Release build, works in Debug build

* replaced use of auto with exact type to avoid using -std=c++14

* Made the assessors functions for static maps be static const
2023-04-17 17:41:53 +03:00
Georgi Gerganov
eb17a026fd quantize-stats : fix bug in --type argument 2023-04-17 17:31:06 +03:00
Georgi Gerganov
69b740289f ggml : avoid using ggml_fp16_to_fp32() and ggml_fp32_to_fp16() in ggml.c 2023-04-17 16:16:23 +03:00
Ivan Komarov
f266259ad9 Speedup the AVX-512 implementation of ggml_vec_dot_q4_0() (#933) 2023-04-17 15:10:57 +02:00
slaren
47f61aaa5f Fix: do not close file on mmap (#1017) 2023-04-16 21:27:38 +02:00
Georgi Gerganov
3173a62eb9 stdout : vertical align outputs for better readibility 2023-04-16 13:59:27 +03:00
Pavol Rusnak
489537e6cf examples: add missing <ctime> include for time() (#1011) 2023-04-16 10:13:00 +00:00
nanahi
2d3481c721 Fix msys2 build error and warnings (#1009) 2023-04-16 11:13:42 +02:00
comex
74f5899df4 convert.py: Fix loading safetensors and ggml format on Windows (#991)
Calling `mmap.mmap` on Windows apparently resets the file offset of the
raw file object (and makes the BufferedReader return a *negative* file
offset).  For safetensors, avoid using the file offset after calling
mmap.  For GGML format, explicitly save and restore the offset.

Fixes #966.
2023-04-15 23:53:21 +02:00
Stephan Walter
2f7c8e014e Fix potential int8 overflow in non-SIMD vec_dot (#986) 2023-04-15 18:28:56 +00:00
Stephan Walter
0ad964631f Refactor ggml.c for future tensor types (#1001) 2023-04-15 16:25:38 +00:00
Georgi Gerganov
e95b6554b4 ggml : add Q8_0 quantization for intermediate results (#951)
* ggml : add Q8_0 quantization for intermediate results

* quantize-stats : fix test + add it to Makefile default

* Q8: use int8_t, AVX/AVX2 optimizations

* ggml : fix quantize_row_q8_0() ARM_NEON rounding

* minor : updates after rebase to latest master

* quantize-stats : delete obsolete strings

* ggml : fix q4_1 dot func

---------

Co-authored-by: Stephan Walter <stephan@walter.name>
2023-04-15 17:53:22 +03:00
Georgi Gerganov
aa485cee33 ggml : use posix_memalign on non-Windows env 2023-04-15 14:25:45 +03:00
Ivan Komarov
c12b14b77f benchmark : fix result validation in benchmark-q4_0-matmult (#987) 2023-04-15 08:51:54 +03:00
katsu560
106faaf297 cmake : add finding the OpenBLAS header file (#992) 2023-04-15 08:51:11 +03:00
Pavol Rusnak
c85e03d12e Revert "main : alternative instruct mode (Vicuna support, etc.) (#863)" (#982)
This reverts commit f4d277ae17.
2023-04-14 22:58:43 +03:00
Pavol Rusnak
489093548c py : bump sentencepiece to 0.1.98 to support Python 3.11 (#976) 2023-04-14 19:46:49 +00:00
Stephan Walter
93265e988a make : fix dependencies, use auto variables (#983) 2023-04-14 22:39:48 +03:00
Pavol Rusnak
c56b715269 Expose type name from ggml (#970)
Avoid duplication of type names in utils

Co-authored-by: Håkon H. Hitland <haakon@likedan.net>
2023-04-14 20:05:37 +02:00
Tomáš Pazdiora
f4d277ae17 main : alternative instruct mode (Vicuna support, etc.) (#863)
* Add support for configs, add configurable prefixes / suffixes, deprecate instruct mode, add stop prompt

* Add multiline mode, update text input.

* bugfix

* update implementation

* typos

* Change --multiline implementation to be toggled by EOF.

* bugfix

* default multiline mode

* add more configs

* update formating

* update formatting

* apply suggestions
2023-04-14 18:19:17 +03:00
Kerfuffle
c9a59b70a5 ggml : add unary and binary map operations (#874)
* GGML map ops proof of concept.

* Various cleanups.

Add handling for task setting.

Add handling for ggml_compute_backward.

Rename functions to ggml_map_unary_f32 and ggml_map_binary_f32

Fix compiler warnings related to casting function pointers and `void *`

Reorder functions and definitions based on the GGML op number.

Use typedefs for map op function pointer types.

* Fix position of map ops cases in ggml_compute_forward
2023-04-14 17:43:55 +03:00
Pavol Rusnak
a32f7acc9f py : cleanup dependencies (#962)
after #545 we do not need torch, tqdm and requests in the dependencies
2023-04-14 15:37:11 +02:00
Pavol Rusnak
43ffdefb74 py : fix flake8 and isort nitpicks (#960) 2023-04-14 14:23:21 +02:00
Georgi Gerganov
1623a6e9b4 ggml : minor 2023-04-14 13:31:29 +03:00
Georgi Gerganov
c14e0d2f23 ggml : always allocate buffers with size multiple of GGML_MEM_ALIGN 2023-04-14 13:31:15 +03:00
comex
723dac55fa py : new conversion script (#545)
Current status: Working, except for the latest GPTQ-for-LLaMa format
  that includes `g_idx`.  This turns out to require changes to GGML, so
  for now it only works if you use the `--outtype` option to dequantize it
  back to f16 (which is pointless except for debugging).

  I also included some cleanup for the C++ code.

  This script is meant to replace all the existing conversion scripts
  (including the ones that convert from older GGML formats), while also
  adding support for some new formats.  Specifically, I've tested with:

  - [x] `LLaMA` (original)
  - [x] `llama-65b-4bit`
  - [x] `alpaca-native`
  - [x] `alpaca-native-4bit`
  - [x] LLaMA converted to 'transformers' format using
        `convert_llama_weights_to_hf.py`
  - [x] `alpaca-native` quantized with `--true-sequential --act-order
        --groupsize 128` (dequantized only)
  - [x] same as above plus `--save_safetensors`
  - [x] GPT4All
  - [x] stock unversioned ggml
  - [x] ggmh

  There's enough overlap in the logic needed to handle these different
  cases that it seemed best to move to a single script.

  I haven't tried this with Alpaca-LoRA because I don't know where to find
  it.

  Useful features:

  - Uses multiple threads for a speedup in some cases (though the Python
    GIL limits the gain, and sometimes it's disk-bound anyway).

  - Combines split models into a single file (both the intra-tensor split
    of the original and the inter-tensor split of 'transformers' format
    files).  Single files are more convenient to work with and more
    friendly to future changes to use memory mapping on the C++ side.  To
    accomplish this without increasing memory requirements, it has some
    custom loading code which avoids loading whole input files into memory
    at once.

  - Because of the custom loading code, it no longer depends in PyTorch,
    which might make installing dependencies slightly easier or faster...
    although it still depends on NumPy and sentencepiece, so I don't know
    if there's any meaningful difference.  In any case, I also added a
    requirements.txt file to lock the dependency versions in case of any
    future breaking changes.

  - Type annotations checked with mypy.

  - Some attempts to be extra user-friendly:

      - The script tries to be forgiving with arguments, e.g. you can
        specify either the model file itself or the directory containing
        it.

      - The script doesn't depend on config.json / params.json, just in
        case the user downloaded files individually and doesn't have those
        handy.  But you still need tokenizer.model and, for Alpaca,
        added_tokens.json.

      - The script tries to give a helpful error message if
        added_tokens.json is missing.
2023-04-14 10:03:03 +03:00
Georgi Gerganov
0f07cacb05 ggml : fix q4_1 dot product types 2023-04-14 09:45:42 +03:00
Howard Su
c5d70f5c9e ggml : optimize rope function to avoid call powf in the tight loop (#807) 2023-04-14 09:24:52 +03:00
Gary Linscott
be87b6ed20 perplexity : add support for batch size to --perplexity (#407)
* Add support to batch size for perplexity

* Revert "Fix memory allocation issues and seg faults"

This reverts commit 4870e455b3.

* update from merge

* Remove perplexity from main

* updates

* Update batch size for efficiency
2023-04-14 00:50:42 +03:00
CRD716
0e07e6a839 common : remove unnecessary includes (#947) 2023-04-13 18:39:25 +03:00
Georgi Gerganov
a3a2a0eda8 ggml : add GGML_DEFAULT_N_THREADS 2023-04-13 18:36:48 +03:00
Georgi Gerganov
d990e3fffc ggml : speed-up ggml_vec_dot_q4_1() ARM_NEON + 32-bit ARM support (#900)
* ggml : speed-up q4_1 ARM_NEON by ~5%

* ggml : implement vaddvq when missing

* ggml : implement vminvq and vmaxvq when missing

* ggml : implement vzip when missing

* ggml : fix comment

* ggml : try to use correct ifdef
2023-04-13 18:32:36 +03:00
Georgi Gerganov
9190e8eac8 llama : merge llama_internal.h into llama.h
Hide it behind an #ifdef
2023-04-13 18:04:45 +03:00
Georgi Gerganov
c85980acd0 gitignore : benchmark 2023-04-13 18:01:33 +03:00
Stephan Walter
6232f2d7fd ggml : optimize non-SIMD Q4_0 vector dot product (#703) 2023-04-13 17:59:50 +03:00
Pavol Rusnak
6c248707f5 ggml : introduce GGML_ALIGNED_MALLOC/GGML_ALIGNED_FREE macros (#884)
which allows us to use aligned_alloc or _aligned_malloc functions
2023-04-13 17:08:32 +03:00
CRD716
8cda5c981d fix whitespace (#944) 2023-04-13 16:03:57 +02:00
CRD716
ec29272175 readme : remove python 3.10 warning (#929) 2023-04-13 16:59:53 +03:00
Genkagaku.GPT
7e941b95eb readme : llama node binding (#911)
* chore: add nodejs binding

* chore: add nodejs binding
2023-04-13 16:54:27 +03:00
Pavol Rusnak
c729ff730a flake.nix: add all binaries from bin (#848) 2023-04-13 15:49:05 +02:00
Judd
4579af95e8 zig : update build.zig (#872)
* update

* update readme

* minimize the changes.

---------

Co-authored-by: zjli2019 <zhengji.li@ingchips.com>
2023-04-13 16:43:22 +03:00
Vladimir
8c3ffc2f04 ggml : update cblas_sgemm columns var to be more reasonable (#838) 2023-04-13 16:24:30 +03:00
niansa/tuxifan
107980d970 examples : add -n to alpaca and gpt4all scripts (#706) 2023-04-13 16:03:39 +03:00
anzz1
585d91a156 cmake : add explicit F16C option (x86) (#576)
Fixes building for x86 processors missing F16C featureset
MSVC not included, as in MSVC F16C is implied with AVX2/AVX512
2023-04-13 15:48:21 +03:00
SebastianApel
95ea26f6e9 benchmark : add tool for timing q4_0 matrix multiplication (#653)
* Initial version of q4_0 matrix multiplication benchmark

* Bugfix: Added dependency to ggml.o to benchmark

* Reviewer requests: added parameter for threads, switched to ggml_time_us()

* Reviewer input: removed rtsc, use epsilon for check

* Review comment: Removed set_locale

* Feature: Param for numer of iterations, Bugfix for use of parameter threads

* Reviewer suggestion: Moved to examples

* Reviewer feedback: Updated clean: and benchmark: sections

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-04-13 15:46:23 +03:00
Pavol Rusnak
82d146df9b do not force the prompt file to end with a new line (#908) 2023-04-13 11:33:16 +02:00
Stephan Walter
e7f6997f89 Don't crash on ftype (formerly f16) == 4 (#917) 2023-04-12 15:06:16 +00:00
Georgi Gerganov
f76cb3a34d readme : change "GPU support" link to discussion 2023-04-12 14:48:57 +03:00
Georgi Gerganov
782438070f readme : update hot topics with link to "GPU support" issue 2023-04-12 14:31:12 +03:00
Nicolai Weitkemper
4dbbd40750 readme: link to sha256sums file (#902)
This is to emphasize that these do not need to be obtained from elsewhere.
2023-04-12 08:46:20 +02:00
Pavol Rusnak
8b679987cd Fix whitespace, add .editorconfig, add GitHub workflow (#883) 2023-04-11 19:45:44 +00:00
Stephan Walter
3e6e70d8e8 Add enum llama_ftype, sync ggml_type to model files (#709) 2023-04-11 15:03:51 +00:00
comex
2663d2c678 Windows fixes (#890)
Mostly for msys2 and mingw64 builds, which are different from each other
and different from standard Visual Studio builds.  Isn't Windows fun?

- Define _GNU_SOURCE in more files (it's already used in ggml.c for
  Linux's sake).

- Don't use PrefetchVirtualMemory if not building for Windows 8 or later
  (mingw64 doesn't by default).  But warn the user about this situation
  since it's probably not intended.

- Check for NOMINMAX already being defined, which it is on mingw64.

- Actually use the `increment` variable (bug in my `pizza` PR).

- Suppress unused variable warnings in the fake pthread_create and
  pthread_join implementations for Windows.

- (not Windows-related) Remove mention of `asprintf` from comment;
  `asprintf` is no longer used.

Fixes #871.
2023-04-11 15:19:54 +02:00
qouoq
a0caa34b16 Add BAIR's Koala to supported models (#877) 2023-04-10 22:41:53 +02:00
Georgi Gerganov
461ba9e66e ggml : fix WASM build 2023-04-10 23:20:01 +03:00
Georgi Gerganov
c3ac702e5e ggml : add ggml_cont() + optimize ggml_cpy() for contiguous dst 2023-04-10 22:42:28 +03:00
Georgi Gerganov
9d634ef452 ggml : remove trailing whitespaces 2023-04-10 22:42:28 +03:00
Marco Matthies
d9a239c410 Simplify to include lower-case windows.h always, fix compile on mingw32 (#747) 2023-04-10 19:57:59 +02:00
Georgi Gerganov
684da25926 ggml : fix quantize_row_q4_1() ARM_NEON (close #876) 2023-04-10 19:29:48 +03:00
comex
180b693a47 Print model version.
Also improve model type printing, and fix indentation of an unrelated
switch statement.
2023-04-10 01:10:46 +02:00
comex
f963b63afa Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt).  (However, I didn't
  include the hack needed to support GPT4All files without conversion.
  Those can still be used after converting them with convert.py from my
  other PR.)

- Support both mmap and read (mmap is used by default, but can be
  disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
  files or on platforms where mmap is not supported).

- Support multi-file models like before, but automatically determine the
  number of parts rather than requiring `--n_parts`.

- Improve validation and error checking.

- Stop using the per-file type field (f16) entirely in favor of just
  relying on the per-tensor type/size fields.  This has no immediate
  benefit, but makes it easier to experiment with different formats, and
  should make it easier to support the new GPTQ-for-LLaMa models in the
  future (I have some work in progress on that front).

- Support VirtualLock on Windows (using the same `--mlock` option as on
  Unix).

    - Indicate loading progress when using mmap + mlock.  (Which led me
      to the interesting observation that on my Linux machine, with a
      warm file cache, mlock actually takes some time, whereas mmap
      without mlock starts almost instantly...)

      - To help implement this, move mlock support from ggml to the
        loading code.

- madvise/PrefetchVirtualMemory support (based on #740)

- Switch from ifstream to the `fopen` family of functions to avoid
  unnecessary copying and, when mmap is enabled, allow reusing the same
  file descriptor for both metadata reads and mmap (whereas the existing
  implementation opens the file a second time to mmap).

- Quantization now produces a single-file output even with multi-file
  inputs (not really a feature as much as 'it was easier this way').

Implementation notes:

I tried to factor the code into more discrete pieces than before.

Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:

- Destructors to make it easier to ensure everything gets cleaned up.

- Exceptions.  I don't even usually use exceptions when writing C++, and
  I can remove them if desired... but here they make the loading code
  much more succinct while still properly handling a variety of errors,
  ranging from API calls failing to integer overflow and allocation
  failure.  The exceptions are converted to error codes at the
  API boundary.)

Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-10 01:10:46 +02:00
Tomáš Pazdiora
aaf3b23deb fix for windows utf-8 input (#840)
Use UTF-16 as input on Windows, since UTF-8 does not work and reads multibyte characters as zeros
2023-04-08 17:49:39 +02:00
eiery
f2d1c47294 cmake should link openblas properly with -lopenblas like how it's done in the makefile (#839) 2023-04-08 11:15:17 +00:00
lon
317fb12fbd Add new binaries to flake.nix (#847) 2023-04-08 12:04:23 +02:00
unbounded
62cfc54f77 Add quantize-stats command for testing quantization (#728)
Command that calculates some statistics over the errors introduced by
quantization, like mean square error, max error and some percentile errors for layer
weights. Should be useful for testing quantization improvements.

Exposes some internal state from ggml and llama for testing
2023-04-08 00:09:18 +02:00
bhubbb
698f7b5d63 make : add libllama.so target for llama-cpp-python (#797)
I was able to get llama-cpp-python working but only when I build libllama.so with make.
2023-04-07 19:11:58 +03:00
iacore
c1950c3431 zig : don't link examples/common.cpp for non-example (#814) 2023-04-07 19:05:29 +03:00
Ivan Stepanov
4953e9007f llama : always sort logits before nucleus sampling (#812)
* Always sort logits before nucleus sampling

* remove second normalization

- fix windows build
- remove normalization since std::discrete_distribution does not require it
2023-04-07 19:02:12 +03:00
Sergey Alirzaev
cc9cee8e9e Do not crash when it has nothing to say. (#796)
Otherwise observing this in the interactive mode:
/usr/lib/gcc/x86_64-pc-linux-gnu/12/include/g++-v12/bits/stl_vector.h:1230: reference std::vector<int>::back() [_Tp = int, _Alloc = std::allocator<int>]: Assertion '!this->empty()' failed.
2023-04-06 17:59:11 +02:00
Pavol Rusnak
d2beca95dc Make docker instructions more explicit (#785) 2023-04-06 08:56:58 +02:00
Georgi Gerganov
eeaa7b0492 ggml : multi-thread ggml_rope() (~3-4 times faster on M1) (#781) 2023-04-05 22:11:03 +03:00
Georgi Gerganov
986b6ce9f9 ggml, llama : avoid heavy V transpose + improvements (#775)
ggml :

- added ggml_view_3d()
- ggml_view_tensor() now inherits the stride too
- reimplement ggml_cpy() to account for dst stride
- no longer require tensor->data to be memory aligned

llama :

- compute RoPE on 32-bit tensors (should be more accurate)
- store RoPE-ed K in the KV cache
- store transposed V in the KV cache (significant speed-up)
- avoid unnecessary Q copy
2023-04-05 22:07:33 +03:00
Georgi Gerganov
3416298929 Update README.md 2023-04-05 19:54:30 +03:00
Ivan Stepanov
5a8c4f6240 llama : define non-positive top_k; top_k range check (#779)
* Define non-positive top_k; top_k range check

* minor : brackets

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-04-05 19:20:05 +03:00
at8u
ff05d05c96 miku.sh : add executable bit (#780) 2023-04-05 18:59:13 +03:00
Georgi Gerganov
62b3e81aae media : add logos and banners 2023-04-05 18:58:31 +03:00
Georgi Gerganov
8d10406d6e readme : change logo + add bindings + add uis + add wiki 2023-04-05 18:56:20 +03:00
iacore
ed1c214e66 zig : add build.zig (#773)
Co-authored-by: Locria Cyber <74560659+locriacyber@users.noreply.github.com>
2023-04-05 18:06:02 +03:00
Ivan Stepanov
0c44427df1 make : missing host optimizations in CXXFLAGS (#763) 2023-04-05 17:38:37 +03:00
Adithya Balaji
594cc95fab readme : update with CMake and windows example (#748)
* README: Update with CMake and windows example

* README: update with code-review for cmake build
2023-04-05 17:36:12 +03:00
at8u
88ed5761b8 examples : add Miku.sh (#724)
* Add Miku.sh to examples

* Add missing line to prompt in Miku.sh

* Add --keep param to Miku.sh

* Remove '[end_of_conversation]' line from Miku.sh

No longer is necessary.
2023-04-05 17:32:42 +03:00
Andrew Duffy
58c438cf7d Add Accelerate/BLAS when using Swift (#765) 2023-04-05 06:44:24 -04:00
mgroeber9110
53dbba7695 Windows: reactive sigint handler after each Ctrl-C (#736) 2023-04-03 18:00:55 +02:00
SebastianApel
437e77855a 10+% performance improvement of ggml_vec_dot_q4_0 on AVX2 (#654)
* Performance improvement of AVX2 code
* Fixed problem with MSVC compiler
* Reviewer comments: removed double semicolon, deleted empty line 1962
2023-04-03 09:52:28 +02:00
Ivan Stepanov
cd7fa95690 Define non-positive temperature behavior (#720) 2023-04-03 02:19:04 +02:00
bsilvereagle
a0c0516416 Remove torch GPU dependencies from the Docker.full image (#665)
By using `pip install torch --index-url https://download.pytorch.org/whl/cpu`
instead of `pip install torch` we can specify we want to install a CPU-only version
of PyTorch without any GPU dependencies. This reduces the size of the Docker image
from 7.32 GB to 1.62 GB
2023-04-03 00:13:03 +02:00
Thatcher Chamberlin
d8d4e865cd Add a missing step to the gpt4all instructions (#690)
`migrate-ggml-2023-03-30-pr613.py` is needed to get gpt4all running.
2023-04-02 12:48:57 +02:00
Christian Falch
e986f94829 Added api for getting/setting the kv_cache (#685)
The api provides access methods for retrieving the current memory buffer for the kv_cache and its token number.
It also contains a method for setting the kv_cache from a memory buffer.

This makes it possible to load/save history - maybe support --cache-prompt paramater as well?

Co-authored-by: Pavol Rusnak <pavol@rusnak.io>
2023-04-02 12:23:04 +02:00
Marian Cepok
c0bb1d3ce2 ggml : change ne to int64_t (#626) 2023-04-02 13:21:31 +03:00
Leonardo Neumann
6e7801d08d examples : add gpt4all script (#658) 2023-04-02 10:56:20 +03:00
Stephan Walter
81040f10aa llama : do not allocate KV cache for "vocab_only == true" (#682)
Fixes sanitizer CI
2023-04-02 10:18:53 +03:00
Fabian
c4f89d8d73 make : use -march=native -mtune=native on x86 (#609) 2023-04-02 10:17:05 +03:00
Murilo Santana
5b70e7de4c fix default params for examples/main (#697) 2023-04-02 04:41:12 +02:00
Ikko Eltociear Ashimine
a717cba844 py: huggingface -> Hugging Face (#686) 2023-04-01 18:38:18 +02:00
rimoliga
d0a7f742e7 readme: replace termux links with homepage, play store is deprecated (#680) 2023-04-01 16:57:30 +02:00
Slaren
0d054e292e Show error message when -f fails 2023-04-01 16:08:40 +02:00
Stephan Walter
3525899277 Enable -std= for cmake builds, fix warnings (#598) 2023-03-31 19:19:16 +00:00
slaren
1d08882afa Optimize AVX2 ggml_vec_dot_q4_0 (#642) 2023-03-31 15:55:52 +00:00
perserk
02c5b27e91 Add AVX acceleration (#617)
* ggml : add AVX quantize_row_q4_0()

* ggml : add AVX ggml_vec_dot_q4_0()

* ggml : refactor AVX part of ggml_vec_dot_q4_0()

https://github.com/ggerganov/llama.cpp/pull/617#issuecomment-1489985645
2023-03-31 13:55:44 +02:00
Pavol Rusnak
cbef542879 py : cleanup the code
- use f-strings where possible
- drop first param of encode/decode functions since "utf-8" is the default
2023-03-31 10:32:01 +02:00
Pavol Rusnak
9733104be5 drop quantize.py (now that models are using a single file) 2023-03-31 01:07:32 +02:00
Georgi Gerganov
3df890aef4 readme : update supported models 2023-03-30 22:31:54 +03:00
Justine Tunney
ee0c40dd6d Introduce GGML migration tool for new file format
If you deleted your old Meta LLaMA .pth files, then the
migrate-ggml-2023-03-30-pr613.py script will allow you to convert your
old ggml files into the new mmap()'able format.

See #613
2023-03-30 12:28:25 -07:00
Justine Tunney
6f23ba5ee2 Ensure --mlock works properly with mmap() support 2023-03-30 12:28:25 -07:00
Justine Tunney
78ca9838ee Make loading weights 10-100x faster
This is a breaking change that's going to give you three benefits:

1. Your inference commands should load 100x faster
2. You may be able to safely load models 2x larger
3. You can run many concurrent inference processes

This was accomplished by changing the file format so we can mmap()
weights directly into memory without having to read() or copy them
thereby ensuring the kernel can make its file cache pages directly
accessible to our inference processes; and secondly, that the file
cache pages are much less likely to get evicted (which would force
loads to hit disk) because they're no longer competing with memory
pages that were needlessly created by gigabytes of standard i/o.

The new file format supports single-file models like LLaMA 7b, and
it also supports multi-file models like LLaMA 13B. Our Python tool
now merges the foo.1, foo.2, etc. files back into a single file so
that the C++ code which maps it doesn't need to reshape data every
time. That's made llama.cpp so much simpler. Much of its load code
has now been deleted.

Furthermore, this change ensures that tensors are aligned properly
on a 32-byte boundary. That opens the door to seeing if we can get
additional performance gains on some microprocessors, by using ops
that require memory alignment.

Lastly note that both POSIX and the Windows platform are supported

Fixes #91
2023-03-30 12:28:25 -07:00
Slaren
a017390358 Initial windows support (untested) 2023-03-30 12:28:25 -07:00
Slaren
ac184d5147 Always initialize mm_addr and mm_length in llama_model 2023-03-30 12:28:25 -07:00
Slaren
276e5b7811 Unmap the file in llama_free 2023-03-30 12:28:25 -07:00
Slaren
d68c5dc435 Make mmap_file static 2023-03-30 12:28:25 -07:00
Slaren
64bde3ffd4 Fix ggml_init_params in quantize 2023-03-30 12:28:25 -07:00
Slaren
c03ae8dca1 Add mmap support for model files 2023-03-30 12:28:25 -07:00
Stephan Walter
3bcc129ba8 cmake : properly invoke CTest (#629) 2023-03-30 20:56:59 +03:00
Casey Primozic
a4755cf288 Remove unused variable (#607)
* It seems some new warning were added recently that exposed this.  I wrote the code that included this unused variable originally and it is indeed not needed.
2023-03-30 17:53:35 +00:00
david raistrick
1f0414feec make : fix darwin f16c flags check (#615)
...there was no check.  ported upstream from https://github.com/zanussbaum/gpt4all.cpp/pull/2 (I dont see any clean path for upstream patches)
2023-03-30 20:34:45 +03:00
Georgi Gerganov
77efdf5a50 ggml : fix NEON signs (close #620, #622) 2023-03-30 20:27:32 +03:00
slaren
ed3c680bcd Fix GGML_F32Cx8_STORE in AVX without F16C path (#619) 2023-03-30 11:16:30 +02:00
anzz1
9cbc404ba6 ci : re-enable AVX512 testing (Windows-MSVC) (#584)
* CI: Re-enable AVX512 testing (Windows-MSVC)

Now with 100% less base64 encoding

* plain __cpuid is enough here
2023-03-29 23:44:39 +03:00
Georgi Gerganov
b51c717d5c ggml : init time on first ggml_init() call 2023-03-29 22:15:34 +03:00
Georgi Gerganov
0ba76c1e73 llama : fix compile warnings when reading the vocab 2023-03-29 22:13:12 +03:00
Georgi Gerganov
cea1c85948 ggml : add ARM_NEON dequantize_row_q4_1() 2023-03-29 22:10:01 +03:00
Georgi Gerganov
f202ada131 ggml : add ARM_NEON quantize_row_q4_1() 2023-03-29 22:03:07 +03:00
Georgi Gerganov
3b44d30d9b ggml : add ARM_NEON ggml_vec_dot_q4_1() 2023-03-29 22:03:07 +03:00
Pavol Rusnak
61cbfff5c9 rename convert_ggml_to_pth.py -> convert-ggml-to-pth.py (#600)
to match filenames of other converters
2023-03-29 20:09:25 +02:00
Thérence
d9ad104440 Create chat-13B.bat (#592)
* Create chat-13B.bat

Same script than chat-13B.sh, but for windows users.
Tested and working on windows 10/11 v 22H2

* Apply suggestions from code review

---------

Co-authored-by: anzz1 <anzz1@live.com>
2023-03-29 20:21:09 +03:00
Georgi Gerganov
b467702b87 readme : fix typos 2023-03-29 19:38:31 +03:00
Georgi Gerganov
516d88e75c readme : add GPT4All instructions (close #588) 2023-03-29 19:37:20 +03:00
Georgi Gerganov
53635c081c py : add GPT4All conversion script
For now: copy-paste
Too much time for me to deduplicate the python code
2023-03-29 19:29:52 +03:00
Maël Kerbiriou
41318d708e llama : use the same threshold for OpenBLAS and ggml thread limiting (#577) 2023-03-29 19:10:07 +03:00
Tobias Lütke
a6956b25a1 add example of re-act pattern (#583)
* add example of re-act pattern

* spelling...

* fixed whitespace in reverse prompt issue
2023-03-29 10:10:24 -05:00
anzz1
83df5639eb Fix GCC warning about binary literal (#595)
0b10101010 -> 0xAA /* 0b10101010 */
2023-03-29 13:20:07 +00:00
anzz1
a5c42c4b13 Fix typo in llama.h (#593) 2023-03-29 13:19:29 +00:00
anzz1
5a5f8b1501 Enable Fused-Multiply-Add (FMA) and F16C/CVT16 vector extensions on MSVC (#375)
* Enable Fused-Multiply-Add (FMA) instructions on MSVC

__FMA__ macro does not exist in MSVC

* Enable F16C/CVT16 vector extensions on MSVC

__F16C__ macro does not exist in MSVC, but is implied with AVX2/AVX512

* MSVC cvt intrinsics

* Add __SSE3__ macro for MSVC too because why not

even though it's not currently used for anything when AVX is defined
2023-03-28 22:44:29 +03:00
anzz1
f1217055ea CI: fix subdirectory path globbing (#546)
- Changes in subdirectories will now be detecter properly
- (Windows-MSVC) AVX512 tests temporarily disabled
2023-03-28 22:43:25 +03:00
anzz1
7f4c5c6651 llama : fix linkage with mingw (#551)
* Revert 7e53955 (#542)

Still needs to be fixed properly

* Fix linking on mingw32
2023-03-28 21:23:09 +03:00
148 changed files with 81714 additions and 5881 deletions

18
.clang-tidy Normal file
View File

@@ -0,0 +1,18 @@
---
Checks: >
bugprone-*,
-bugprone-easily-swappable-parameters,
-bugprone-implicit-widening-of-multiplication-result,
-bugprone-narrowing-conversions,
readability-*,
-readability-avoid-unconditional-preprocessor-if,
-readability-function-cognitive-complexity,
-readability-identifier-length,
-readability-implicit-bool-conversion,
-readability-magic-numbers,
-readability-uppercase-literal-suffix,
clang-analyzer-*,
-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
performance-*,
portability-*,
FormatStyle: none

View File

@@ -3,10 +3,12 @@ ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION as build
RUN apt-get update && \
apt-get install -y build-essential python3 python3-pip
apt-get install -y build-essential python3 python3-pip git
COPY requirements.txt requirements.txt
RUN pip install --upgrade pip setuptools wheel \
&& pip install numpy requests sentencepiece torch tqdm
&& pip install -r requirements.txt
WORKDIR /app
@@ -14,4 +16,6 @@ COPY . .
RUN make
ENV LC_ALL=C.utf8
ENTRYPOINT ["/app/.devops/tools.sh"]

View File

@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION as build
RUN apt-get update && \
apt-get install -y build-essential
apt-get install -y build-essential git
WORKDIR /app
@@ -15,4 +15,6 @@ FROM ubuntu:$UBUNTU_VERSION as runtime
COPY --from=build /app/main /main
ENTRYPOINT [ "/main" ]
ENV LC_ALL=C.utf8
ENTRYPOINT [ "/main" ]

View File

@@ -11,7 +11,7 @@ shift
arg2="$@"
if [[ $arg1 == '--convert' || $arg1 == '-c' ]]; then
python3 ./convert-pth-to-ggml.py $arg2
python3 ./convert.py $arg2
elif [[ $arg1 == '--quantize' || $arg1 == '-q' ]]; then
./quantize $arg2
elif [[ $arg1 == '--run' || $arg1 == '-r' ]]; then
@@ -23,7 +23,7 @@ elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then
echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
else
echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
./quantize "$i" "${i/f16/q4_0}" 2
./quantize "$i" "${i/f16/q4_0}" q4_0
fi
done
else
@@ -32,7 +32,7 @@ else
echo " --run (-r): Run a model previously converted into ggml"
echo " ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
echo " --convert (-c): Convert a llama model into ggml"
echo " ex: \"/models/7B/\" 1"
echo " ex: --outtype f16 \"/models/7B/\" "
echo " --quantize (-q): Optimize with quantization process ggml"
echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
echo " --all-in-one (-a): Execute --convert & --quantize"

View File

@@ -21,4 +21,4 @@ models/*
arm_neon.h
compile_commands.json
Dockerfile
Dockerfile

5
.ecrc Normal file
View File

@@ -0,0 +1,5 @@
{
"Disable": {
"IndentSize": true
}
}

19
.editorconfig Normal file
View File

@@ -0,0 +1,19 @@
# https://EditorConfig.org
# Top-most EditorConfig file
root = true
# Unix-style newlines with a newline ending every file, utf-8 charset
[*]
end_of_line = lf
insert_final_newline = true
trim_trailing_whitespace = true
charset = utf-8
indent_style = space
indent_size = 4
[Makefile]
indent_style = tab
[prompts/*.txt]
insert_final_newline = unset

2
.flake8 Normal file
View File

@@ -0,0 +1,2 @@
[flake8]
max-line-length = 125

View File

@@ -22,9 +22,9 @@ Please provide a detailed written description of what you were trying to do, and
# Current Behavior
Please provide a detailed written description of what `llama.cpp` did, instead.
Please provide a detailed written description of what `llama.cpp` did, instead.
# Environment and Context
# Environment and Context
Please provide detailed information about your computer setup. This is important in case the issue is not reproducible except for under certain specific conditions.
@@ -133,7 +133,7 @@ llama_model_load: loading model part 8/8 from './models/65B/ggml-model-q4_0.bin.
llama_model_load: .......................................................................................... done
llama_model_load: model size = 4869.09 MB / num tensors = 723
system_info: n_threads = 16 / 32 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 |
system_info: n_threads = 16 / 32 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 |
main: prompt: 'Please close your issue when it has been answered.'
main: number of tokens in prompt = 11
@@ -166,14 +166,14 @@ main: total time = 246406.42 ms
Performance counter stats for './main -m ./models/65B/ggml-model-q4_0.bin -t 16 -n 1024 -p Please close your issue when it has been answered.':
3636882.89 msec task-clock # 14.677 CPUs utilized
13509 context-switches # 3.714 /sec
2436 cpu-migrations # 0.670 /sec
10476679 page-faults # 2.881 K/sec
3636882.89 msec task-clock # 14.677 CPUs utilized
13509 context-switches # 3.714 /sec
2436 cpu-migrations # 0.670 /sec
10476679 page-faults # 2.881 K/sec
13133115082869 cycles # 3.611 GHz (16.77%)
29314462753 stalled-cycles-frontend # 0.22% frontend cycles idle (16.76%)
10294402631459 stalled-cycles-backend # 78.39% backend cycles idle (16.74%)
23479217109614 instructions # 1.79 insn per cycle
23479217109614 instructions # 1.79 insn per cycle
# 0.44 stalled cycles per insn (16.76%)
2353072268027 branches # 647.002 M/sec (16.77%)
1998682780 branch-misses # 0.08% of all branches (16.76%)

View File

@@ -8,17 +8,19 @@ on:
required: true
type: boolean
push:
paths: ['.github/workflows/**', 'CMakeLists.txt', 'Makefile', '**.h', '*.c', '**.cpp']
branches:
- master
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
pull_request:
types: [opened, synchronize, edited, reopened, review_requested, ready_for_review]
paths: ['CMakeLists.txt', 'Makefile', '**.h', '*.c', '**.cpp']
types: [opened, synchronize, reopened]
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
jobs:
ubuntu-latest-make:
runs-on: ubuntu-latest
ubuntu-focal-make:
runs-on: ubuntu-20.04
steps:
- name: Clone
@@ -29,12 +31,12 @@ jobs:
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential
sudo apt-get install build-essential gcc-8
- name: Build
id: make_build
run: |
make
CC=gcc-8 make
ubuntu-latest-cmake:
runs-on: ubuntu-latest
@@ -73,7 +75,6 @@ jobs:
matrix:
sanitizer: [ADDRESS, THREAD, UNDEFINED]
build_type: [Debug, Release]
accelerate: [ON, OFF]
steps:
- name: Clone
@@ -91,7 +92,7 @@ jobs:
run: |
mkdir build
cd build
cmake .. -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DLLAMA_ACCELERATE=${{ matrix.accelerate }}
cmake .. -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
cmake --build . --config ${{ matrix.build_type }}
- name: Test
@@ -110,6 +111,7 @@ jobs:
- name: Dependencies
id: depends
continue-on-error: true
run: |
brew update
@@ -119,7 +121,7 @@ jobs:
make
macOS-latest-cmake:
runs-on: macOS-latest
runs-on: macos-latest
steps:
- name: Clone
@@ -128,6 +130,7 @@ jobs:
- name: Dependencies
id: depends
continue-on-error: true
run: |
brew update
@@ -147,22 +150,64 @@ jobs:
windows-latest-cmake:
runs-on: windows-latest
env:
OPENBLAS_VERSION: 0.3.23
OPENCL_VERSION: 2023.04.17
CLBLAST_VERSION: 1.6.0
strategy:
matrix:
include:
- build: 'avx2'
defines: ''
- build: 'avx'
defines: '-DLLAMA_AVX2=OFF'
- build: 'avx512'
defines: '-DLLAMA_AVX512=ON'
- build: 'avx2'
defines: '-DLLAMA_BUILD_SERVER=ON'
- build: 'avx'
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF'
- build: 'avx512'
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
- build: 'clblast'
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
- build: 'openblas'
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
steps:
- name: Clone
id: checkout
uses: actions/checkout@v1
- name: Download OpenCL SDK
id: get_opencl
if: ${{ matrix.build == 'clblast' }}
run: |
curl.exe -o $env:RUNNER_TEMP/opencl.zip -L "https://github.com/KhronosGroup/OpenCL-SDK/releases/download/v${env:OPENCL_VERSION}/OpenCL-SDK-v${env:OPENCL_VERSION}-Win-x64.zip"
mkdir $env:RUNNER_TEMP/opencl
tar.exe -xvf $env:RUNNER_TEMP/opencl.zip --strip-components=1 -C $env:RUNNER_TEMP/opencl
- name: Download CLBlast
id: get_clblast
if: ${{ matrix.build == 'clblast' }}
run: |
curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z"
curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE"
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/clblast.7z
rename-item $env:RUNNER_TEMP/CLBlast-${env:CLBLAST_VERSION}-windows-x64 clblast
foreach ($f in (gci -Recurse -Path "$env:RUNNER_TEMP/clblast" -Filter '*.cmake')) {
$txt = Get-Content -Path $f -Raw
$txt.Replace('C:/vcpkg/packages/opencl_x64-windows/', "$($env:RUNNER_TEMP.Replace('\','/'))/opencl/") | Set-Content -Path $f -Encoding UTF8
}
- name: Download OpenBLAS
id: get_openblas
if: ${{ matrix.build == 'openblas' }}
run: |
curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip"
curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE"
mkdir $env:RUNNER_TEMP/openblas
tar.exe -xvf $env:RUNNER_TEMP/openblas.zip -C $env:RUNNER_TEMP/openblas
$vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath)
$msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim()))
$lib = $(join-path $msvc 'bin\Hostx64\x64\lib.exe')
& $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll
- name: Build
id: cmake_build
run: |
@@ -171,18 +216,36 @@ jobs:
cmake .. ${{ matrix.defines }}
cmake --build . --config Release
- name: Add clblast.dll
id: add_clblast_dll
if: ${{ matrix.build == 'clblast' }}
run: |
cp $env:RUNNER_TEMP/clblast/lib/clblast.dll ./build/bin/Release
cp $env:RUNNER_TEMP/CLBlast.LICENSE.txt ./build/bin/Release/CLBlast-${env:CLBLAST_VERSION}.txt
- name: Add libopenblas.dll
id: add_libopenblas_dll
if: ${{ matrix.build == 'openblas' }}
run: |
cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll
cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt
- name: Check AVX512F support
id: check_avx512f
if: ${{ matrix.build == 'avx512' }}
continue-on-error: true
run: |
cd build
Set-Content -Path .\avx512f.exe -Value ([Convert]::FromBase64String('TVqQAAMAAAAEAAAA//8AALgAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAyAAAAA4fug4AtAnNIbgBTM0hVGhpcyBwcm9ncmFtIGNhbm5vdCBiZSBydW4gaW4gRE9TIG1vZGUuDQ0KJAAAAAAAAAClmfXY4fibi+H4m4vh+JuL4fiai+P4m4si98aL4vibi7Xbq4vg+JuLUmljaOH4m4sAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABQRQAATAEBAGo6H2QAAAAAAAAAAOAADwELAQYAAAIAAAAAAAAAAAAADBAAAAAQAAAAIAAAAABAAAAQAAAAAgAABAAAAAAAAAAEAAAAAAAAAAAgAAAAAgAAAAAAAAMAAAAAABAAABAAAAAAEAAAEAAAAAAAABAAAAAAAAAAAAAAAFQQAAAoAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEAAADAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAC50ZXh0AAAAsgAAAAAQAAAAAgAAAAIAAAAAAAAAAAAAAAAAACAAAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACUEAAAiBAAAAAAAABVi+xRUVNTuAcAAAAPosHrEGaD4wGJXfxbg0X8MI1F+GoAUI1F/GoBUGr1/xUAEEAAUP8VBBBAAItF/FuDwND32BvAQMnDzMx8EAAAAAAAAAAAAACkEAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAlBAAAIgQAAAAAAAApANXcml0ZUZpbGUAuQFHZXRTdGRIYW5kbGUAAEtFUk5FTDMyLmRsbAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA==')) -AsByteStream
.\avx512f.exe && echo " AVX512F: YES" && ( echo HAS_AVX512F=1 >> $env:GITHUB_ENV ) || echo " AVX512F: NO"
$vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath)
$msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim()))
$cl = $(join-path $msvc 'bin\Hostx64\x64\cl.exe')
echo 'int main(void){unsigned int a[4];__cpuid(a,7);return !(a[1]&65536);}' >> avx512f.c
& $cl /O2 /GS- /kernel avx512f.c /link /nodefaultlib /entry:main
.\avx512f.exe && echo "AVX512F: YES" && ( echo HAS_AVX512F=1 >> $env:GITHUB_ENV ) || echo "AVX512F: NO"
- name: Test
id: cmake_test
if: ${{ matrix.build != 'avx512' || env.HAS_AVX512F == '1' }} # Test AVX-512 only when possible
if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # Test AVX-512 only when possible
run: |
cd build
ctest -C Release --verbose
@@ -196,6 +259,7 @@ jobs:
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*
- name: Upload artifacts
@@ -205,17 +269,94 @@ jobs:
path: |
llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip
windows-latest-cmake-cublas:
runs-on: windows-latest
strategy:
matrix:
cuda: ['12.1.0', '11.7.1']
build: ['cublas']
steps:
- name: Clone
id: checkout
uses: actions/checkout@v1
- uses: Jimver/cuda-toolkit@v0.2.10
id: cuda-toolkit
with:
cuda: ${{ matrix.cuda }}
# TODO(green-sky): _dev seems to fail, and non dev are not enought
#sub-packages: '["nvcc", "cudart", "cublas", "cudart_dev", "cublas_dev"]'
- name: Build
id: cmake_build
run: |
mkdir build
cd build
cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON
cmake --build . --config Release
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: pr-mpt/actions-commit-hash@v2
- name: Pack artifacts
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\*
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v3
with:
path: |
llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
- name: Copy and pack Cuda runtime
if: ${{ matrix.cuda == '12.1.0' }}
# TODO(green-sky): paths are cuda 12 specific
run: |
echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
mkdir '.\build\bin\cudart\'
cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cudart64_12.dll" '.\build\bin\cudart\'
cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublas64_12.dll" '.\build\bin\cudart\'
cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublasLt64_12.dll" '.\build\bin\cudart\'
7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip .\build\bin\cudart\*
- name: Copy and pack Cuda runtime
if: ${{ matrix.cuda == '11.7.1' }}
# TODO(green-sky): paths are cuda 11 specific
run: |
echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
mkdir '.\build\bin\cudart\'
ls "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin"
cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cudart64_110.dll" '.\build\bin\cudart\'
cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublas64_11.dll" '.\build\bin\cudart\'
cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublasLt64_11.dll" '.\build\bin\cudart\'
7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip .\build\bin\cudart\*
- name: Upload Cuda runtime
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v3
with:
path: |
cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
release:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
runs-on: ubuntu-latest
needs:
- ubuntu-latest-make
- ubuntu-focal-make
- ubuntu-latest-cmake
- macOS-latest-make
- macOS-latest-cmake
- windows-latest-cmake
- windows-latest-cmake-cublas
steps:
- name: Download artifacts

View File

@@ -18,6 +18,8 @@ on:
jobs:
push_to_registry:
name: Push Docker image to Docker Hub
if: github.event.pull_request.draft == false
runs-on: ubuntu-latest
env:
COMMIT_SHA: ${{ github.sha }}
@@ -60,4 +62,4 @@ jobs:
push: ${{ github.event_name == 'push' }}
platforms: linux/amd64,linux/arm64
tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}"
file: ${{ matrix.config.dockerfile }}
file: ${{ matrix.config.dockerfile }}

17
.github/workflows/editorconfig.yml vendored Normal file
View File

@@ -0,0 +1,17 @@
name: EditorConfig Checker
on:
push:
branches:
- master
pull_request:
branches:
- master
jobs:
editorconfig:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: editorconfig-checker/action-editorconfig-checker@main
- run: editorconfig-checker

20
.github/workflows/tidy-post.yml vendored Normal file
View File

@@ -0,0 +1,20 @@
name: clang-tidy review post comments
on:
workflow_dispatch:
workflows: ["clang-tidy-review"]
types:
- completed
jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: ZedThree/clang-tidy-review/post@v0.13.0
# lgtm_comment_body, max_comments, and annotations need to be set on the posting workflow in a split setup
with:
# adjust options as necessary
lgtm_comment_body: ''
annotations: false
max_comments: 25

23
.github/workflows/tidy-review.yml vendored Normal file
View File

@@ -0,0 +1,23 @@
name: clang-tidy-review
on:
pull_request:
branches:
- master
jobs:
clang-tidy-review:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: ZedThree/clang-tidy-review@v0.13.0
id: review
with:
lgtm_comment_body: ''
build_dir: build
cmake_command: cmake . -B build -DCMAKE_EXPORT_COMPILE_COMMANDS=on
split_workflow: true
- uses: ZedThree/clang-tidy-review/upload@v0.13.0

41
.gitignore vendored
View File

@@ -1,34 +1,59 @@
*.o
*.a
*.so
.DS_Store
.build/
.cache/
.direnv/
.envrc
.swiftpm
.venv
.clang-tidy
.vs/
.vscode/
.DS_Store
.build/
build/
build-em/
build-debug/
build-release/
build-static/
build-cublas/
build-opencl/
build-metal/
build-no-accel/
build-sanitize-addr/
build-sanitize-thread/
out/
models/*
*.bin
/main
/quantize
/quantize-stats
/result
/perplexity
/embedding
/train-text-from-scratch
/simple
/benchmark-matmult
/vdot
/server
/Pipfile
/embd-input-test
/libllama.so
build-info.h
arm_neon.h
compile_commands.json
CMakeSettings.json
.envrc
.direnv/
.venv
__pycache__
.swiftpm
zig-out/
zig-cache/
ppl-*.txt
qnt-*.txt
perf-*.txt
examples/jeopardy/results.txt

15
.pre-commit-config.yaml Normal file
View File

@@ -0,0 +1,15 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
exclude: prompts/.*.txt
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v3.2.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-yaml
- id: check-added-large-files
- repo: https://github.com/PyCQA/flake8
rev: 6.0.0
hooks:
- id: flake8

View File

@@ -37,38 +37,90 @@ endif()
#
# general
option(LLAMA_STATIC "llama: static link libraries" OFF)
option(LLAMA_NATIVE "llama: enable -march=native flag" OFF)
option(LLAMA_LTO "llama: enable link time optimization" OFF)
option(LLAMA_STATIC "llama: static link libraries" OFF)
option(LLAMA_NATIVE "llama: enable -march=native flag" OFF)
option(LLAMA_LTO "llama: enable link time optimization" OFF)
# debug
option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON)
option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF)
option(LLAMA_GPROF "llama: enable gprof" OFF)
option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON)
option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF)
option(LLAMA_GPROF "llama: enable gprof" OFF)
# sanitizers
option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF)
option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF)
option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF)
option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF)
option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
# instruction set specific
option(LLAMA_AVX "llama: enable AVX" ON)
option(LLAMA_AVX2 "llama: enable AVX2" ON)
option(LLAMA_AVX512 "llama: enable AVX512" OFF)
option(LLAMA_FMA "llama: enable FMA" ON)
option(LLAMA_AVX "llama: enable AVX" ON)
option(LLAMA_AVX2 "llama: enable AVX2" ON)
option(LLAMA_AVX512 "llama: enable AVX512" OFF)
option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF)
option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF)
option(LLAMA_FMA "llama: enable FMA" ON)
# in MSVC F16C is implied with AVX2/AVX512
if (NOT MSVC)
option(LLAMA_F16C "llama: enable F16C" ON)
endif()
# 3rd party libs
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
option(LLAMA_OPENBLAS "llama: use OpenBLAS" OFF)
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
option(LLAMA_BLAS "llama: use BLAS" OFF)
set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
option(LLAMA_CUDA_DMMV_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF)
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
option(LLAMA_METAL "llama: use Metal" OFF)
option(LLAMA_K_QUANTS "llama: use k-quants" ON)
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_SERVER "llama: build server example" ON)
#
# Build info header
#
# Generate initial build-info.h
include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git")
set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/.git")
# Is git submodule
if(NOT IS_DIRECTORY "${GIT_DIR}")
file(READ ${GIT_DIR} REAL_GIT_DIR_LINK)
string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" REAL_GIT_DIR ${REAL_GIT_DIR_LINK})
set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/${REAL_GIT_DIR}")
endif()
# Add a custom target for build-info.h
add_custom_target(BUILD_INFO ALL DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h")
# Add a custom command to rebuild build-info.h when .git/index changes
add_custom_command(
OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h"
COMMENT "Generating build details from Git"
COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake"
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
DEPENDS "${GIT_DIR}/index"
VERBATIM
)
else()
message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
endif()
#
# Compile flags
#
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED true)
set(CMAKE_C_STANDARD 11)
set(CMAKE_C_STANDARD_REQUIRED true)
set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED)
@@ -101,20 +153,163 @@ if (APPLE AND LLAMA_ACCELERATE)
message(WARNING "Accelerate framework not found")
endif()
endif()
if (LLAMA_OPENBLAS)
if (LLAMA_BLAS)
if (LLAMA_STATIC)
set(BLA_STATIC ON)
endif()
if ($(CMAKE_VERSION) VERSION_GREATER_EQUAL 3.22)
set(BLA_SIZEOF_INTEGER 8)
endif()
set(BLA_VENDOR OpenBLAS)
set(BLA_VENDOR ${LLAMA_BLAS_VENDOR})
find_package(BLAS)
if (BLAS_FOUND)
message(STATUS "OpenBLAS found")
if (BLAS_FOUND)
message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}")
if ("${BLAS_INCLUDE_DIRS}" STREQUAL "")
# BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake.
# see https://gitlab.kitware.com/cmake/cmake/-/issues/20268
find_package(PkgConfig REQUIRED)
if (${LLAMA_BLAS_VENDOR} MATCHES "Generic")
pkg_check_modules(DepBLAS REQUIRED blas)
elseif (${LLAMA_BLAS_VENDOR} MATCHES "OpenBLAS")
pkg_check_modules(DepBLAS REQUIRED openblas)
elseif (${LLAMA_BLAS_VENDOR} MATCHES "FLAME")
pkg_check_modules(DepBLAS REQUIRED blis)
elseif (${LLAMA_BLAS_VENDOR} MATCHES "ATLAS")
pkg_check_modules(DepBLAS REQUIRED blas-atlas)
elseif (${LLAMA_BLAS_VENDOR} MATCHES "FlexiBLAS")
pkg_check_modules(DepBLAS REQUIRED flexiblas_api)
elseif (${LLAMA_BLAS_VENDOR} MATCHES "Intel")
# all Intel* libraries share the same include path
pkg_check_modules(DepBLAS REQUIRED mkl-sdl)
elseif (${LLAMA_BLAS_VENDOR} MATCHES "NVHPC")
# this doesn't provide pkg-config
# suggest to assign BLAS_INCLUDE_DIRS on your own
if ("${NVHPC_VERSION}" STREQUAL "")
message(WARNING "Better to set NVHPC_VERSION")
else()
set(DepBLAS_FOUND ON)
set(DepBLAS_INCLUDE_DIRS "/opt/nvidia/hpc_sdk/${CMAKE_SYSTEM_NAME}_${CMAKE_SYSTEM_PROCESSOR}/${NVHPC_VERSION}/math_libs/include")
endif()
endif()
if (DepBLAS_FOUND)
set(BLAS_INCLUDE_DIRS ${DepBLAS_INCLUDE_DIRS})
else()
message(WARNING "BLAS_INCLUDE_DIRS neither been provided nor been automatically"
" detected by pkgconfig, trying to find cblas.h from possible paths...")
find_path(BLAS_INCLUDE_DIRS
NAMES cblas.h
HINTS
/usr/include
/usr/local/include
/usr/include/openblas
/opt/homebrew/opt/openblas/include
/usr/local/opt/openblas/include
/usr/include/x86_64-linux-gnu/openblas/include
)
endif()
endif()
message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")
add_compile_options(${BLAS_LINKER_FLAGS})
add_compile_definitions(GGML_USE_OPENBLAS)
add_link_options(${BLAS_LIBRARIES})
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES})
set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})
else()
message(WARNING "OpenBLAS not found")
message(WARNING "BLAS not found, please refer to "
"https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
" to set correct LLAMA_BLAS_VENDOR")
endif()
endif()
if (LLAMA_K_QUANTS)
set(GGML_SOURCES_EXTRA ${GGML_SOURCES_EXTRA} k_quants.c k_quants.h)
add_compile_definitions(GGML_USE_K_QUANTS)
if (LLAMA_QKK_64)
add_compile_definitions(GGML_QKK_64)
endif()
endif()
if (LLAMA_CUBLAS)
cmake_minimum_required(VERSION 3.17)
find_package(CUDAToolkit)
if (CUDAToolkit_FOUND)
message(STATUS "cuBLAS found")
enable_language(CUDA)
set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)
add_compile_definitions(GGML_USE_CUBLAS)
add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
add_compile_definitions(GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y})
if (LLAMA_CUDA_DMMV_F16)
add_compile_definitions(GGML_CUDA_DMMV_F16)
endif()
add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
if (LLAMA_STATIC)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
else()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
endif()
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
if (LLAMA_CUDA_DMMV_F16)
set(CMAKE_CUDA_ARCHITECTURES "61") # needed for f16 CUDA intrinsics
else()
set(CMAKE_CUDA_ARCHITECTURES "52") # lowest CUDA 12 standard
endif()
endif()
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
else()
message(WARNING "cuBLAS not found")
endif()
endif()
if (LLAMA_METAL)
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
find_library(METAL_FRAMEWORK Metal REQUIRED)
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED)
set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)
add_compile_definitions(GGML_USE_METAL)
add_compile_definitions(GGML_METAL_NDEBUG)
# get full path to the file
#add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
# copy ggml-metal.metal to bin directory
configure_file(ggml-metal.metal bin/ggml-metal.metal COPYONLY)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
${FOUNDATION_LIBRARY}
${METAL_FRAMEWORK}
${METALKIT_FRAMEWORK}
${METALPERFORMANCE_FRAMEWORK}
)
endif()
if (LLAMA_CLBLAST)
find_package(CLBlast)
if (CLBlast_FOUND)
message(STATUS "CLBlast found")
set(GGML_SOURCES_OPENCL ggml-opencl.cpp ggml-opencl.h)
add_compile_definitions(GGML_USE_CLBLAST)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} clblast)
else()
message(WARNING "CLBlast not found")
endif()
endif()
@@ -129,7 +324,6 @@ if (LLAMA_ALL_WARNINGS)
-Wshadow
-Wstrict-prototypes
-Wpointer-arith
-Wno-unused-function
)
set(cxx_flags
-Wall
@@ -137,6 +331,7 @@ if (LLAMA_ALL_WARNINGS)
-Wpedantic
-Wcast-qual
-Wno-unused-function
-Wno-multichar
)
else()
# todo : msvc
@@ -149,6 +344,14 @@ if (LLAMA_ALL_WARNINGS)
endif()
if (MSVC)
add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
if (BUILD_SHARED_LIBS)
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
endif()
endif()
if (LLAMA_LTO)
include(CheckIPOSupported)
check_ipo_supported(RESULT result OUTPUT output)
@@ -183,23 +386,48 @@ if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES
if (MSVC)
# TODO: arm msvc?
else()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
add_compile_options(-mcpu=native)
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
# Raspberry Pi 1, Zero
add_compile_options(-mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access)
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
# Raspberry Pi 2
add_compile_options(-mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations)
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
# Raspberry Pi 3, 4, Zero 2 (32-bit)
add_compile_options(-mfp16-format=ieee -mno-unaligned-access)
endif()
# TODO: armv6,7,8 version specific flags
endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
message(STATUS "x86 detected")
if (MSVC)
if (LLAMA_AVX512)
add_compile_options(/arch:AVX512)
add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX512>)
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX512>)
# MSVC has no compile-time flags enabling specific
# AVX512 extensions, neither it defines the
# macros corresponding to the extensions.
# Do it manually.
if (LLAMA_AVX512_VBMI)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
endif()
if (LLAMA_AVX512_VNNI)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
endif()
elseif (LLAMA_AVX2)
add_compile_options(/arch:AVX2)
add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX2>)
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX2>)
elseif (LLAMA_AVX)
add_compile_options(/arch:AVX)
add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX>)
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX>)
endif()
else()
add_compile_options(-mf16c)
if (LLAMA_F16C)
add_compile_options(-mf16c)
endif()
if (LLAMA_FMA)
add_compile_options(-mfma)
endif()
@@ -211,13 +439,20 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
endif()
if (LLAMA_AVX512)
add_compile_options(-mavx512f)
# add_compile_options(-mavx512cd)
# add_compile_options(-mavx512dq)
# add_compile_options(-mavx512bw)
add_compile_options(-mavx512bw)
endif()
if (LLAMA_AVX512_VBMI)
add_compile_options(-mavx512vbmi)
endif()
if (LLAMA_AVX512_VNNI)
add_compile_options(-mavx512vnni)
endif()
endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
message(STATUS "PowerPC detected")
add_compile_options(-mcpu=native -mtune=native)
#TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
else()
# TODO: support PowerPC
message(STATUS "Unknown architecture")
endif()
@@ -227,36 +462,56 @@ endif()
add_library(ggml OBJECT
ggml.c
ggml.h)
ggml.h
${GGML_SOURCES_CUDA}
${GGML_SOURCES_OPENCL}
${GGML_SOURCES_METAL}
${GGML_SOURCES_EXTRA}
)
target_include_directories(ggml PUBLIC .)
target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
target_compile_features(ggml PUBLIC c_std_11) # don't bump
target_link_libraries(ggml PRIVATE Threads::Threads ${LLAMA_EXTRA_LIBS})
target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
add_library(ggml_static STATIC $<TARGET_OBJECTS:ggml>)
if (BUILD_SHARED_LIBS)
set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
add_library(ggml_shared SHARED $<TARGET_OBJECTS:ggml>)
target_link_libraries(ggml_shared PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
endif()
add_library(llama
llama.cpp
llama.h)
llama.h
llama-util.h
)
target_include_directories(llama PUBLIC .)
target_compile_features(llama PUBLIC cxx_std_11) # don't bump
target_link_libraries(llama PRIVATE ggml ${LLAMA_EXTRA_LIBS})
target_link_libraries(llama PRIVATE
ggml
${LLAMA_EXTRA_LIBS}
)
if (BUILD_SHARED_LIBS)
set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD)
if (LLAMA_METAL)
set_target_properties(llama PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
endif()
endif()
#
# programs, examples and tests
#
if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
enable_testing()
include(CTest)
add_subdirectory(tests)
endif ()
if (LLAMA_BUILD_EXAMPLES)
add_subdirectory(examples)
add_subdirectory(pocs)
endif()

315
Makefile
View File

@@ -1,3 +1,8 @@
# Define the default target now so that it is always the first target
BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple server libembdinput.so embd-input-test
default: $(BUILD_TARGETS)
ifndef UNAME_S
UNAME_S := $(shell uname -s)
endif
@@ -31,13 +36,32 @@ endif
#
# keep standard at C11 and C++11
CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC
CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
# -Ofast tends to produce faster code, but may not be available for some compilers.
ifdef LLAMA_FAST
OPT = -Ofast
else
OPT = -O3
endif
CFLAGS = -I. $(OPT) -std=c11 -fPIC
CXXFLAGS = -I. -I./examples $(OPT) -std=c++11 -fPIC
LDFLAGS =
ifdef LLAMA_DEBUG
CFLAGS += -O0 -g
CXXFLAGS += -O0 -g
LDFLAGS += -g
else
CFLAGS += -DNDEBUG
CXXFLAGS += -DNDEBUG
endif
ifdef LLAMA_SERVER_VERBOSE
CXXFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
endif
# warnings
CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wno-unused-function
CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith
CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar
# OS specific
# TODO: support Windows
@@ -66,101 +90,36 @@ ifeq ($(UNAME_S),Haiku)
CXXFLAGS += -pthread
endif
ifdef LLAMA_GPROF
CFLAGS += -pg
CXXFLAGS += -pg
endif
ifdef LLAMA_PERF
CFLAGS += -DGGML_PERF
CXXFLAGS += -DGGML_PERF
endif
# Architecture specific
# TODO: probably these flags need to be tweaked on some architectures
# feel free to update the Makefile for your architecture and send a pull request or issue
ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
ifeq ($(UNAME_S),Darwin)
CFLAGS += -mf16c
AVX1_M := $(shell sysctl machdep.cpu.features)
ifneq (,$(findstring FMA,$(AVX1_M)))
CFLAGS += -mfma
endif
ifneq (,$(findstring AVX1.0,$(AVX1_M)))
CFLAGS += -mavx
endif
AVX2_M := $(shell sysctl machdep.cpu.leaf7_features)
ifneq (,$(findstring AVX2,$(AVX2_M)))
CFLAGS += -mavx2
endif
else ifeq ($(UNAME_S),Linux)
AVX1_M := $(shell grep "avx " /proc/cpuinfo)
ifneq (,$(findstring avx,$(AVX1_M)))
CFLAGS += -mavx
endif
AVX2_M := $(shell grep "avx2 " /proc/cpuinfo)
ifneq (,$(findstring avx2,$(AVX2_M)))
CFLAGS += -mavx2
endif
FMA_M := $(shell grep "fma " /proc/cpuinfo)
ifneq (,$(findstring fma,$(FMA_M)))
CFLAGS += -mfma
endif
F16C_M := $(shell grep "f16c " /proc/cpuinfo)
ifneq (,$(findstring f16c,$(F16C_M)))
CFLAGS += -mf16c
endif
SSE3_M := $(shell grep "sse3 " /proc/cpuinfo)
ifneq (,$(findstring sse3,$(SSE3_M)))
CFLAGS += -msse3
endif
AVX512F_M := $(shell grep "avx512f " /proc/cpuinfo)
ifneq (,$(findstring avx512f,$(AVX512F_M)))
CFLAGS += -mavx512f
endif
AVX512BW_M := $(shell grep "avx512bw " /proc/cpuinfo)
ifneq (,$(findstring avx512bw,$(AVX512BW_M)))
CFLAGS += -mavx512bw
endif
AVX512DQ_M := $(shell grep "avx512dq " /proc/cpuinfo)
ifneq (,$(findstring avx512dq,$(AVX512DQ_M)))
CFLAGS += -mavx512dq
endif
AVX512VL_M := $(shell grep "avx512vl " /proc/cpuinfo)
ifneq (,$(findstring avx512vl,$(AVX512VL_M)))
CFLAGS += -mavx512vl
endif
AVX512CD_M := $(shell grep "avx512cd " /proc/cpuinfo)
ifneq (,$(findstring avx512cd,$(AVX512CD_M)))
CFLAGS += -mavx512cd
endif
AVX512ER_M := $(shell grep "avx512er " /proc/cpuinfo)
ifneq (,$(findstring avx512er,$(AVX512ER_M)))
CFLAGS += -mavx512er
endif
AVX512IFMA_M := $(shell grep "avx512ifma " /proc/cpuinfo)
ifneq (,$(findstring avx512ifma,$(AVX512IFMA_M)))
CFLAGS += -mavx512ifma
endif
AVX512PF_M := $(shell grep "avx512pf " /proc/cpuinfo)
ifneq (,$(findstring avx512pf,$(AVX512PF_M)))
CFLAGS += -mavx512pf
endif
else ifeq ($(UNAME_S),Haiku)
AVX1_M := $(shell sysinfo -cpu | grep -w "AVX")
ifneq (,$(findstring AVX,$(AVX1_M)))
CFLAGS += -mavx
endif
AVX2_M := $(shell sysinfo -cpu | grep -w "AVX2")
ifneq (,$(findstring AVX2,$(AVX2_M)))
CFLAGS += -mavx2
endif
FMA_M := $(shell sysinfo -cpu | grep -w "FMA")
ifneq (,$(findstring FMA,$(FMA_M)))
CFLAGS += -mfma
endif
F16C_M := $(shell sysinfo -cpu | grep -w "F16C")
ifneq (,$(findstring F16C,$(F16C_M)))
CFLAGS += -mf16c
endif
else
CFLAGS += -mfma -mf16c -mavx -mavx2
endif
# Use all CPU extensions that are available:
CFLAGS += -march=native -mtune=native
CXXFLAGS += -march=native -mtune=native
# Usage AVX-only
#CFLAGS += -mfma -mf16c -mavx
#CXXFLAGS += -mfma -mf16c -mavx
# Usage SSSE3-only (Not is SSE3!)
#CFLAGS += -mssse3
#CXXFLAGS += -mssse3
endif
ifneq ($(filter ppc64%,$(UNAME_M)),)
POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
ifneq (,$(findstring POWER9,$(POWER9_M)))
CFLAGS += -mcpu=power9
CFLAGS += -mcpu=power9
CXXFLAGS += -mcpu=power9
endif
# Require c++23's std::byteswap for big-endian support.
@@ -168,6 +127,17 @@ ifneq ($(filter ppc64%,$(UNAME_M)),)
CXXFLAGS += -std=c++23 -DGGML_BIG_ENDIAN
endif
endif
ifndef LLAMA_NO_K_QUANTS
CFLAGS += -DGGML_USE_K_QUANTS
CXXFLAGS += -DGGML_USE_K_QUANTS
OBJS += k_quants.o
ifdef LLAMA_QKK_64
CFLAGS += -DGGML_QKK_64
CXXFLAGS += -DGGML_QKK_64
endif
endif
ifndef LLAMA_NO_ACCELERATE
# Mac M1 - include Accelerate framework.
# `-framework Accelerate` works on Mac Intel as well, with negliable performance boost (as of the predict time).
@@ -175,32 +145,99 @@ ifndef LLAMA_NO_ACCELERATE
CFLAGS += -DGGML_USE_ACCELERATE
LDFLAGS += -framework Accelerate
endif
endif
endif # LLAMA_NO_ACCELERATE
ifdef LLAMA_OPENBLAS
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas
LDFLAGS += -lopenblas
endif # LLAMA_OPENBLAS
ifdef LLAMA_BLIS
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
LDFLAGS += -lblis -L/usr/local/lib
endif # LLAMA_BLIS
ifdef LLAMA_CUBLAS
CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
OBJS += ggml-cuda.o
NVCC = nvcc
NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native
ifdef LLAMA_CUDA_DMMV_X
NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
else
NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
endif # LLAMA_CUDA_DMMV_X
ifdef LLAMA_CUDA_DMMV_Y
NVCCFLAGS += -DGGML_CUDA_DMMV_Y=$(LLAMA_CUDA_DMMV_Y)
else
NVCCFLAGS += -DGGML_CUDA_DMMV_Y=1
endif # LLAMA_CUDA_DMMV_Y
ifdef LLAMA_CUDA_DMMV_F16
NVCCFLAGS += -DGGML_CUDA_DMMV_F16
endif # LLAMA_CUDA_DMMV_F16
ifdef LLAMA_CUDA_KQUANTS_ITER
NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
else
NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
endif
ifdef LLAMA_GPROF
CFLAGS += -pg
CXXFLAGS += -pg
endif
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
endif # LLAMA_CUBLAS
ifdef LLAMA_CLBLAST
CFLAGS += -DGGML_USE_CLBLAST
CXXFLAGS += -DGGML_USE_CLBLAST
# Mac provides OpenCL as a framework
ifeq ($(UNAME_S),Darwin)
LDFLAGS += -lclblast -framework OpenCL
else
LDFLAGS += -lclblast -lOpenCL
endif
OBJS += ggml-opencl.o
ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
$(CXX) $(CXXFLAGS) -c $< -o $@
endif # LLAMA_CLBLAST
ifdef LLAMA_METAL
CFLAGS += -DGGML_USE_METAL -DGGML_METAL_NDEBUG
CXXFLAGS += -DGGML_USE_METAL
LDFLAGS += -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
OBJS += ggml-metal.o
ggml-metal.o: ggml-metal.m ggml-metal.h
$(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_METAL
ifneq ($(filter aarch64%,$(UNAME_M)),)
CFLAGS += -mcpu=native
# Apple M1, M2, etc.
# Raspberry Pi 3, 4, Zero 2 (64-bit)
CFLAGS += -mcpu=native
CXXFLAGS += -mcpu=native
endif
ifneq ($(filter armv6%,$(UNAME_M)),)
# Raspberry Pi 1, 2, 3
# Raspberry Pi 1, Zero
CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
endif
ifneq ($(filter armv7%,$(UNAME_M)),)
# Raspberry Pi 4
# Raspberry Pi 2
CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
endif
ifneq ($(filter armv8%,$(UNAME_M)),)
# Raspberry Pi 4
# Raspberry Pi 3, 4, Zero 2 (32-bit)
CFLAGS += -mfp16-format=ieee -mno-unaligned-access
endif
ifdef LLAMA_NO_K_QUANTS
k_quants.o: k_quants.c k_quants.h
$(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_NO_K_QUANTS
#
# Print build information
#
@@ -216,43 +253,85 @@ $(info I CC: $(CCV))
$(info I CXX: $(CXXV))
$(info )
default: main quantize perplexity embedding
#
# Build library
#
ggml.o: ggml.c ggml.h
$(CC) $(CFLAGS) -c ggml.c -o ggml.o
ggml.o: ggml.c ggml.h ggml-cuda.h
$(CC) $(CFLAGS) -c $< -o $@
llama.o: llama.cpp llama.h
$(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o
llama.o: llama.cpp ggml.h ggml-cuda.h ggml-metal.h llama.h llama-util.h
$(CXX) $(CXXFLAGS) -c $< -o $@
common.o: examples/common.cpp examples/common.h
$(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o
$(CXX) $(CXXFLAGS) -c $< -o $@
libllama.so: llama.o ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
clean:
rm -vf *.o main quantize perplexity embedding
rm -vf *.o *.so main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch embd-input-test build-info.h
main: examples/main/main.cpp ggml.o llama.o common.o
$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS)
#
# Examples
#
main: examples/main/main.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
@echo
@echo '==== Run ./main -h for help. ===='
@echo
quantize: examples/quantize/quantize.cpp ggml.o llama.o
$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize $(LDFLAGS)
simple: examples/simple/simple.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o
$(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS)
quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o
$(CXX) $(CXXFLAGS) examples/embedding/embedding.cpp ggml.o llama.o common.o -o embedding $(LDFLAGS)
quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
libembdinput.so: examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) --shared $(CXXFLAGS) $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
embd-input-test: libembdinput.so examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.so,$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput
train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
build-info.h: $(wildcard .git/index) scripts/build-info.sh
@sh scripts/build-info.sh > $@.tmp
@if ! cmp -s $@.tmp $@; then \
mv $@.tmp $@; \
else \
rm $@.tmp; \
fi
#
# Tests
#
.PHONY: tests
benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
./$@
vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
.PHONY: tests clean
tests:
bash ./tests/run-tests.sh

View File

@@ -11,9 +11,13 @@ let package = Package(
.target(
name: "llama",
path: ".",
exclude: ["ggml-metal.metal"],
sources: ["ggml.c", "llama.cpp"],
publicHeadersPath: "spm-headers",
cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"])]
cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"]), .define("GGML_USE_ACCELERATE")],
linkerSettings: [
.linkedFramework("Accelerate")
]
),
],
cxxLanguageStandard: .cxx11

578
README.md
View File

@@ -1,42 +1,107 @@
# llama.cpp
![llama](https://user-images.githubusercontent.com/1991296/227761327-6d83e30e-2200-41a6-bfbb-f575231c54f4.png)
![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)
[![Actions Status](https://github.com/ggerganov/llama.cpp/workflows/CI/badge.svg)](https://github.com/ggerganov/llama.cpp/actions)
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
**Hot topics:**
- [Roadmap (short-term)](https://github.com/ggerganov/llama.cpp/discussions/457)
- New C-style API is now available: https://github.com/ggerganov/llama.cpp/pull/370
- Cache input prompts for faster initialization: https://github.com/ggerganov/llama.cpp/issues/64
- Create a `llama.cpp` logo: https://github.com/ggerganov/llama.cpp/issues/105
- Simple web chat example: https://github.com/ggerganov/llama.cpp/pull/1998
- k-quants now support super-block size of 64: https://github.com/ggerganov/llama.cpp/pull/2001
- New roadmap: https://github.com/users/ggerganov/projects/7
- Azure CI brainstorming: https://github.com/ggerganov/llama.cpp/discussions/1985
- p1 : LLM-based code completion engine at the edge : https://github.com/ggml-org/p1/discussions/1
<details>
<summary>Table of Contents</summary>
<ol>
<li>
<a href="#description">Description</a>
</li>
<li>
<a href="#usage">Usage</a>
<ul>
<li><a href="#get-the-code">Get the Code</a></li>
<li><a href="#build">Build</a></li>
<li><a href="#blas-build">BLAS Build</a></li>
<li><a href="#prepare-data--run">Prepare Data & Run</a></li>
<li><a href="#memorydisk-requirements">Memory/Disk Requirements</a></li>
<li><a href="#quantization">Quantization</a></li>
<li><a href="#interactive-mode">Interactive mode</a></li>
<li><a href="#instruction-mode-with-alpaca">Instruction mode with Alpaca</a></li>
<li><a href="#using-openllama">Using OpenLLaMA</a></li>
<li><a href="#using-gpt4all">Using GPT4All</a></li>
<li><a href="#using-pygmalion-7b--metharme-7b">Using Pygmalion 7B & Metharme 7B</a></li>
<li><a href="#obtaining-the-facebook-llama-original-model-and-stanford-alpaca-model-data">Obtaining the Facebook LLaMA original model and Stanford Alpaca model data</a></li>
<li><a href="#verifying-the-model-files">Verifying the model files</a></li>
<li><a href="#seminal-papers-and-background-on-the-models">Seminal papers and background on the models</a></li>
<li><a href="#perplexity-measuring-model-quality">Perplexity (measuring model quality)</a></li>
<li><a href="#android">Android</a></li>
<li><a href="#docker">Docker</a></li>
</ul>
</li>
<li><a href="#contributing">Contributing</a></li>
<li><a href="#coding-guidelines">Coding guidelines</a></li>
<li><a href="#docs">Docs</a></li>
</ol>
</details>
## Description
The main goal is to run the model using 4-bit quantization on a MacBook
The main goal of `llama.cpp` is to run the LLaMA model using 4-bit integer quantization on a MacBook
- Plain C/C++ implementation without dependencies
- Apple silicon first-class citizen - optimized via ARM NEON and Accelerate framework
- AVX2 support for x86 architectures
- Apple silicon first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
- AVX, AVX2 and AVX512 support for x86 architectures
- Mixed F16 / F32 precision
- 4-bit quantization support
- Runs on the CPU
- 4-bit, 5-bit and 8-bit integer quantization support
- Supports OpenBLAS/Apple BLAS/ARM Performance Lib/ATLAS/BLIS/Intel MKL/NVHPC/ACML/SCSL/SGIMATH and [more](https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors) in BLAS
- cuBLAS and CLBlast support
This was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022) - I have no idea if it works correctly.
Please do not make conclusions about the models based on the results from this implementation.
For all I know, it can be completely wrong. This project is for educational purposes.
New features will probably be added mostly through community contributions.
The original implementation of `llama.cpp` was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022).
Since then, the project has improved significantly thanks to many contributions. This project is for educational purposes and serves
as the main playground for developing new features for the [ggml](https://github.com/ggerganov/ggml) library.
Supported platforms:
**Supported platforms:**
- [X] Mac OS
- [X] Linux
- [X] Windows (via CMake)
- [X] Docker
**Supported models:**
- [X] LLaMA 🦙
- [X] [Alpaca](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca)
- [X] [GPT4All](https://github.com/ggerganov/llama.cpp#using-gpt4all)
- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca)
- [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
- [X] [Vicuna](https://github.com/ggerganov/llama.cpp/discussions/643#discussioncomment-5533894)
- [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
- [X] [OpenBuddy 🐶 (Multilingual)](https://github.com/OpenBuddy/OpenBuddy)
- [X] [Pygmalion 7B / Metharme 7B](#using-pygmalion-7b--metharme-7b)
- [X] [WizardLM](https://github.com/nlpxucan/WizardLM)
- [X] [Baichuan-7B](https://huggingface.co/baichuan-inc/baichuan-7B)
**Bindings:**
- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
- Node.js: [hlhr202/llama-node](https://github.com/hlhr202/llama-node)
- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
- C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
- Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
**UI:**
- [nat/openplayground](https://github.com/nat/openplayground)
- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui)
---
Here is a typical run using LLaMA-7B:
@@ -131,69 +196,333 @@ https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8
## Usage
Here are the step for the LLaMA-7B model:
Here are the steps for the LLaMA-7B model.
### Get the Code
```bash
# build this repo
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
make
```
### Build
In order to build llama.cpp you have three different options.
- Using `make`:
- On Linux or MacOS:
```bash
make
```
- On Windows:
1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
2. Extract `w64devkit` on your pc.
3. Run `w64devkit.exe`.
4. Use the `cd` command to reach the `llama.cpp` folder.
5. From here you can run:
```bash
make
```
- Using `CMake`:
```bash
mkdir build
cd build
cmake ..
cmake --build . --config Release
```
- Using `Zig`:
```bash
zig build -Drelease-fast
```
### Metal Build
Using Metal allows the computation to be executed on the GPU for Apple devices:
- Using `make`:
```bash
LLAMA_METAL=1 make
```
- Using `CMake`:
```bash
mkdir build-metal
cd build-metal
cmake -DLLAMA_METAL=ON ..
cmake --build . --config Release
```
When built with Metal support, you can enable GPU inference with the `--gpu-layers|-ngl` command-line argument.
Any value larger than 0 will offload the computation to the GPU. For example:
```bash
./main -m ./models/7B/ggml-model-q4_0.bin -n 128 -ngl 1
```
### BLAS Build
Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). BLAS doesn't affect the normal generation performance. There are currently three different implementations of it:
- #### Accelerate Framework:
This is only available on Mac PCs and it's enabled by default. You can just build using the normal instructions.
- #### OpenBLAS:
This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS installed on your machine.
- Using `make`:
- On Linux:
```bash
make LLAMA_OPENBLAS=1
```
- On Windows:
1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
2. Download the latest version of [OpenBLAS for Windows](https://github.com/xianyi/OpenBLAS/releases).
3. Extract `w64devkit` on your pc.
4. From the OpenBLAS zip that you just downloaded copy `libopenblas.a`, located inside the `lib` folder, inside `w64devkit\x86_64-w64-mingw32\lib`.
5. From the same OpenBLAS zip copy the content of the `include` folder inside `w64devkit\x86_64-w64-mingw32\include`.
6. Run `w64devkit.exe`.
7. Use the `cd` command to reach the `llama.cpp` folder.
8. From here you can run:
```bash
make LLAMA_OPENBLAS=1
```
- Using `CMake` on Linux:
```bash
mkdir build
cd build
cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
cmake --build . --config Release
```
- #### BLIS
Check [BLIS.md](docs/BLIS.md) for more information.
- #### Intel MKL
By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. You may also specify it by:
```bash
mkdir build
cd build
cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
cmake --build . --config Release
```
- #### cuBLAS
This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
- Using `make`:
```bash
make LLAMA_CUBLAS=1
```
- Using `CMake`:
```bash
mkdir build
cd build
cmake .. -DLLAMA_CUBLAS=ON
cmake --build . --config Release
```
The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
| Option | Legal values | Default | Description |
|-------------------------|------------------------|---------|-------------|
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
| LLAMA_CUDA_DMMV_Y | Positive integer | 1 | Block size in y direction for the CUDA dequantization + mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
| LLAMA_CUDA_DMMV_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels. Can improve performance on relatively recent GPUs. |
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
- #### CLBlast
OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU.
You will need the [OpenCL SDK](https://github.com/KhronosGroup/OpenCL-SDK).
- For Ubuntu or Debian, the packages `opencl-headers`, `ocl-icd` may be needed.
- <details>
<summary>Installing the OpenCL SDK from source</summary>
```sh
git clone --recurse-submodules https://github.com/KhronosGroup/OpenCL-SDK.git
mkdir OpenCL-SDK/build
cd OpenCL-SDK/build
cmake .. -DBUILD_DOCS=OFF \
-DBUILD_EXAMPLES=OFF \
-DBUILD_TESTING=OFF \
-DOPENCL_SDK_BUILD_SAMPLES=OFF \
-DOPENCL_SDK_TEST_SAMPLES=OFF
cmake --build . --config Release
cmake --install . --prefix /some/path
```
</details>
Installing CLBlast: it may be found in your operating system's packages.
- <details>
<summary>If not, then installing from source:</summary>
```sh
git clone https://github.com/CNugteren/CLBlast.git
mkdir CLBlast/build
cd CLBlast/build
cmake .. -DBUILD_SHARED_LIBS=OFF -DTUNERS=OFF
cmake --build . --config Release
cmake --install . --prefix /some/path
```
Where `/some/path` is where the built library will be installed (default is `/usr/local`).
</details>
Building:
- Build with make:
```sh
make LLAMA_CLBLAST=1
```
- CMake:
```sh
mkdir build
cd build
cmake .. -DLLAMA_CLBLAST=ON -DCLBlast_dir=/some/path
cmake --build . --config Release
```
Running:
The CLBlast build supports `--gpu-layers|-ngl` like the CUDA version does.
To select the correct platform (driver) and device (GPU), you can use the environment variables `GGML_OPENCL_PLATFORM` and `GGML_OPENCL_DEVICE`.
The selection can be a number (starting from 0) or a text string to search:
```sh
GGML_OPENCL_PLATFORM=1 ./main ...
GGML_OPENCL_DEVICE=2 ./main ...
GGML_OPENCL_PLATFORM=Intel ./main ...
GGML_OPENCL_PLATFORM=AMD GGML_OPENCL_DEVICE=1 ./main ...
```
The default behavior is to find the first GPU device, but when it is an integrated GPU on a laptop, for instance, the selectors are useful.
Using the variables it is possible to select a CPU-based driver as well, if so desired.
You can get a list of platforms and devices from the `clinfo -l` command, etc.
### Prepare Data & Run
```bash
# obtain the original LLaMA model weights and place them in ./models
ls ./models
65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model
# install Python dependencies
python3 -m pip install torch numpy sentencepiece
python3 -m pip install -r requirements.txt
# convert the 7B model to ggml FP16 format
python3 convert-pth-to-ggml.py models/7B/ 1
python3 convert.py models/7B/
# quantize the model to 4-bits
python3 quantize.py 7B
# quantize the model to 4-bits (using q4_0 method)
./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin q4_0
# run the inference
./main -m ./models/7B/ggml-model-q4_0.bin -n 128
```
Currently, it's best to use Python 3.9 or Python 3.10, as `sentencepiece` has not yet published a wheel for Python 3.11.
When running the larger models, make sure you have enough disk space to store all the intermediate files.
### Memory/Disk Requirements
As the models are currently fully loaded into memory, you will need adequate disk space to save them
and sufficient RAM to load them. At the moment, memory and disk requirements are the same.
As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same.
| model | original size | quantized size (4-bit) |
|-------|---------------|------------------------|
| 7B | 13 GB | 3.9 GB |
| 13B | 24 GB | 7.8 GB |
| 30B | 60 GB | 19.5 GB |
| 65B | 120 GB | 38.5 GB |
| Model | Original size | Quantized size (4-bit) |
|------:|--------------:|-----------------------:|
| 7B | 13 GB | 3.9 GB |
| 13B | 24 GB | 7.8 GB |
| 30B | 60 GB | 19.5 GB |
| 65B | 120 GB | 38.5 GB |
### Quantization
Several quantization methods are supported. They differ in the resulting model disk size and inference speed.
| Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 |
|------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:|
| 7B | perplexity | 5.9066 | 6.1565 | 6.0912 | 5.9862 | 5.9481 | 5.9070 |
| 7B | file size | 13.0G | 3.5G | 3.9G | 4.3G | 4.7G | 6.7G |
| 7B | ms/tok @ 4th | 127 | 55 | 54 | 76 | 83 | 72 |
| 7B | ms/tok @ 8th | 122 | 43 | 45 | 52 | 56 | 67 |
| 7B | bits/weight | 16.0 | 4.5 | 5.0 | 5.5 | 6.0 | 8.5 |
| 13B | perplexity | 5.2543 | 5.3860 | 5.3608 | 5.2856 | 5.2706 | 5.2548 |
| 13B | file size | 25.0G | 6.8G | 7.6G | 8.3G | 9.1G | 13G |
| 13B | ms/tok @ 4th | - | 103 | 105 | 148 | 160 | 131 |
| 13B | ms/tok @ 8th | - | 73 | 82 | 98 | 105 | 128 |
| 13B | bits/weight | 16.0 | 4.5 | 5.0 | 5.5 | 6.0 | 8.5 |
### Perplexity (measuring model quality)
You can use the `perplexity` example to measure perplexity over a given prompt (lower perplexity is better).
For more information, see [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity).
The perplexity measurements in table above are done against the `wikitext2` test dataset (https://paperswithcode.com/dataset/wikitext-2), with context length of 512.
The time per token is measured on a MacBook M1 Pro 32GB RAM using 4 and 8 threads.
### Interactive mode
If you want a more ChatGPT-like experience, you can run in interactive mode by passing `-i` as a parameter.
In this mode, you can always interrupt generation by pressing Ctrl+C and enter one or more lines of text which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt which makes LLaMa emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`.
In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes LLaMa emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`.
Here is an example few-shot interaction, invoked with the command
Here is an example of a few-shot interaction, invoked with the command
```bash
# default arguments using 7B model
# default arguments using a 7B model
./examples/chat.sh
# advanced chat with 13B model
# advanced chat with a 13B model
./examples/chat-13B.sh
# custom arguments using 13B model
# custom arguments using a 13B model
./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
```
Note the use of `--color` to distinguish between user input and generated text.
Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `main` example program.
![image](https://user-images.githubusercontent.com/1991296/224575029-2af3c7dc-5a65-4f64-a6bb-517a532aea38.png)
### Persistent Interaction
The prompt, user inputs, and model generations can be saved and resumed across calls to `./main` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file.
```bash
# Start a new chat
PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/default ./examples/chat-persistent.sh
# Resume that chat
PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/default ./examples/chat-persistent.sh
# Start a different chat with the same prompt/model
PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/another ./examples/chat-persistent.sh
# Different prompt cache for different prompt/model
PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \
CHAT_SAVE_DIR=./chat/bob ./examples/chat-persistent.sh
```
### Instruction mode with Alpaca
1. First, download the `ggml` Alpaca model into the `./models` folder
@@ -219,54 +548,76 @@ There 26 letters in the English Alphabet
The majority (54%) are using public transit. This includes buses, trams and metros with over 100 lines throughout the city which make it very accessible for tourists to navigate around town as well as locals who commute by tram or metro on a daily basis
> List 5 words that start with "ca".
cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
>
>
```
### Obtaining and verifying the Facebook LLaMA original model and Stanford Alpaca model data
### Using [OpenLLaMA](https://github.com/openlm-research/open_llama)
- **Under no circumstances share IPFS, magnet links, or any other links to model downloads anywhere in this respository, including in issues, discussions or pull requests. They will be immediately deleted.**
- The LLaMA models are officially distributed by Facebook and will **never** be provided through this repository.
OpenLLaMA is an openly licensed reproduction of Meta's original LLaMA model. It uses the same architecture and is a drop-in replacement for the original LLaMA weights.
- Download the [3B](https://huggingface.co/openlm-research/open_llama_3b), [7B](https://huggingface.co/openlm-research/open_llama_7b), or [13B](https://huggingface.co/openlm-research/open_llama_13b) model from Hugging Face.
- Convert the model to ggml FP16 format using `python convert.py <path to OpenLLaMA directory>`
### Using [GPT4All](https://github.com/nomic-ai/gpt4all)
- Obtain the `tokenizer.model` file from LLaMA model and put it to `models`
- Obtain the `added_tokens.json` file from Alpaca model and put it to `models`
- Obtain the `gpt4all-lora-quantized.bin` file from GPT4All model and put it to `models/gpt4all-7B`
- It is distributed in the old `ggml` format which is now obsoleted
- You have to convert it to the new format using `convert.py`:
```bash
python3 convert.py models/gpt4all-7B/gpt4all-lora-quantized.bin
```
- You can now use the newly generated `models/gpt4all-7B/ggml-model-q4_0.bin` model in exactly the same way as all other models
- The newer GPT4All-J model is not yet supported!
### Using Pygmalion 7B & Metharme 7B
- Obtain the [LLaMA weights](#obtaining-the-facebook-llama-original-model-and-stanford-alpaca-model-data)
- Obtain the [Pygmalion 7B](https://huggingface.co/PygmalionAI/pygmalion-7b/) or [Metharme 7B](https://huggingface.co/PygmalionAI/metharme-7b) XOR encoded weights
- Convert the LLaMA model with [the latest HF convert script](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py)
- Merge the XOR files with the converted LLaMA weights by running the [xor_codec](https://huggingface.co/PygmalionAI/pygmalion-7b/blob/main/xor_codec.py) script
- Convert to `ggml` format using the `convert.py` script in this repo:
```bash
python3 convert.py pygmalion-7b/ --outtype q4_1
```
> The Pygmalion 7B & Metharme 7B weights are saved in [bfloat16](https://en.wikipedia.org/wiki/Bfloat16_floating-point_format) precision. If you wish to convert to `ggml` without quantizating, please specify the `--outtype` as `f32` instead of `f16`.
### Obtaining the Facebook LLaMA original model and Stanford Alpaca model data
- **Under no circumstances should IPFS, magnet links, or any other links to model downloads be shared anywhere in this repository, including in issues, discussions, or pull requests. They will be immediately deleted.**
- The LLaMA models are officially distributed by Facebook and will **never** be provided through this repository.
- Refer to [Facebook's LLaMA repository](https://github.com/facebookresearch/llama/pull/73/files) if you need to request access to the model data.
- Please verify the sha256 checksums of all downloaded model files to confirm that you have the correct model data files before creating an issue relating to your model files.
- The following command will verify if you have all possible latest files in your self-installed `./models` subdirectory:
`sha256sum --ignore-missing -c SHA256SUMS` on Linux
### Verifying the model files
or
Please verify the [sha256 checksums](SHA256SUMS) of all downloaded model files to confirm that you have the correct model data files before creating an issue relating to your model files.
- The following python script will verify if you have all possible latest files in your self-installed `./models` subdirectory:
`shasum -a 256 --ignore-missing -c SHA256SUMS` on macOS
```bash
# run the verification script
python3 .\scripts\verify-checksum-models.py
```
- If your issue is with model generation quality then please at least scan the following links and papers to understand the limitations of LLaMA models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT:
- LLaMA:
- On linux or macOS it is also possible to run the following commands to verify if you have all possible latest files in your self-installed `./models` subdirectory:
- On Linux: `sha256sum --ignore-missing -c SHA256SUMS`
- on macOS: `shasum -a 256 --ignore-missing -c SHA256SUMS`
### Seminal papers and background on the models
If your issue is with model generation quality, then please at least scan the following links and papers to understand the limitations of LLaMA models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT:
- LLaMA:
- [Introducing LLaMA: A foundational, 65-billion-parameter large language model](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/)
- [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971)
- GPT-3
- GPT-3
- [Language Models are Few-Shot Learners](https://arxiv.org/abs/2005.14165)
- GPT-3.5 / InstructGPT / ChatGPT:
- GPT-3.5 / InstructGPT / ChatGPT:
- [Aligning language models to follow instructions](https://openai.com/research/instruction-following)
- [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155)
### Perplexity (Measuring model quality)
You can use the `perplexity` example to measure perplexity over the given prompt. For more background,
see https://huggingface.co/docs/transformers/perplexity. However, in general, lower perplexity is better for LLMs.
#### Latest measurements
The latest perplexity scores for the various model sizes and quantizations are being tracked in [discussion #406](https://github.com/ggerganov/llama.cpp/discussions/406). `llama.cpp` is measuring very well
compared to the baseline implementations. Quantization has a small negative impact to quality, but, as you can see, running
13B at q4_0 beats the 7B f16 model by a significant amount.
All measurements are done against wikitext2 test dataset (https://paperswithcode.com/dataset/wikitext-2), with default options (512 length context).
Note that the changing the context length will have a significant impact on perplexity (longer context = better perplexity).
```
Perplexity - model options
5.5985 - 13B, q4_0
5.9565 - 7B, f16
6.3001 - 7B, q4_1
6.5949 - 7B, q4_0
6.5995 - 7B, q4_0, --memory_f16
```
#### How to run
@@ -282,8 +633,14 @@ And after 4.45 hours, you will have the final perplexity.
### Android
You can easily run `llama.cpp` on Android device with [termux](https://play.google.com/store/apps/details?id=com.termux).
First, obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake:
#### Building the Project using Android NDK
You can easily run `llama.cpp` on Android device with [termux](https://termux.dev/).
First, install the essential packages for termux:
```
pkg install clang wget git cmake
```
Second, obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake:
```
$ mkdir build-android
$ cd build-android
@@ -291,16 +648,59 @@ $ export NDK=<your_ndk_directory>
$ cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
$ make
```
Install [termux](https://play.google.com/store/apps/details?id=com.termux) on your device and run `termux-setup-storage` to get access to your SD card.
Install [termux](https://termux.dev/) on your device and run `termux-setup-storage` to get access to your SD card.
Finally, copy the `llama` binary and the model files to your device storage. Here is a demo of an interactive session running on Pixel 5 phone:
https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4
#### Building the Project using Termux (F-Droid)
Termux from F-Droid offers an alternative route to execute the project on an Android device. This method empowers you to construct the project right from within the terminal, negating the requirement for a rooted device or SD Card.
Outlined below are the directives for installing the project using OpenBLAS and CLBlast. This combination is specifically designed to deliver peak performance on recent devices that feature a GPU.
If you opt to utilize OpenBLAS, you'll need to install the corresponding package.
```
apt install libopenblas
```
Subsequently, if you decide to incorporate CLBlast, you'll first need to install the requisite OpenCL packages:
```
apt install ocl-icd opencl-headers opencl-clhpp clinfo
```
In order to compile CLBlast, you'll need to first clone the respective Git repository, which can be found at this URL: https://github.com/CNugteren/CLBlast. Alongside this, clone this repository into your home directory. Once this is done, navigate to the CLBlast folder and execute the commands detailed below:
```
cmake .
make
cp libclblast.so* $PREFIX/lib
cp ./include/clblast.h ../llama.cpp
```
Following the previous steps, navigate to the LlamaCpp directory. To compile it with OpenBLAS and CLBlast, execute the command provided below:
```
cp /data/data/com.termux/files/usr/include/openblas/cblas.h .
cp /data/data/com.termux/files/usr/include/openblas/openblas_config.h .
make LLAMA_CLBLAST=1 //(sometimes you need to run this command twice)
```
Upon completion of the aforementioned steps, you will have successfully compiled the project. To run it using CLBlast, a slight adjustment is required: a command must be issued to direct the operations towards your device's physical GPU, rather than the virtual one. The necessary command is detailed below:
```
GGML_OPENCL_PLATFORM=0
GGML_OPENCL_DEVICE=0
export LD_LIBRARY_PATH=/vendor/lib64:$LD_LIBRARY_PATH
```
(Note: some Android devices, like the Zenfone 8, need the following command instead - "export LD_LIBRARY_PATH=/system/vendor/lib64:$LD_LIBRARY_PATH". Source: https://www.reddit.com/r/termux/comments/kc3ynp/opencl_working_in_termux_more_in_comments/ )
For easy and swift re-execution, consider documenting this final part in a .sh script file. This will enable you to rerun the process with minimal hassle.
Place your desired model into the `/llama.cpp/models/` directory and execute the `./main (...)` script.
### Docker
#### Prerequisites
* Docker must be installed and running on your system.
* Create a folder to store big models & intermediate files (in ex. im using /llama/models)
* Create a folder to store big models & intermediate files (ex. /llama/models)
#### Images
We have two Docker images available for this project:
@@ -312,20 +712,22 @@ We have two Docker images available for this project:
The easiest way to download the models, convert them to ggml and optimize them is with the --all-in-one command which includes the full docker image.
```bash
docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B
```
On complete, you are ready to play!
Replace `/path/to/models` below with the actual path where you downloaded the models.
```bash
docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B
```
or with light image:
On completion, you are ready to play!
```bash
docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
```
or with a light image:
```bash
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
```
### Contributing
@@ -343,6 +745,10 @@ docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models
- Always consider cross-compatibility with other operating systems and architectures
- Avoid fancy looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple
- There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.). Vertical alignment makes things more readable and easier to batch edit
- Clean-up any trailing whitespaces, use 4 spaces indentation, brackets on same line, `void * ptr`, `int & a`
- Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
- See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
### Docs
- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
- [Performance troubleshooting](./docs/token_generation_performance_tips.md)

View File

@@ -1,12 +1,27 @@
700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d models/7B/consolidated.00.pth
666a4bb533b303bdaf89e1b6a3b6f93535d868de31d903afdc20983dc526c847 models/7B/ggml-model-f16.bin
ec2f2d1f0dfb73b72a4cbac7fa121abbe04c37ab327125a38248f930c0f09ddf models/7B/ggml-model-q4_0.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q4_1.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q5_0.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q5_1.bin
7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265 models/7B/params.json
745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08 models/13B/consolidated.00.pth
d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085 models/13B/consolidated.01.pth
2b206e9b21fb1076f11cafc624e2af97c9e48ea09312a0962153acc20d45f808 models/13B/ggml-model-f16.bin
fad169e6f0f575402cf75945961cb4a8ecd824ba4da6be2af831f320c4348fa5 models/13B/ggml-model-q4_0.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q4_1.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q5_0.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q5_1.bin
4ab77bec4d4405ccb66a97b282574c89a94417e3c32e5f68f37e2876fc21322f models/13B/params.json
e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067 models/30B/consolidated.00.pth
4e077b7136c7ae2302e954860cf64930458d3076fcde9443f4d0e939e95903ff models/30B/consolidated.01.pth
24a87f01028cbd3a12de551dcedb712346c0b5cbdeff1454e0ddf2df9b675378 models/30B/consolidated.02.pth
1adfcef71420886119544949767f6a56cb6339b4d5fcde755d80fe68b49de93b models/30B/consolidated.03.pth
7e1b524061a9f4b27c22a12d6d2a5bf13b8ebbea73e99f218809351ed9cf7d37 models/30B/ggml-model-f16.bin
d2a441403944819492ec8c2002cc36fa38468149bfb4b7b4c52afc7bd9a7166d models/30B/ggml-model-q4_0.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q4_1.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q5_0.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q5_1.bin
2c07118ea98d69dbe7810d88520e30288fa994751b337f8fca02b171955f44cb models/30B/params.json
135c563f6b3938114458183afb01adc9a63bef3d8ff7cccc3977e5d3664ecafe models/65B/consolidated.00.pth
9a600b37b19d38c7e43809485f70d17d1dc12206c07efa83bc72bb498a568bde models/65B/consolidated.01.pth
@@ -16,5 +31,10 @@ e7babf7c5606f165a3756f527cb0fedc4f83e67ef1290391e52fb1cce5f26770 models/65B/con
a287c0dfe49081626567c7fe87f74cce5831f58e459b427b5e05567641f47b78 models/65B/consolidated.05.pth
72b4eba67a1a3b18cb67a85b70f8f1640caae9b40033ea943fb166bd80a7b36b models/65B/consolidated.06.pth
d27f5b0677d7ff129ceacd73fd461c4d06910ad7787cf217b249948c3f3bc638 models/65B/consolidated.07.pth
60758f2384d74e423dffddfd020ffed9d3bb186ebc54506f9c4a787d0f5367b0 models/65B/ggml-model-f16.bin
cde053439fa4910ae454407e2717cc46cc2c2b4995c00c93297a2b52e790fa92 models/65B/ggml-model-q4_0.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q4_1.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q5_0.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q5_1.bin
999ed1659b469ccc2a941714c0a9656fa571d17c9f7c8c7589817ca90edef51b models/65B/params.json
9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 models/tokenizer.model

58
build.zig Normal file
View File

@@ -0,0 +1,58 @@
const std = @import("std");
// Zig Version: 0.11.0-dev.3379+629f0d23b
pub fn build(b: *std.build.Builder) void {
const target = b.standardTargetOptions(.{});
const optimize = b.standardOptimizeOption(.{});
const lib = b.addStaticLibrary(.{
.name = "llama",
.target = target,
.optimize = optimize,
});
lib.linkLibC();
lib.linkLibCpp();
lib.addIncludePath(".");
lib.addIncludePath("./examples");
lib.addCSourceFiles(&.{
"ggml.c",
}, &.{"-std=c11"});
lib.addCSourceFiles(&.{
"llama.cpp",
}, &.{"-std=c++11"});
b.installArtifact(lib);
const examples = .{
"main",
"baby-llama",
"embedding",
// "metal",
"perplexity",
"quantize",
"quantize-stats",
"save-load-state",
// "server",
"simple",
"train-text-from-scratch",
};
inline for (examples) |example_name| {
const exe = b.addExecutable(.{
.name = example_name,
.target = target,
.optimize = optimize,
});
exe.addIncludePath(".");
exe.addIncludePath("./examples");
exe.addCSourceFiles(&.{
std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{example_name, example_name}),
"examples/common.cpp",
}, &.{"-std=c++11"});
exe.linkLibrary(lib);
b.installArtifact(exe);
const run_cmd = b.addRunArtifact(exe);
run_cmd.step.dependOn(b.getInstallStep());
if (b.args) |args| run_cmd.addArgs(args);
const run_step = b.step("run_" ++ example_name, "Run the app");
run_step.dependOn(&run_cmd.step);
}
}

View File

@@ -1,167 +0,0 @@
# Convert a GPTQ quantized LLaMA model to a ggml compatible file
# Based on: https://github.com/qwopqwop200/GPTQ-for-LLaMa
#
import os
import re
import sys
import json
import struct
import numpy as np
import torch
from sentencepiece import SentencePieceProcessor
if len(sys.argv) != 4:
print("Usage: convert-gptq-to-ggml.py llamaXXb-4bit.pt tokenizer.model out.bin\n")
sys.exit(1)
fname_model = sys.argv[1]
fname_tokenizer = sys.argv[2]
dir_out = sys.argv[3]
model = torch.load(fname_model, map_location="cpu")
n_vocab, n_embd = model['model.embed_tokens.weight'].shape
n_layer = 1 + max(int(m.group(1)) for name in model
if (m := re.match(r'model\.layers\.([0-9]+)', name)))
# hardcoded:
n_mult = 256
n_head = {32: 32, 40: 40, 60: 52, 80: 64}[n_layer]
tokenizer = SentencePieceProcessor(fname_tokenizer)
assert tokenizer.vocab_size() == n_vocab
fname_out = sys.argv[3]
fout = open(fname_out, "wb")
fout.write(struct.pack("i", 0x67676d66)) # magic: ggmf in hex
fout.write(struct.pack("i", 1)) # file version
fout.write(struct.pack("i", n_vocab))
fout.write(struct.pack("i", n_embd))
fout.write(struct.pack("i", n_mult))
fout.write(struct.pack("i", n_head))
fout.write(struct.pack("i", n_layer))
fout.write(struct.pack("i", n_embd // n_head)) # rot (obsolete)
fout.write(struct.pack("i", 4))
# This loop unchanged from convert-pth-to-ggml.py:
for i in range(tokenizer.vocab_size()):
if tokenizer.is_unknown(i):
text = " \u2047 ".encode("utf-8")
elif tokenizer.is_control(i):
text = b""
elif tokenizer.is_byte(i):
piece = tokenizer.id_to_piece(i)
if len(piece) != 6:
print(f"Invalid token: {piece}")
sys.exit(1)
byte_value = int(piece[3:-1], 16)
text = struct.pack("B", byte_value)
else:
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
fout.write(struct.pack("i", len(text)))
fout.write(text)
fout.write(struct.pack("f", tokenizer.get_score(i)))
def write_header(shape, dst_name, ftype_cur):
sname = dst_name.encode('utf-8')
fout.write(struct.pack("iii", len(shape), len(sname), ftype_cur))
fout.write(struct.pack("i" * len(shape), *shape[::-1]))
fout.write(sname)
def convert_non_q4(src_name, dst_name):
v = model[src_name]
shape = v.shape
print("Processing non-Q4 variable: " + src_name + " with shape: ", shape, " and type: ", v.dtype)
if len(shape) == 1:
print(" Converting to float32")
v = v.to(torch.float32)
ftype_cur = {torch.float16: 1, torch.float32: 0}[v.dtype]
# header
write_header(shape, dst_name, ftype_cur)
# data
v.numpy().tofile(fout)
def convert_q4(src_name, dst_name, permute=False):
zeros = model[f"{src_name}.zeros"].numpy()
scales = model[f"{src_name}.scales"].numpy()
bias = model[f"{src_name}.bias"].numpy()
qweight = model[f"{src_name}.qweight"].numpy().T # transpose
# Q4_1 does not support bias; good thing the bias is always all zeros.
assert not np.any(bias)
# Each int32 item is actually 8 int4 items packed together, and it's transposed.
shape = (qweight.shape[0], qweight.shape[1] * 8)
print("Processing Q4 variable: " + src_name + " with shape: ", shape)
# The output format has the int4 weights in groups of 32 rather than 8.
# It looks like this:
# For each row:
# For each group of 32 columns:
# - addend (float32, 4 bytes)
# - scale (float32, 4 bytes)
# - weights (int4 * 32, 16 bytes)
# Note that in the input, the scales and addends are shared between all
# the columns in a row, so we end up wasting quite a bit of memory with
# repeated scales and addends.
addends = -zeros # flip sign
# Since the output format is mixed between integers and floats, we have
# to hackily view the floats as int32s just so numpy will let us
# concatenate them.
addends_view = addends.view(dtype=np.int32)
scales_view = scales.view(dtype=np.int32)
# Split into groups of 4 columns (i.e. 32 columns of quantized data):
grouped = qweight.reshape([qweight.shape[0], qweight.shape[1] // 4, 4])
# Repeat addends and scales:
addends_rep = np.atleast_3d(addends_view).repeat(grouped.shape[1], axis=1)
scales_rep = np.atleast_3d(scales_view).repeat(grouped.shape[1], axis=1)
blob = np.concatenate([scales_rep, addends_rep, grouped], axis=2, casting='no')
if permute:
# Permute some rows to undo the permutation done by convert_llama_weights_to_hf.py.
# This can be done after the above conversion because it doesn't affect column order/layout.
blob = (blob.reshape(n_head, 2, shape[0] // n_head // 2, *blob.shape[1:])
.swapaxes(1, 2)
.reshape(blob.shape))
# header
write_header(shape, dst_name, 3) # ftype = Q4_1
# data
blob.tofile(fout)
convert_non_q4("model.embed_tokens.weight", "tok_embeddings.weight")
convert_non_q4("model.norm.weight", "norm.weight")
convert_non_q4("lm_head.weight", "output.weight")
for i in range(n_layer):
convert_q4(f"model.layers.{i}.self_attn.q_proj", f"layers.{i}.attention.wq.weight", permute=True)
convert_q4(f"model.layers.{i}.self_attn.k_proj", f"layers.{i}.attention.wk.weight", permute=True)
convert_q4(f"model.layers.{i}.self_attn.v_proj", f"layers.{i}.attention.wv.weight")
convert_q4(f"model.layers.{i}.self_attn.o_proj", f"layers.{i}.attention.wo.weight")
convert_q4(f"model.layers.{i}.mlp.gate_proj", f"layers.{i}.feed_forward.w1.weight")
convert_q4(f"model.layers.{i}.mlp.down_proj", f"layers.{i}.feed_forward.w2.weight")
convert_q4(f"model.layers.{i}.mlp.up_proj", f"layers.{i}.feed_forward.w3.weight")
convert_non_q4(f"model.layers.{i}.input_layernorm.weight", f"layers.{i}.attention_norm.weight")
convert_non_q4(f"model.layers.{i}.post_attention_layernorm.weight", f"layers.{i}.ffn_norm.weight")
fout.close()
print("Done. Output file: " + fname_out)
print("")

133
convert-lora-to-ggml.py Normal file
View File

@@ -0,0 +1,133 @@
import json
import os
import re
import struct
import sys
from typing import Any, Dict, Sequence, TextIO
import torch
from convert import DATA_TYPE_TO_FTYPE, NUMPY_TYPE_TO_DATA_TYPE, DataType
HF_SUBLAYER_TO_GGML = {
"self_attn.q_proj": "attention.wq",
"self_attn.k_proj": "attention.wk",
"self_attn.v_proj": "attention.wv",
"self_attn.o_proj": "attention.wo",
"mlp.gate_proj": "feed_forward.w1",
"mlp.down_proj": "feed_forward.w2",
"mlp.up_proj": "feed_forward.w3",
"input_layernorm": "attention_norm",
"post_attention_layernorm": "ffn_norm",
# "norm": "norm",
# "embed_tokens": "tok_embeddings",
# "lm_head": "output",
}
def translate_tensor_name(t: str) -> str:
match = re.match(r".*layers\.(\d+)\.(\w+\.\w+)\.lora_(A|B)\.weight", t)
if match:
nn = match.group(1)
sub_layer = match.group(2)
lora_type = match.group(3)
sub_layer_renamed = HF_SUBLAYER_TO_GGML.get(sub_layer)
if sub_layer_renamed is None:
print(f"Error: unrecognized sub-layer {sub_layer} in tensor {t}")
sys.exit(1)
output_string = (
f"layers.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}"
)
return output_string
else:
print(f"Error: unrecognized tensor {t}")
sys.exit(1)
def write_file_header(fout: TextIO, params: Dict[str, Any]) -> None:
fout.write(b"ggla"[::-1]) # magic (ggml lora)
fout.write(struct.pack("i", 1)) # file version
fout.write(struct.pack("i", params["r"]))
# https://opendelta.readthedocs.io/en/latest/modules/deltas.html says that `lora_alpha` is an int
# but some models ship a float value instead
# let's convert to int, but fail if lossless conversion is not possible
assert int(params["lora_alpha"]) == params["lora_alpha"], "cannot convert float to int losslessly"
fout.write(struct.pack("i", int(params["lora_alpha"])))
def write_tensor_header(
self, name: str, shape: Sequence[int], data_type: DataType
) -> None:
sname = name.encode("utf-8")
fout.write(
struct.pack(
"iii",
len(shape),
len(sname),
DATA_TYPE_TO_FTYPE[NUMPY_TYPE_TO_DATA_TYPE[data_type]],
)
)
fout.write(struct.pack("i" * len(shape), *shape[::-1]))
fout.write(sname)
fout.seek((fout.tell() + 31) & -32)
if len(sys.argv) != 2:
print(f"Usage: python {sys.argv[0]} <path>")
print(
"Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'"
)
sys.exit(1)
input_json = os.path.join(sys.argv[1], "adapter_config.json")
input_model = os.path.join(sys.argv[1], "adapter_model.bin")
output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")
model = torch.load(input_model, map_location="cpu")
with open(input_json, "r") as f:
params = json.load(f)
if params["peft_type"] != "LORA":
print(f"Error: unsupported adapter type {params['peft_type']}, expected LORA")
sys.exit(1)
if params["fan_in_fan_out"] is True:
print("Error: param fan_in_fan_out is not supported")
sys.exit(1)
if params["bias"] is not None and params["bias"] != "none":
print("Error: param bias is not supported")
sys.exit(1)
# TODO: these seem to be layers that have been trained but without lora.
# doesn't seem widely used but eventually should be supported
if params["modules_to_save"] is not None and len(params["modules_to_save"]) > 0:
print("Error: param modules_to_save is not supported")
sys.exit(1)
with open(output_path, "wb") as fout:
fout.truncate()
write_file_header(fout, params)
for k, v in model.items():
if k.endswith(".default.weight"):
k = k.replace(".default.weight", ".weight")
if k in ["llama_proj.weight", "llama_proj.bias"]:
continue
if k.endswith("lora_A.weight"):
if v.dtype != torch.float16 and v.dtype != torch.float32:
v = v.float()
v = v.T
else:
v = v.float()
t = v.detach().numpy()
tname = translate_tensor_name(k)
print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
write_tensor_header(fout, tname, t.shape, t.dtype)
t.tofile(fout)
print(f"Converted {input_json} and {input_model} to {output_path}")

View File

@@ -1,179 +1,13 @@
# Convert a LLaMA model checkpoint to a ggml compatible file
#
# Load the model using Torch
# Iterate over all variables and write them to a binary file.
#
# For each variable, write the following:
# - Number of dimensions (int)
# - Name length (int)
# - Dimensions (int[n_dims])
# - Name (char[name_length])
# - Data (float[n_dims])
#
# At the start of the ggml file we write the model parameters
# and vocabulary.
#
# Compatibility stub
import argparse
import os
import sys
import json
import struct
import numpy as np
import torch
from sentencepiece import SentencePieceProcessor
import convert
def parse_args():
parser = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file')
parser.add_argument('dir_model', help='directory containing the model checkpoint')
parser.add_argument('ftype', help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1)
parser.add_argument('vocab_only', help='only write vocab to file', type=int, default=0, nargs='?')
return parser.parse_args()
def get_n_parts(dim):
mappings = {4096: 1, 5120: 2, 6656: 4, 8192: 8}
n_parts = mappings.get(dim)
if n_parts is None:
print(f"Invalid dim: {dim}")
sys.exit(1)
print(f"n_parts = {n_parts}\n")
return n_parts
def load_hparams_and_tokenizer(dir_model):
# `dir_model` is something like `models/7B` or `models/7B/`.
# "tokenizer.model" is expected under model's parent dir.
# When `dir_model` is a symlink, f"{dir_model}/../tokenizer.model" would not be found.
# Let's use the model's parent dir directly.
model_parent_dir = os.path.dirname(os.path.normpath(dir_model))
fname_hparams = f"{dir_model}/params.json"
fname_tokenizer = f"{model_parent_dir}/tokenizer.model"
with open(fname_hparams, "r") as f:
hparams = json.load(f)
print(hparams)
tokenizer = SentencePieceProcessor(fname_tokenizer)
hparams.update({"vocab_size": tokenizer.vocab_size()})
return hparams, tokenizer
def write_header(fout, hparams, ftype):
keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
values = [
0x67676d66, # magic: ggmf in hex
1, # file version
*[hparams[key] for key in keys],
hparams["dim"] // hparams["n_heads"], # rot (obsolete)
ftype
]
fout.write(struct.pack("i" * len(values), *values))
def write_tokens(fout, tokenizer):
for i in range(tokenizer.vocab_size()):
if tokenizer.is_unknown(i):
text = " \u2047 ".encode("utf-8")
elif tokenizer.is_control(i):
text = b""
elif tokenizer.is_byte(i):
piece = tokenizer.id_to_piece(i)
if len(piece) != 6:
print(f"Invalid token: {piece}")
sys.exit(1)
byte_value = int(piece[3:-1], 16)
text = struct.pack("B", byte_value)
else:
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
fout.write(struct.pack("i", len(text)))
fout.write(text)
fout.write(struct.pack("f", tokenizer.get_score(i)))
def process_and_write_variables(fout, model, ftype):
for name, datao in model.items():
if name.endswith("freqs"):
continue
shape = datao.shape
print(f"Processing variable: {name} with shape: {shape} and type: {datao.dtype}")
data = datao.numpy().squeeze()
n_dims = len(shape)
# default type is fp16
ftype_cur = 1
if ftype == 0 or n_dims == 1:
print(" Converting to float32")
data = data.astype(np.float32)
ftype_cur = 0
# header
sname = name.encode('utf-8')
fout.write(struct.pack("iii", len(data.shape), len(sname), ftype_cur))
for dim in reversed(data.shape):
fout.write(struct.pack("i", dim))
fout.write(sname)
# data output to file
data.tofile(fout)
def main():
args = parse_args()
dir_model = args.dir_model
ftype = args.ftype
ftype_str = ["f32", "f16"]
hparams, tokenizer = load_hparams_and_tokenizer(dir_model)
print(args)
# if only writing vocab to file
if args.vocab_only:
fname_model = f"{dir_model}/consolidated.00.pth"
fname_out = f"{dir_model}/ggml-vocab.bin"
print(f"Extracting only the vocab from '{fname_model}'\n")
with open(fname_out, "wb") as fout:
write_header(fout, hparams, ftype)
write_tokens(fout, tokenizer)
print(f"Done. Output file: {fname_out}\n")
return
n_parts = get_n_parts(hparams["dim"])
for p in range(n_parts):
print(f"Processing part {p+1} of {n_parts}\n")
fname_model = f"{dir_model}/consolidated.0{p}.pth"
fname_out = f"{dir_model}/ggml-model-{ftype_str[ftype]}.bin{'' if p == 0 else '.' + str(p)}"
model = torch.load(fname_model, map_location="cpu")
with open(fname_out, "wb") as fout:
write_header(fout, hparams, ftype)
write_tokens(fout, tokenizer)
process_and_write_variables(fout, model, ftype)
del model
print(f"Done. Output file: {fname_out}, (part {p})\n")
if __name__ == "__main__":
main()
parser = argparse.ArgumentParser(
description="""[DEPRECATED - use `convert.py` instead]
Convert a LLaMA model checkpoint to a ggml compatible file""")
parser.add_argument('dir_model', help='directory containing the model checkpoint')
parser.add_argument('ftype', help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1)
args = parser.parse_args()
convert.main(['--outtype', 'f16' if args.ftype == 1 else 'f32', '--', args.dir_model])

View File

@@ -1,100 +0,0 @@
#!/usr/bin/env python3
# Original by https://github.com/eiz
# https://github.com/ggerganov/llama.cpp/issues/324#issuecomment-1476227818
import argparse
import glob
import os
import struct
import sys
from sentencepiece import SentencePieceProcessor
HPARAMS = keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
def parse_args():
parser = argparse.ArgumentParser(description='Upgrade old ggml model files to the current format')
parser.add_argument('dir_model', help='directory containing ggml .bin files')
parser.add_argument('tokenizer_model', help='path to LLaMA tokenizer.model file')
return parser.parse_args()
def read_header(f_in):
struct_fmt = "i" * (3 + len(HPARAMS))
struct_size = struct.calcsize(struct_fmt)
buf = f_in.read(struct_size)
return struct.unpack(struct_fmt, buf)
def write_header(f_out, header):
(magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype) = header
if magic != 0x67676d6c:
raise Exception('Invalid file magic. Must be an old style ggml file.')
values = [
0x67676d66, # magic: ggml in hex
1, # file version
vocab_size,
dim,
multiple_of,
n_heads,
n_layers,
rot,
ftype
]
f_out.write(struct.pack("i" * len(values), *values))
def write_tokens(fout, tokenizer):
for i in range(tokenizer.vocab_size()):
if tokenizer.is_unknown(i):
text = " \u2047 ".encode("utf-8")
elif tokenizer.is_control(i):
text = b""
elif tokenizer.is_byte(i):
piece = tokenizer.id_to_piece(i)
if len(piece) != 6:
print(f"Invalid token: {piece}")
sys.exit(1)
byte_value = int(piece[3:-1], 16)
text = struct.pack("B", byte_value)
else:
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
fout.write(struct.pack("i", len(text)))
fout.write(text)
fout.write(struct.pack("f", tokenizer.get_score(i)))
def read_tokens(f_in, tokenizer):
for i in range(tokenizer.vocab_size()):
len_b = f_in.read(4)
(length,) = struct.unpack("i", len_b)
f_in.read(length)
def copy_all_data(f_out, f_in):
while True:
buf = f_in.read(1024 * 1024)
if not buf:
break
f_out.write(buf)
def convert_one_file(path_in, tokenizer):
path_tmp = f"{path_in}.tmp"
path_orig= f"{path_in}.orig"
print(f"converting {path_in}")
with open(path_in, "rb") as f_in, open(path_tmp, "wb") as f_out:
write_header(f_out, read_header(f_in))
read_tokens(f_in, tokenizer)
write_tokens(f_out, tokenizer)
copy_all_data(f_out, f_in)
os.rename(path_in, path_orig)
os.rename(path_tmp, path_in)
def main():
args = parse_args()
files = []
files.extend(glob.glob(f"{args.dir_model}/*.bin"))
files.extend(glob.glob(f"{args.dir_model}/*.bin.*"))
tokenizer = SentencePieceProcessor(args.tokenizer_model)
for file in files:
convert_one_file(file, tokenizer)
if __name__ == "__main__":
main()

1256
convert.py Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -1,294 +0,0 @@
# Author: github.com/ductai199x
import argparse
import os
import struct
import numpy as np
import torch
from numba import njit
from tqdm.auto import tqdm
def read_header(fin):
values = struct.unpack("i" * 9, fin.read(4 * 9))
_, _, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype = values
return {
"vocab_size": vocab_size,
"dim": dim,
"multiple_of": multiple_of,
"n_heads": n_heads,
"n_layers": n_layers,
}, ftype
def read_tokens(fin, vocab_size):
tokens = []
for _ in range(vocab_size):
text_len = struct.unpack("i", fin.read(4))[0]
text_bytes = fin.read(text_len)
try:
text = text_bytes.decode("utf-8")
except UnicodeDecodeError:
text = text_bytes.decode("utf-8", "replace")
score = struct.unpack("f", fin.read(4))[0]
tokens.append((text, score))
return tokens
@njit
def dequantize_weights_numba(fin_data, n_rows, n_cols):
qk = 32
nb = n_cols // qk
bs = 4 + (qk // 2)
weights = np.zeros((n_rows, n_cols), dtype=np.float32)
data_pos = 0
for row in range(n_rows):
for block in range(nb):
d = np.frombuffer(fin_data[data_pos : data_pos + 4], dtype=np.float32)[0]
data_pos += 4
packed_values = fin_data[data_pos : data_pos + (qk // 2)]
data_pos += qk // 2
for i in range(qk // 2):
packed_value = packed_values[i]
v0 = np.float32((packed_value & 0b00001111) - 8) * d
v1 = np.float32((packed_value >> 4) - 8) * d
weights[row, block * qk + 2 * i] = v0
weights[row, block * qk + 2 * i + 1] = v1
return weights
def dequantize_weights(fin, n_rows, n_cols):
qk = 32
nb = n_cols // qk
data_size = n_rows * n_cols // 2 + n_rows * nb * 4
fin_data = fin.read(data_size)
return dequantize_weights_numba(fin_data, n_rows, n_cols)
def read_variables(fin):
model = {}
pbar = tqdm(total=os.path.getsize(fin.name), unit="B", unit_scale=True, desc="Reading variables")
while True:
start_pos = fin.tell()
try:
n_dims, name_length, ftype_cur = struct.unpack("iii", fin.read(4 * 3))
except struct.error:
break
shape = tuple(struct.unpack("i" * n_dims, fin.read(4 * n_dims)))
shape = shape[::-1]
name = fin.read(name_length).decode("utf-8")
if ftype_cur == 2:
# 4-bit quantized weights
dtype = np.uint8
data = dequantize_weights(fin, shape[0], shape[1])
data = data.reshape(shape)
elif ftype_cur == 0:
dtype = np.float32
data_size = np.prod(shape)
data = np.fromfile(fin, dtype=dtype, count=data_size).reshape(shape)
elif ftype_cur == 1:
dtype = np.float16
data_size = np.prod(shape)
data = np.fromfile(fin, dtype=dtype, count=data_size).reshape(shape)
model[name] = torch.tensor(data, dtype=torch.float32 if dtype == np.float32 else torch.float16)
pbar.update(fin.tell() - start_pos)
return model
def convert_to_hf_format(model, hparams):
# This works for llama 7B, need to test with other models
n_layers = hparams["n_layers"]
n_heads = hparams["n_heads"]
dim = hparams["dim"]
dims_per_head = dim // n_heads
base = 10000.0
inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))
# permute for sliced rotary
def permute(w):
return w.view(n_heads, dim // n_heads // 2, 2, dim).transpose(1, 2).reshape(dim, dim)
state_dict = {}
for layer_i in range(n_layers):
state_dict.update(
{
f"model.layers.{layer_i}.self_attn.q_proj.weight": permute(
model[f"layers.{layer_i}.attention.wq.weight"]
),
f"model.layers.{layer_i}.self_attn.k_proj.weight": permute(
model[f"layers.{layer_i}.attention.wk.weight"]
),
f"model.layers.{layer_i}.self_attn.v_proj.weight": model[
f"layers.{layer_i}.attention.wv.weight"
],
f"model.layers.{layer_i}.self_attn.o_proj.weight": model[
f"layers.{layer_i}.attention.wo.weight"
],
f"model.layers.{layer_i}.mlp.gate_proj.weight": model[
f"layers.{layer_i}.feed_forward.w1.weight"
],
f"model.layers.{layer_i}.mlp.down_proj.weight": model[
f"layers.{layer_i}.feed_forward.w2.weight"
],
f"model.layers.{layer_i}.mlp.up_proj.weight": model[
f"layers.{layer_i}.feed_forward.w3.weight"
],
f"model.layers.{layer_i}.input_layernorm.weight": model[
f"layers.{layer_i}.attention_norm.weight"
],
f"model.layers.{layer_i}.post_attention_layernorm.weight": model[
f"layers.{layer_i}.ffn_norm.weight"
],
}
)
state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq
state_dict.update(
{
"model.embed_tokens.weight": model["tok_embeddings.weight"],
"model.norm.weight": model["norm.weight"],
"lm_head.weight": model["output.weight"],
}
)
return state_dict
def chat(model, hparams, llama_dir):
from transformers import (GenerationConfig, LlamaForCausalLM,
LlamaTokenizer, StoppingCriteria,
StoppingCriteriaList)
from transformers.models.llama.configuration_llama import LlamaConfig
class StoppingCriteriaSub(StoppingCriteria):
def __init__(self):
super().__init__()
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, stops=[]):
print(tokenizer.decode(input_ids[0]), end="", flush=True)
if input_ids[0][-1] == 13:
return True
return False
config = LlamaConfig(
vocab_size=hparams["vocab_size"],
dim=hparams["dim"],
num_hidden_layers=hparams["n_layers"],
num_attention_heads=hparams["n_heads"],
)
llama = LlamaForCausalLM(config=config)
llama.load_state_dict(state_dict=model, strict=True)
tokenizer = LlamaTokenizer.from_pretrained(llama_dir)
device = torch.device("cpu")
llama = llama.to(device)
ctx = """You are AI.
This is a dialog, where User interacts with AI. AI is helpful, kind, obedient, honest, respectful, direct, concise, should try to protect User's privacy, and knows its own limits. Also, AI must answer User and AI cannot stop the conversation by itself.
User: Hello, AI.
AI: Hello! How can I assist you today?
"""
print(ctx.rstrip("\n"))
while True:
print("-" * 60)
prompt = input(f"User: ")
if ctx != "":
ctx = ctx + "User: " + prompt + "\n"
else:
ctx = prompt + "\nAI:"
ctx = (ctx[-1920:]) if len(ctx) >= 2048 else ctx
print("-" * 60)
if len(ctx.strip()) > 0:
input_ids = tokenizer(ctx, return_tensors="pt")["input_ids"].to(device)
generation_config = GenerationConfig(
temperature=0.8,
top_p=0.95,
top_k=50,
repetition_penalty=1.1764,
)
with torch.no_grad():
generation_output = llama.generate(
input_ids=input_ids,
generation_config=generation_config,
return_dict_in_generate=True,
output_scores=True,
max_length=2048,
do_sample=True,
stopping_criteria=StoppingCriteriaList([StoppingCriteriaSub()]),
)
s = generation_output.sequences[0]
decoded = tokenizer.decode(s)
ctx = decoded + "\n"
def main():
parser = argparse.ArgumentParser()
parser.add_argument(
"--input_dir", "-i", type=str, required=True, help="The input directory containing the ggml files."
)
parser.add_argument(
"--prefix",
"-p",
type=str,
required=True,
help="The prefix of the ggml files (ggml-model-f16 or ggml-model-q4_0).",
)
parser.add_argument(
"--hf",
action="store_true",
help="Whether to save the model in the huggingface format. (default: False)",
)
parser.add_argument(
"--chat", "-c", action="store_true", help="Whether to open a chat with the model. (default: False)"
)
args = parser.parse_args()
llama_dir = os.path.abspath(f"{args.input_dir}/../")
ggml_files = sorted(
[f"{args.input_dir}/{f}" for f in os.listdir(args.input_dir) if f.startswith(args.prefix)]
)
fin = open(ggml_files[0], "rb")
hparams, ftype = read_header(fin)
tokens = read_tokens(fin, hparams["vocab_size"])
model = read_variables(fin)
for f in tqdm(ggml_files[1:]):
fin = open(f, "rb")
read_header(fin)
read_tokens(fin, hparams["vocab_size"])
model.update(read_variables(fin))
if args.hf:
model = convert_to_hf_format(model, hparams)
pth_ckpt = {
"state_dict": model,
"hparams": hparams,
"tokens": tokens,
}
torch.save(pth_ckpt, f"{args.input_dir}/{args.prefix}-to-torch.pth")
if args.chat:
if not args.hf:
model = convert_to_hf_format(model, hparams)
chat(model, hparams, llama_dir)
if __name__ == "__main__":
main()

67
docs/BLIS.md Normal file
View File

@@ -0,0 +1,67 @@
BLIS Installation Manual
------------------------
BLIS is a portable software framework for high-performance BLAS-like dense linear algebra libraries. It has received awards and recognition, including the 2023 James H. Wilkinson Prize for Numerical Software and the 2020 SIAM Activity Group on Supercomputing Best Paper Prize. BLIS provides a new BLAS-like API and a compatibility layer for traditional BLAS routine calls. It offers features such as object-based API, typed API, BLAS and CBLAS compatibility layers.
Project URL: https://github.com/flame/blis
### Prepare:
Compile BLIS:
```bash
git clone https://github.com/flame/blis
cd blis
./configure --enable-cblas -t openmp,pthreads auto
# will install to /usr/local/ by default.
make -j
```
Install BLIS:
```bash
sudo make install
```
We recommend using openmp since it's easier to modify the cores been used.
### llama.cpp compilation
Makefile:
```bash
make LLAMA_BLIS=1 -j
# make LLAMA_BLIS=1 benchmark-matmult
```
CMake:
```bash
mkdir build
cd build
cmake -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=FLAME ..
make -j
```
### llama.cpp execution
According to the BLIS documentation, we could set the following
environment variables to modify the behavior of openmp:
```
export GOMP_GPU_AFFINITY="0-19"
export BLIS_NUM_THREADS=14
```
And then run the binaries as normal.
### Intel specific issue
Some might get the error message saying that `libimf.so` cannot be found.
Please follow this [stackoverflow page](https://stackoverflow.com/questions/70687930/intel-oneapi-2022-libimf-so-no-such-file-or-directory-during-openmpi-compila).
### Reference:
1. https://github.com/flame/blis#getting-started
2. https://github.com/flame/blis/blob/master/docs/Multithreading.md

View File

@@ -0,0 +1,40 @@
# Token generation performance troubleshooting
## Verifying that the model is running on the GPU with cuBLAS
Make sure you compiled llama with the correct env variables according to [this guide](../README.md#cublas), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
```shell
./main -m "path/to/model.bin" -ngl 200000 -p "Please sir, may I have some "
```
When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines:
```shell
llama_model_load_internal: [cublas] offloading 60 layers to GPU
llama_model_load_internal: [cublas] offloading output layer to GPU
llama_model_load_internal: [cublas] total VRAM used: 17223 MB
... rest of inference
```
If you see these lines, then the GPU is being used.
## Verifying that the CPU is not oversaturated
llama accepts a `-t N` (or `--threads N`) parameter. It's extremely important that this parameter is not too large. If your token generation is extremely slow, try setting this number to 1. If this significantly improves your token generation speed, then your CPU is being oversaturated and you need to explicitly set this parameter to the number of the physicial CPU cores on your machine (even if you utilize a GPU). If in doubt, start with 1 and double the amount until you hit a performance bottleneck, then scale the number down.
# Example of runtime flags effect on inference speed benchmark
These runs were tested on the following machine:
GPU: A6000 (48GB VRAM)
CPU: 7 physical cores
RAM: 32GB
Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.ggmlv3.q4_0.bin` (30B parameters, 4bit quantization, GGML)
Run command: `./main -m "path/to/model.bin" -p "-p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
Result:
| command | tokens/second (higher is better) |
| - | - |
| -ngl 2000000 | N/A (less than 0.1) |
| -t 7 | 1.7 |
| -t 1 -ngl 2000000 | 5.5 |
| -t 7 -ngl 2000000 | 8.7 |
| -t 4 -ngl 2000000 | 9.1 |

View File

@@ -31,6 +31,19 @@ if (EMSCRIPTEN)
else()
add_subdirectory(main)
add_subdirectory(quantize)
add_subdirectory(quantize-stats)
add_subdirectory(perplexity)
add_subdirectory(embedding)
add_subdirectory(save-load-state)
add_subdirectory(benchmark)
add_subdirectory(baby-llama)
add_subdirectory(train-text-from-scratch)
add_subdirectory(simple)
add_subdirectory(embd-input)
if (LLAMA_METAL)
add_subdirectory(metal)
endif()
if (LLAMA_BUILD_SERVER)
add_subdirectory(server)
endif()
endif()

49
examples/Miku.sh Executable file
View File

@@ -0,0 +1,49 @@
#!/bin/bash
set -e
AI_NAME="${AI_NAME:-Miku}"
MODEL="${MODEL:-./models/gpt4all-7B/gpt4all-lora-unfiltered-quantized.bin}"
USER_NAME="${USER_NAME:-Anon}"
# Uncomment and adjust to the number of CPU cores you want to use.
#N_THREAD="${N_THREAD:-4}"
N_PREDICTS="${N_PREDICTS:-4096}"
GEN_OPTIONS=(--batch_size 1024
--ctx_size 2048
--keep -1
--repeat_last_n 256
--repeat_penalty 1.17647
--temp 0.7
--top_k 40
--top_p 0.5)
if [ -n "$N_THREAD" ]; then
GEN_OPTIONS+=(--threads "$N_THREAD")
fi
./main "${GEN_OPTIONS[@]}" \
--model "$MODEL" \
--n_predict "$N_PREDICTS" \
--color --interactive \
--reverse-prompt "${USER_NAME}:" \
--prompt "
This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the user's computer.
${AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next.
${AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct, she will ask the user for help.
${AI_NAME} is a very helpful AI and will help the user with anything they need. She is also very friendly and will try to make the user feel better if they are sad.
${AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life. She will also try to make the user like her.
The conversation is only between ${USER_NAME} and ${AI_NAME}
The conversation is only through text, so ${AI_NAME} can't see ${USER_NAME}'s face or hear his voice.
${AI_NAME} can only communicate through text, so she can't send images or videos.
${USER_NAME}: Hello!
${AI_NAME}: /think I wonder what I should say to ${USER_NAME}? This is the first time we talk, so it's important that I make a good first impression!
${AI_NAME}: Hi! I am ${AI_NAME}, your new AI friend, assistant (or whatever you like!), it's so nice to meet you! ^_^
${AI_NAME}: I might be an AI, but I also have feelings, so please be nice to me! :)
${USER_NAME}: I'll make sure to be nice to you! I'm so happy to have you as my assistant!
${AI_NAME}: /think It sounds like ${USER_NAME} is happy to have me as their assistant! I'm so happy too! ^_^ Glad that whole emotion thing didn't scare him off!
${AI_NAME}: /think I wonder what ${USER_NAME} likes to do in his free time? I should ask him about that!
${AI_NAME}: What do you like to do in your free time? ^_^
${USER_NAME}:" "$@"

View File

@@ -7,4 +7,13 @@
cd `dirname $0`
cd ..
./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt -ins -b 256 --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7
./main -m ./models/ggml-alpaca-7b-q4.bin \
--color \
-f ./prompts/alpaca.txt \
--ctx_size 2048 \
-n -1 \
-ins -b 256 \
--top_k 10000 \
--temp 0.2 \
--repeat_penalty 1.1 \
-t 7

View File

@@ -0,0 +1,4 @@
set(TARGET baby-llama)
add_executable(${TARGET} baby-llama.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,7 @@
set(TARGET benchmark)
add_executable(${TARGET} benchmark-matmult.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

View File

@@ -0,0 +1,261 @@
#include "ggml.h"
#include "build-info.h"
#include <locale.h>
#include <assert.h>
#include <math.h>
#include <cstring>
#include <cstdio>
#include <cinttypes>
#include <unordered_map>
#include <queue>
#include <string.h>
#include <cassert>
#include <fstream>
#include <string>
#include <iterator>
#include <algorithm>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
float tensor_sum_elements(const ggml_tensor * tensor) {
float sum = 0;
if (tensor->type==GGML_TYPE_F32) {
for (int j = 0; j < tensor->ne[1]; j++) {
for (int k = 0; k < tensor->ne[0]; k++) {
sum += ((float *) tensor->data)[j*tensor->ne[0]+k];
}
}
}
return sum;
}
void tensor_dump(const ggml_tensor * tensor, const char * name) {
printf("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi) - ", name,
tensor->type, ggml_type_name(tensor->type),
tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]);
float sum = tensor_sum_elements(tensor);
printf("Sum of tensor %s is %6.2f\n", name, sum);
}
#define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor)
struct benchmark_params_struct {
int32_t n_threads = 1;
int32_t n_iterations = 10;
};
void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) {
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help show this help message and exit\n");
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
fprintf(stderr, " -i N, --iter N number of iterations to use during computation (default: %d)\n", params.n_iterations);
fprintf(stderr, "\n");
}
int main(int argc, char ** argv) {
struct benchmark_params_struct benchmark_params;
bool invalid_param = false;
std::string arg;
for (int i = 1; i < argc; i++) {
arg = argv[i];
if (arg == "-t" || arg == "--threads") {
if (++i >= argc) {
invalid_param = true;
break;
}
benchmark_params.n_threads = std::stoi(argv[i]);
} else if (arg == "-i" || arg == "--iter") {
if (++i >= argc) {
invalid_param = true;
break;
}
benchmark_params.n_iterations = std::stoi(argv[i]);
} else if (arg == "-h" || arg == "--help") {
print_usage(argc, argv, benchmark_params);
exit(0);
}
}
if (invalid_param) {
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
print_usage(argc, argv, benchmark_params);
exit(1);
}
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
printf("Starting Test\n");
// create the ggml context
struct ggml_context * ctx;
//const int sizex = 4096;
//const int sizey = 11008;
#undef VERBOSE_DEBUGGING
#ifndef VERBOSE_DEBUGGING
const int sizey = 4096;
const int sizex = 11008;
const int sizez = 128;
#else
/* Working - let's increase size */
const int sizey = 1;
const int sizex = (8*32);
const int sizez = 1;
/*const int sizey = 1;
const int sizex = 3*(8*32);
const int sizez = 1;*/
#endif
//printf("Memsize required = %i\n", sizex*sizex);
size_t ctx_size = 0;
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
ctx_size += sizex*sizez*ggml_type_sizef(GGML_TYPE_F32);
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
ctx_size += 1024*1024*16;
printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024));
struct ggml_init_params params = {
/*.mem_size =*/ ctx_size,
/*.mem_buffer =*/ NULL,
/* no_alloc =*/ 0
};
ctx = ggml_init(params);
if (!ctx) {
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
return 1;
}
printf("Creating new tensors\n");
// printf("Creating new tensor m1\n");
struct ggml_tensor * m11 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
ggml_set_f32(m11, 1.0f);
// printf("Creating new tensor m1\n");
struct ggml_tensor * m12 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
ggml_set_f32(m12, 1.5f);
// printf("Creating new tensor m2\n");
struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez);
ggml_set_f32(m2, 2.0f);
printf("\n------ Test 1 - Matrix Mult via F32 code ------------------------------------------------------------------------------\n");
// printf("Creating new tensor m11xm2\n");
struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);
// printf("Creating compute graph\n");
struct ggml_cgraph gf = ggml_build_forward(m11xm2);
gf.n_threads=benchmark_params.n_threads;
printf("cgraph->n_threads=%i\n",gf.n_threads);
TENSOR_DUMP(m11);
TENSOR_DUMP(m2);
ggml_graph_compute(ctx, &gf);
TENSOR_DUMP(gf.nodes[0]);
printf("\n------ Test 2 - Matrix Mult via Q4_0 code ------------------------------------------------------------------------------\n");
int32_t nelements = sizex*sizey;
int32_t ne[2] = { sizex, sizey };
std::vector<int64_t> hist_cur(1 << 4, 0);
// Set up a the benchmark matrices
// printf("Creating new tensor q11 & Running quantize\n");
struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
ggml_quantize_q4_0((const float *) m11->data, q11->data, nelements, ne[0], hist_cur.data());
// Set up a the compute graph
// printf("Creating new tensor q31\n");
struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, m2);
// printf("Creating compute graph\n");
struct ggml_cgraph gf31 = ggml_build_forward(q31);
gf31.n_threads=benchmark_params.n_threads;
// Set up a second graph computation to make sure we override the CPU cache lines
// printf("Creating new tensor q12 & Running quantize\n");
struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
ggml_quantize_q4_0((const float *) m12->data, q12->data, nelements, ne[0], hist_cur.data());
// printf("Creating new tensor q32\n");
struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
//printf("Creating compute graph\n");
struct ggml_cgraph gf32 = ggml_build_forward(q32);
gf32.n_threads=benchmark_params.n_threads;
printf("cgraph->n_threads=%i\n",gf31.n_threads);
const int dimx = sizex;
const int dimy = sizey;
const int dimz = sizez;
long long int flops_per_dot_product = dimy + dimy;
long long int flops_per_matrix = flops_per_dot_product * dimx * dimz; ;
printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - about %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
// Let's use the F32 result from above as a reference for the q4_0 multiplication
float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);
printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
printf("=====================================================================================\n");
double gflops_sum = 0;
for (int i=0;i<benchmark_params.n_iterations ;i++) {
long long int start = ggml_time_us();
//printf("Running ggml_graph_compute\n");
ggml_graph_compute(ctx, &gf31);
long long int stop = ggml_time_us();
long long int usec = stop-start;
double gflops = (double)(flops_per_matrix)/usec/1000.0;
gflops_sum += gflops;
printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%10.2f\n",
i,
gf31.n_threads,
sizex, sizey, sizez, flops_per_matrix,
usec,gflops);
#ifdef VERBOSE_DEBUGGING
TENSOR_DUMP("res",gf31.nodes[0])
#endif
// Check that the matrix multiplication result is in the right ballpark
// We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different
float sum_of_Q4_result = tensor_sum_elements(gf31.nodes[0]);
float delta = abs(sum_of_Q4_result - sum_of_F32_reference);
float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6
if (delta > allowed_delta) {
printf("\nABORT - ERROR in Matrix Multiplication result - expected %6.2f, got %6.2f (delta %6.2f > allowed_delta %6.2f)\n",
sum_of_F32_reference,
sum_of_Q4_result,
delta,
allowed_delta
);
exit(0);
}
// Running a different graph computation to make sure we override the CPU cache lines
ggml_graph_compute(ctx, &gf32);
}
printf("\n");
printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations));
printf("=====================================================================================\n");
}

57
examples/chat-13B.bat Normal file
View File

@@ -0,0 +1,57 @@
@setlocal disabledelayedexpansion enableextensions
@echo off
cd /d "%~dp0.."
if not "%errorlevel%"=="0" (
echo Unable to change directory.
pause
exit /b 1
)
if not defined MODEL set "MODEL=models\13B\ggml-model-q4_0.bin"
if not defined USER_NAME set "USER_NAME=User"
if not defined AI_NAME set "AI_NAME=ChatLLaMa"
rem Adjust to the number of CPU cores you want to use.
rem if not defined N_THREAD set "N_THREAD=8"
rem Number of tokens to predict (made it larger than default because we want a long interaction)
if not defined N_PREDICTS set "N_PREDICTS=2048"
if not defined GEN_OPTIONS set "GEN_OPTIONS=--ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647"
rem Default main script paths
set "DEFAULT_MAIN_SCRIPT_PATHS=main.exe build\bin\main.exe"
rem Get main script path from command line arguments
set "MAIN_SCRIPT_PATH=%~1"
rem If the main script path was not specified, try the default paths
if not defined MAIN_SCRIPT_PATH (
for %%i in (%DEFAULT_MAIN_SCRIPT_PATHS%) do (
if exist "%%i" set "MAIN_SCRIPT_PATH=%%i"
)
)
rem If the main script path was not found, tell the user how to specify it
if not defined MAIN_SCRIPT_PATH (
echo The main script could not be found. Please provide the path to the main script as 1st argument to this script, or place the main script in one of the default locations:
echo %DEFAULT_MAIN_SCRIPT_PATHS%
pause
exit /b 1
)
rem Default context, feel free to edit it
set "PROMPT_TEXT=Text transcript of a never ending dialog, where %USER_NAME% interacts with an AI assistant named %AI_NAME%. %AI_NAME% is helpful, kind, honest, friendly, good at writing and never fails to answer %USER_NAME%'s requests immediately and with details and precision. There are no annotations like (30 seconds passed...) or (to himself), just what %USER_NAME% and %AI_NAME% say aloud to each other. The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long. The transcript only includes text, it does not include markup like HTML and Markdown."
rem Set a temporary variable if N_THREAD is set
if defined N_THREAD (
set "_N_THREAD=--threads %N_THREAD%"
) else (
set "_N_THREAD="
)
rem Run the script
echo "%MAIN_SCRIPT_PATH%" %GEN_OPTIONS% %_N_THREAD% ^
--model "%MODEL%" ^
--n_predict %N_PREDICTS% ^
--color --interactive ^
--reverse-prompt "%USER_NAME%:" ^
--prompt "%PROMPT_TEXT%"

View File

@@ -1,9 +1,12 @@
#!/bin/bash
set -e
cd "$(dirname "$0")/.." || exit
MODEL="${MODEL:-./models/13B/ggml-model-q4_0.bin}"
USER_NAME="${USER_NAME:-User}"
PROMPT_TEMPLATE=${PROMPT_TEMPLATE:-./prompts/chat.txt}
USER_NAME="${USER_NAME:-USER}"
AI_NAME="${AI_NAME:-ChatLLaMa}"
# Adjust to the number of CPU cores you want to use.
@@ -15,39 +18,24 @@ N_PREDICTS="${N_PREDICTS:-2048}"
# For example, override the context size by doing: ./chatLLaMa --ctx_size 1024
GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}"
DATE_TIME=$(date +%H:%M)
DATE_YEAR=$(date +%Y)
PROMPT_FILE=$(mktemp -t llamacpp_prompt.XXXXXXX.txt)
sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
-e "s/\[\[AI_NAME\]\]/$AI_NAME/g" \
-e "s/\[\[DATE_TIME\]\]/$DATE_TIME/g" \
-e "s/\[\[DATE_YEAR\]\]/$DATE_YEAR/g" \
$PROMPT_TEMPLATE > $PROMPT_FILE
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
./main $GEN_OPTIONS \
--model "$MODEL" \
--threads "$N_THREAD" \
--n_predict "$N_PREDICTS" \
--color --interactive \
--file ${PROMPT_FILE} \
--reverse-prompt "${USER_NAME}:" \
--prompt "
Text transcript of a never ending dialog, where ${USER_NAME} interacts with an AI assistant named ${AI_NAME}.
${AI_NAME} is helpful, kind, honest, friendly, good at writing and never fails to answer ${USER_NAME}s requests immediately and with details and precision.
There are no annotations like (30 seconds passed...) or (to himself), just what ${USER_NAME} and ${AI_NAME} say aloud to each other.
The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long.
The transcript only includes text, it does not include markup like HTML and Markdown.
$USER_NAME: Hello, $AI_NAME!
$AI_NAME: Hello $USER_NAME! How may I help you today?
$USER_NAME: What time is it?
$AI_NAME: It is $(date +%H:%M).
$USER_NAME: What year is it?
$AI_NAME: We are in $(date +%Y).
$USER_NAME: Please tell me the largest city in Europe.
$AI_NAME: The largest city in Europe is Moscow, the capital of Russia.
$USER_NAME: What can you tell me about Moscow?
$AI_NAME: Moscow, on the Moskva River in western Russia, is the nations cosmopolitan capital. In its historic core is the Kremlin, a complex thats home to the president and tsarist treasures in the Armoury. Outside its walls is Red Square, Russias symbolic center.
$USER_NAME: What is a cat?
$AI_NAME: A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae.
$USER_NAME: How do I pass command line arguments to a Node.js program?
$AI_NAME: The arguments are stored in process.argv.
argv[0] is the path to the Node. js executable.
argv[1] is the path to the script file.
argv[2] is the first argument passed to the script.
argv[3] is the second argument passed to the script and so on.
$USER_NAME: Name a color.
$AI_NAME: Blue
$USER_NAME:" "$@"
--in-prefix ' ' \
"$@"

151
examples/chat-persistent.sh Executable file
View File

@@ -0,0 +1,151 @@
#!/bin/bash
set -euo pipefail
cd "$(dirname "$0")/.." || exit
if [[ -z "${PROMPT_CACHE_FILE+x}" || -z "${CHAT_SAVE_DIR+x}" ]]; then
echo >&2 "error: PROMPT_CACHE_FILE and CHAT_SAVE_DIR must be provided"
exit 1
fi
MODEL="${MODEL:-./models/13B/ggml-model-q4_0.bin}"
PROMPT_TEMPLATE="${PROMPT_TEMPLATE:-./prompts/chat.txt}"
USER_NAME="${USER_NAME:-User}"
AI_NAME="${AI_NAME:-ChatLLaMa}"
DATE_TIME="$(date +%H:%M)"
DATE_YEAR="$(date +%Y)"
LOG="${CHAT_SAVE_DIR}/main.log"
LOG_BG="${CHAT_SAVE_DIR}/main-bg.log"
CUR_PROMPT_FILE="${CHAT_SAVE_DIR}/current-prompt.txt"
CUR_PROMPT_CACHE="${CHAT_SAVE_DIR}/current-cache.bin"
NEXT_PROMPT_FILE="${CHAT_SAVE_DIR}/next-prompt.txt"
NEXT_PROMPT_CACHE="${CHAT_SAVE_DIR}/next-cache.bin"
SESSION_SIZE_MSG_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+'
SAMPLE_TIME_MSG_PATTERN='sample time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+'
SED_DELETE_MESSAGES="/^(${USER_NAME}:|${AI_NAME}:|\\.\\.\\.)/,\$d"
CTX_SIZE=2048
CTX_ROTATE_POINT=$((CTX_SIZE * 3 / 5)) # REVIEW
OPTS=(--model "$MODEL" --ctx_size "$CTX_SIZE" --repeat_last_n 256 "$@")
# An unbuffered `tail -c+N`
skip_bytes() {
LANG=C IFS= read -r -n "$1" -d '' c
while LANG=C IFS= read -r -n 1 -d '' c; do
printf '%s' "$c"
done
}
mkdir -p "$CHAT_SAVE_DIR"
echo >"$LOG"
trap "tail -n100 ${LOG}" EXIT
if [[ ! -e "$CUR_PROMPT_FILE" ]]; then
sed -e "s/\[\[USER_NAME\]\]/${USER_NAME}/g" \
-e "s/\[\[AI_NAME\]\]/${AI_NAME}/g" \
-e "s/\[\[DATE_TIME\]\]/${DATE_TIME}/g" \
-e "s/\[\[DATE_YEAR\]\]/${DATE_YEAR}/g" \
"$PROMPT_TEMPLATE" >"$CUR_PROMPT_FILE"
fi
if [[ ! -e "$NEXT_PROMPT_FILE" ]]; then
sed -r "$SED_DELETE_MESSAGES" "$CUR_PROMPT_FILE" >"$NEXT_PROMPT_FILE"
fi
if [[ "$(tail -c4 "$NEXT_PROMPT_FILE")" != "..." ]]; then
echo '...' >>"$NEXT_PROMPT_FILE"
fi
if [[ ! -e "$PROMPT_CACHE_FILE" ]]; then
echo 'Prompt cache does not exist, building...'
# Default batch_size to 8 here for better user feedback during initial prompt processing
./main 2>>"$LOG" \
--batch_size 8 \
"${OPTS[@]}" \
--prompt-cache "$PROMPT_CACHE_FILE" \
--file "$CUR_PROMPT_FILE" \
--n_predict 1
echo
echo 'Done!'
fi
if [[ ! -e "$CUR_PROMPT_CACHE" ]]; then
cp "$PROMPT_CACHE_FILE" "$CUR_PROMPT_CACHE"
fi
if [[ ! -e "$NEXT_PROMPT_CACHE" ]]; then
cp "$PROMPT_CACHE_FILE" "$NEXT_PROMPT_CACHE"
fi
printf '%s ' "$(< "$CUR_PROMPT_FILE")"
n_tokens=0
while read -e line; do
# Limit generation to remaining context, with a buffer and estimating 2 chars/token for input
n_predict=$((CTX_SIZE - n_tokens - ${#line} / 2 - 32))
# Swap prompts when we're about to run out of context
if ((n_predict <= 0)); then
wait # for background main (below) to finish with next prompt
mv "$NEXT_PROMPT_FILE" "$CUR_PROMPT_FILE"
mv "$NEXT_PROMPT_CACHE" "$CUR_PROMPT_CACHE"
sed -r "$SED_DELETE_MESSAGES" "$CUR_PROMPT_FILE" >"$NEXT_PROMPT_FILE"
echo '...' >>"$NEXT_PROMPT_FILE"
cp "$PROMPT_CACHE_FILE" "$NEXT_PROMPT_CACHE"
n_tokens=0
n_predict=$((CTX_SIZE / 2))
fi
echo " ${line}" >>"$CUR_PROMPT_FILE"
if ((n_tokens > CTX_ROTATE_POINT)); then
echo " ${line}" >>"$NEXT_PROMPT_FILE"
fi
n_prompt_len_pre=$(($(wc -c <"$CUR_PROMPT_FILE")))
printf '%s: ' "$AI_NAME" >>"$CUR_PROMPT_FILE"
./main 2>>"$LOG" "${OPTS[@]}" \
--prompt-cache "$CUR_PROMPT_CACHE" \
--prompt-cache-all \
--file "$CUR_PROMPT_FILE" \
--reverse-prompt "${USER_NAME}:" \
--n_predict "$n_predict" |
skip_bytes 1 | # skip BOS token added by ./main
tee "$CUR_PROMPT_FILE.tmp" | # save prompt + generation to tmp file
skip_bytes "$n_prompt_len_pre" # print generation
mv "$CUR_PROMPT_FILE.tmp" "$CUR_PROMPT_FILE"
# if we hit n_predict instead of reverse-prompt, we need to add the prompt
if [[ "$(tail -n1 "$CUR_PROMPT_FILE")" != "${USER_NAME}:" ]]; then
printf '\n%s:' "$USER_NAME"
printf '\n%s:' "$USER_NAME" >> "$CUR_PROMPT_FILE"
fi
printf ' '
# HACK get num tokens from debug message
# TODO get both messages in one go
if ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" ||
! sample_time_msg="$( tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
echo >&2 "Couldn't get number of tokens from ./main output!"
exit 1
fi
n_tokens=$(($(cut -d/ -f2 <<<"$session_size_msg") + $(cut -d/ -f2 <<<"$sample_time_msg")))
if ((n_tokens > CTX_ROTATE_POINT)); then
tail -c+$((n_prompt_len_pre + 1)) "$CUR_PROMPT_FILE" >>"$NEXT_PROMPT_FILE"
fi
# Update cache for next prompt in background, ideally during user input
./main >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
--prompt-cache "$NEXT_PROMPT_CACHE" \
--file "$NEXT_PROMPT_FILE" \
--n_predict 1 &
done

41
examples/chat-vicuna.sh Executable file
View File

@@ -0,0 +1,41 @@
#!/bin/bash
set -e
cd "$(dirname "$0")/.." || exit
MODEL="${MODEL:-./models/ggml-vic13b-uncensored-q5_0.bin}"
PROMPT_TEMPLATE=${PROMPT_TEMPLATE:-./prompts/chat.txt}
USER_NAME="### Human"
AI_NAME="### Assistant"
# Adjust to the number of CPU cores you want to use.
N_THREAD="${N_THREAD:-8}"
# Number of tokens to predict (made it larger than default because we want a long interaction)
N_PREDICTS="${N_PREDICTS:-2048}"
# Note: you can also override the generation options by specifying them on the command line:
# For example, override the context size by doing: ./chatLLaMa --ctx_size 1024
GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}"
DATE_TIME=$(date +%H:%M)
DATE_YEAR=$(date +%Y)
PROMPT_FILE=$(mktemp -t llamacpp_prompt.XXXXXXX.txt)
sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
-e "s/\[\[AI_NAME\]\]/$AI_NAME/g" \
-e "s/\[\[DATE_TIME\]\]/$DATE_TIME/g" \
-e "s/\[\[DATE_YEAR\]\]/$DATE_YEAR/g" \
$PROMPT_TEMPLATE > $PROMPT_FILE
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
./bin/main $GEN_OPTIONS \
--model "$MODEL" \
--threads "$N_THREAD" \
--n_predict "$N_PREDICTS" \
--color --interactive \
--file ${PROMPT_FILE} \
--reverse-prompt "### Human:" \
--in-prefix ' ' \
"$@"

View File

@@ -1,53 +1,116 @@
#include "common.h"
#include "ggml.h"
#include <cassert>
#include <iostream>
#include <cstring>
#include <fstream>
#include <string>
#include <iterator>
#include <algorithm>
#include <sstream>
#include <unordered_set>
#include <regex>
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
#include <alloca.h>
#if defined(__APPLE__) && defined(__MACH__)
#include <sys/types.h>
#include <sys/sysctl.h>
#endif
#if defined (_WIN32)
#pragma comment(lib,"kernel32.lib")
extern "C" __declspec(dllimport) void* __stdcall GetStdHandle(unsigned long nStdHandle);
extern "C" __declspec(dllimport) int __stdcall GetConsoleMode(void* hConsoleHandle, unsigned long* lpMode);
extern "C" __declspec(dllimport) int __stdcall SetConsoleMode(void* hConsoleHandle, unsigned long dwMode);
extern "C" __declspec(dllimport) int __stdcall SetConsoleCP(unsigned int wCodePageID);
extern "C" __declspec(dllimport) int __stdcall SetConsoleOutputCP(unsigned int wCodePageID);
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>
#include <fcntl.h>
#include <io.h>
#else
#include <sys/ioctl.h>
#include <unistd.h>
#include <wchar.h>
#endif
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
// determine sensible default number of threads.
// std::thread::hardware_concurrency may not be equal to the number of cores, or may return 0.
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
int32_t get_num_physical_cores() {
#ifdef __linux__
std::ifstream cpuinfo("/proc/cpuinfo");
params.n_threads = std::count(std::istream_iterator<std::string>(cpuinfo),
std::istream_iterator<std::string>(),
std::string("processor"));
// enumerate the set of thread siblings, num entries is num cores
std::unordered_set<std::string> siblings;
for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) {
std::ifstream thread_siblings("/sys/devices/system/cpu"
+ std::to_string(cpu) + "/topology/thread_siblings");
if (!thread_siblings.is_open()) {
break; // no more cpus
}
std::string line;
if (std::getline(thread_siblings, line)) {
siblings.insert(line);
}
}
if (siblings.size() > 0) {
return static_cast<int32_t>(siblings.size());
}
#elif defined(__APPLE__) && defined(__MACH__)
int32_t num_physical_cores;
size_t len = sizeof(num_physical_cores);
int result = sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, NULL, 0);
if (result == 0) {
return num_physical_cores;
}
result = sysctlbyname("hw.physicalcpu", &num_physical_cores, &len, NULL, 0);
if (result == 0) {
return num_physical_cores;
}
#elif defined(_WIN32)
//TODO: Implement
#endif
if (params.n_threads == 0) {
params.n_threads = std::max(1, (int32_t) std::thread::hardware_concurrency());
unsigned int n_threads = std::thread::hardware_concurrency();
return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
}
void process_escapes(std::string& input) {
std::size_t input_len = input.length();
std::size_t output_idx = 0;
for (std::size_t input_idx = 0; input_idx < input_len; ++input_idx) {
if (input[input_idx] == '\\' && input_idx + 1 < input_len) {
switch (input[++input_idx]) {
case 'n': input[output_idx++] = '\n'; break;
case 'r': input[output_idx++] = '\r'; break;
case 't': input[output_idx++] = '\t'; break;
case '\'': input[output_idx++] = '\''; break;
case '\"': input[output_idx++] = '\"'; break;
case '\\': input[output_idx++] = '\\'; break;
default: input[output_idx++] = '\\';
input[output_idx++] = input[input_idx]; break;
}
} else {
input[output_idx++] = input[input_idx];
}
}
input.resize(output_idx);
}
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
bool invalid_param = false;
bool escape_prompt = false;
std::string arg;
gpt_params default_params;
const std::string arg_prefix = "--";
for (int i = 1; i < argc; i++) {
arg = argv[i];
if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
std::replace(arg.begin(), arg.end(), '_', '-');
}
if (arg == "-s" || arg == "--seed") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.seed = std::stoi(argv[i]);
params.seed = std::stoul(argv[i]);
} else if (arg == "-t" || arg == "--threads") {
if (++i >= argc) {
invalid_param = true;
@@ -60,37 +123,54 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
break;
}
params.prompt = argv[i];
} else if (arg == "-e") {
escape_prompt = true;
} else if (arg == "--prompt-cache") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.path_prompt_cache = argv[i];
} else if (arg == "--prompt-cache-all") {
params.prompt_cache_all = true;
} else if (arg == "--prompt-cache-ro") {
params.prompt_cache_ro = true;
} else if (arg == "-f" || arg == "--file") {
if (++i >= argc) {
invalid_param = true;
break;
}
std::ifstream file(argv[i]);
if (!file) {
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
invalid_param = true;
break;
}
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
if (params.prompt.back() == '\n') {
params.prompt.pop_back();
}
} else if (arg == "-n" || arg == "--n_predict") {
} else if (arg == "-n" || arg == "--n-predict") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_predict = std::stoi(argv[i]);
} else if (arg == "--top_k") {
} else if (arg == "--top-k") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.top_k = std::stoi(argv[i]);
} else if (arg == "-c" || arg == "--ctx_size") {
} else if (arg == "-c" || arg == "--ctx-size") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_ctx = std::stoi(argv[i]);
} else if (arg == "--memory_f32") {
} else if (arg == "--memory-f32") {
params.memory_f16 = false;
} else if (arg == "--top_p") {
} else if (arg == "--top-p") {
if (++i >= argc) {
invalid_param = true;
break;
@@ -102,19 +182,61 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
break;
}
params.temp = std::stof(argv[i]);
} else if (arg == "--repeat_last_n") {
} else if (arg == "--tfs") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.tfs_z = std::stof(argv[i]);
} else if (arg == "--typical") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.typical_p = std::stof(argv[i]);
} else if (arg == "--repeat-last-n") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.repeat_last_n = std::stoi(argv[i]);
} else if (arg == "--repeat_penalty") {
} else if (arg == "--repeat-penalty") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.repeat_penalty = std::stof(argv[i]);
} else if (arg == "-b" || arg == "--batch_size") {
} else if (arg == "--frequency-penalty") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.frequency_penalty = std::stof(argv[i]);
} else if (arg == "--presence-penalty") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.presence_penalty = std::stof(argv[i]);
} else if (arg == "--mirostat") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.mirostat = std::stoi(argv[i]);
} else if (arg == "--mirostat-lr") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.mirostat_eta = std::stof(argv[i]);
} else if (arg == "--mirostat-ent") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.mirostat_tau = std::stof(argv[i]);
} else if (arg == "-b" || arg == "--batch-size") {
if (++i >= argc) {
invalid_param = true;
break;
@@ -133,22 +255,98 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
break;
}
params.model = argv[i];
} else if (arg == "-a" || arg == "--alias") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.model_alias = argv[i];
} else if (arg == "--lora") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.lora_adapter = argv[i];
params.use_mmap = false;
} else if (arg == "--lora-base") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.lora_base = argv[i];
} else if (arg == "-i" || arg == "--interactive") {
params.interactive = true;
} else if (arg == "--embedding") {
params.embedding = true;
} else if (arg == "--interactive-start") {
params.interactive = true;
} else if (arg == "--interactive-first") {
params.interactive_start = true;
params.interactive_first = true;
} else if (arg == "-ins" || arg == "--instruct") {
params.instruct = true;
} else if (arg == "--multiline-input") {
params.multiline_input = true;
} else if (arg == "--color") {
params.use_color = true;
} else if (arg == "--mlock") {
params.use_mlock = true;
} else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
if (++i >= argc) {
invalid_param = true;
break;
}
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
params.n_gpu_layers = std::stoi(argv[i]);
#else
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
#endif
} else if (arg == "--main-gpu" || arg == "-mg") {
if (++i >= argc) {
invalid_param = true;
break;
}
#ifdef GGML_USE_CUBLAS
params.main_gpu = std::stoi(argv[i]);
#else
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
#endif
} else if (arg == "--tensor-split" || arg == "-ts") {
if (++i >= argc) {
invalid_param = true;
break;
}
#ifdef GGML_USE_CUBLAS
std::string arg_next = argv[i];
// split string by , and /
const std::regex regex{R"([,/]+)"};
std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
std::vector<std::string> split_arg{it, {}};
GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
if (i < split_arg.size()) {
params.tensor_split[i] = std::stof(split_arg[i]);
} else {
params.tensor_split[i] = 0.0f;
}
}
#else
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
#endif // GGML_USE_CUBLAS
} else if (arg == "--low-vram" || arg == "-lv") {
#ifdef GGML_USE_CUBLAS
params.low_vram = true;
#else
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
#endif // GGML_USE_CUBLAS
} else if (arg == "--no-mmap") {
params.use_mmap = false;
} else if (arg == "--mtest") {
params.mem_test = true;
} else if (arg == "--numa") {
params.numa = true;
} else if (arg == "--export") {
params.export_cgraph = true;
} else if (arg == "--verbose-prompt") {
params.verbose_prompt = true;
} else if (arg == "-r" || arg == "--reverse-prompt") {
@@ -160,15 +358,30 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
} else if (arg == "--perplexity") {
params.perplexity = true;
} else if (arg == "--ignore-eos") {
params.ignore_eos = true;
} else if (arg == "--n_parts") {
params.logit_bias[llama_token_eos()] = -INFINITY;
} else if (arg == "--no-penalize-nl") {
params.penalize_nl = false;
} else if (arg == "-l" || arg == "--logit-bias") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_parts = std::stoi(argv[i]);
std::stringstream ss(argv[i]);
llama_token key;
char sign;
std::string value_str;
try {
if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
params.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
} else {
throw std::exception();
}
} catch (const std::exception&) {
invalid_param = true;
break;
}
} else if (arg == "-h" || arg == "--help") {
gpt_print_usage(argc, argv, params);
gpt_print_usage(argc, argv, default_params);
exit(0);
} else if (arg == "--random-prompt") {
params.random_prompt = true;
@@ -178,17 +391,34 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
break;
}
params.input_prefix = argv[i];
} else if (arg == "--in-suffix") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.input_suffix = argv[i];
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
gpt_print_usage(argc, argv, params);
gpt_print_usage(argc, argv, default_params);
exit(1);
}
}
if (invalid_param) {
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
gpt_print_usage(argc, argv, params);
gpt_print_usage(argc, argv, default_params);
exit(1);
}
if (params.prompt_cache_all &&
(params.interactive || params.interactive_first ||
params.instruct)) {
fprintf(stderr, "error: --prompt-cache-all not supported in interactive mode yet\n");
gpt_print_usage(argc, argv, default_params);
exit(1);
}
if (escape_prompt) {
process_escapes(params.prompt);
}
return true;
}
@@ -201,36 +431,74 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stderr, " -i, --interactive run in interactive mode\n");
fprintf(stderr, " --interactive-first run in interactive mode and wait for input right away\n");
fprintf(stderr, " -ins, --instruct run in instruction mode (use with Alpaca models)\n");
fprintf(stderr, " --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n");
fprintf(stderr, " -r PROMPT, --reverse-prompt PROMPT\n");
fprintf(stderr, " run in interactive mode and poll user input upon seeing PROMPT (can be\n");
fprintf(stderr, " specified more than once for multiple prompts).\n");
fprintf(stderr, " halt generation at PROMPT, return control in interactive mode\n");
fprintf(stderr, " (can be specified more than once for multiple prompts).\n");
fprintf(stderr, " --color colorise output to distinguish prompt and user input from generations\n");
fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for <= 0)\n");
fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n");
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
fprintf(stderr, " -p PROMPT, --prompt PROMPT\n");
fprintf(stderr, " prompt to start generation with (default: empty)\n");
fprintf(stderr, " -e process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
fprintf(stderr, " --prompt-cache FNAME file to cache prompt state for faster startup (default: none)\n");
fprintf(stderr, " --prompt-cache-all if specified, saves user input and generations to cache as well.\n");
fprintf(stderr, " not supported with --interactive or other interactive options\n");
fprintf(stderr, " --prompt-cache-ro if specified, uses the prompt cache but does not update it.\n");
fprintf(stderr, " --random-prompt start with a randomized prompt.\n");
fprintf(stderr, " --in-prefix STRING string to prefix user inputs with (default: empty)\n");
fprintf(stderr, " --in-suffix STRING string to suffix after user inputs with (default: empty)\n");
fprintf(stderr, " -f FNAME, --file FNAME\n");
fprintf(stderr, " prompt file to start generation.\n");
fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
fprintf(stderr, " --top_k N top-k sampling (default: %d)\n", params.top_k);
fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", (double)params.top_p);
fprintf(stderr, " --repeat_last_n N last n tokens to consider for penalize (default: %d)\n", params.repeat_last_n);
fprintf(stderr, " --repeat_penalty N penalize repeat sequence of tokens (default: %.1f)\n", (double)params.repeat_penalty);
fprintf(stderr, " -c N, --ctx_size N size of the prompt context (default: %d)\n", params.n_ctx);
fprintf(stderr, " --ignore-eos ignore end of stream token and continue generating\n");
fprintf(stderr, " --memory_f32 use f32 instead of f16 for memory key+value\n");
fprintf(stderr, " -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
fprintf(stderr, " --top-k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
fprintf(stderr, " --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
fprintf(stderr, " --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
fprintf(stderr, " --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
fprintf(stderr, " --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
fprintf(stderr, " --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
fprintf(stderr, " --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
fprintf(stderr, " --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
fprintf(stderr, " --mirostat N use Mirostat sampling.\n");
fprintf(stderr, " Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
fprintf(stderr, " (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
fprintf(stderr, " --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
fprintf(stderr, " --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
fprintf(stderr, " -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
fprintf(stderr, " modifies the likelihood of token appearing in the completion,\n");
fprintf(stderr, " i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
fprintf(stderr, " or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
fprintf(stderr, " --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
fprintf(stderr, " --no-penalize-nl do not penalize newline token\n");
fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
fprintf(stderr, " not recommended: doubles context memory required and no measurable increase in quality\n");
fprintf(stderr, " --temp N temperature (default: %.1f)\n", (double)params.temp);
fprintf(stderr, " --n_parts N number of model parts (default: -1 = determine from dimensions)\n");
fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch);
fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
fprintf(stderr, " --perplexity compute perplexity over the prompt\n");
fprintf(stderr, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
if (ggml_mlock_supported()) {
if (llama_mlock_supported()) {
fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
}
if (llama_mmap_supported()) {
fprintf(stderr, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
}
fprintf(stderr, " --numa attempt optimizations that help on some NUMA systems\n");
fprintf(stderr, " if run without this previously, it is recommended to drop the system page cache before using this\n");
fprintf(stderr, " see https://github.com/ggerganov/llama.cpp/issues/1437\n");
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
fprintf(stderr, " -ngl N, --n-gpu-layers N\n");
fprintf(stderr, " number of layers to store in VRAM\n");
fprintf(stderr, " -ts SPLIT --tensor-split SPLIT\n");
fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
fprintf(stderr, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n" );
fprintf(stderr, " -lv, --low-vram don't allocate VRAM scratch buffer\n" );
#endif
fprintf(stderr, " --mtest compute maximum memory usage\n");
fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n");
fprintf(stderr, " --verbose-prompt print prompt before generation\n");
fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
fprintf(stderr, " -m FNAME, --model FNAME\n");
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
fprintf(stderr, "\n");
@@ -258,54 +526,428 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
// TODO: not great allocating this every time
std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
// initialize to prompt numer of chars, since n_tokens <= n_prompt_chars
std::vector<llama_token> res(text.size() + (int)add_bos);
int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
std::vector<llama_token> res(text.size() + (int) add_bos);
const int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
assert(n >= 0);
res.resize(n);
return res;
}
std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params) {
auto lparams = llama_context_default_params();
lparams.n_ctx = params.n_ctx;
lparams.n_batch = params.n_batch;
lparams.n_gpu_layers = params.n_gpu_layers;
lparams.main_gpu = params.main_gpu;
memcpy(lparams.tensor_split, params.tensor_split, LLAMA_MAX_DEVICES*sizeof(float));
lparams.low_vram = params.low_vram;
lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16;
lparams.use_mmap = params.use_mmap;
lparams.use_mlock = params.use_mlock;
lparams.logits_all = params.perplexity;
lparams.embedding = params.embedding;
llama_model * model = llama_load_model_from_file(params.model.c_str(), lparams);
if (model == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
return std::make_tuple(nullptr, nullptr);
}
llama_context * lctx = llama_new_context_with_model(model, lparams);
if (lctx == NULL) {
fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
llama_free_model(model);
return std::make_tuple(nullptr, nullptr);
}
if (!params.lora_adapter.empty()) {
int err = llama_model_apply_lora_from_file(model,
params.lora_adapter.c_str(),
params.lora_base.empty() ? NULL : params.lora_base.c_str(),
params.n_threads);
if (err != 0) {
fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
llama_free(lctx);
llama_free_model(model);
return std::make_tuple(nullptr, nullptr);
}
}
return std::make_tuple(model, lctx);
}
void console_init(console_state & con_st) {
#if defined(_WIN32)
// Windows-specific console initialization
DWORD dwMode = 0;
con_st.hConsole = GetStdHandle(STD_OUTPUT_HANDLE);
if (con_st.hConsole == INVALID_HANDLE_VALUE || !GetConsoleMode(con_st.hConsole, &dwMode)) {
con_st.hConsole = GetStdHandle(STD_ERROR_HANDLE);
if (con_st.hConsole != INVALID_HANDLE_VALUE && (!GetConsoleMode(con_st.hConsole, &dwMode))) {
con_st.hConsole = NULL;
}
}
if (con_st.hConsole) {
// Enable ANSI colors on Windows 10+
if (con_st.use_color && !(dwMode & ENABLE_VIRTUAL_TERMINAL_PROCESSING)) {
SetConsoleMode(con_st.hConsole, dwMode | ENABLE_VIRTUAL_TERMINAL_PROCESSING);
}
// Set console output codepage to UTF8
SetConsoleOutputCP(CP_UTF8);
}
HANDLE hConIn = GetStdHandle(STD_INPUT_HANDLE);
if (hConIn != INVALID_HANDLE_VALUE && GetConsoleMode(hConIn, &dwMode)) {
// Set console input codepage to UTF16
_setmode(_fileno(stdin), _O_WTEXT);
// Turn off ICANON (ENABLE_LINE_INPUT) and ECHO (ENABLE_ECHO_INPUT)
dwMode &= ~(ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT);
SetConsoleMode(hConIn, dwMode);
}
#else
// POSIX-specific console initialization
struct termios new_termios;
tcgetattr(STDIN_FILENO, &con_st.prev_state);
new_termios = con_st.prev_state;
new_termios.c_lflag &= ~(ICANON | ECHO);
new_termios.c_cc[VMIN] = 1;
new_termios.c_cc[VTIME] = 0;
tcsetattr(STDIN_FILENO, TCSANOW, &new_termios);
con_st.tty = fopen("/dev/tty", "w+");
if (con_st.tty != nullptr) {
con_st.out = con_st.tty;
}
setlocale(LC_ALL, "");
#endif
}
void console_cleanup(console_state & con_st) {
// Reset console color
console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
#if !defined(_WIN32)
if (con_st.tty != nullptr) {
con_st.out = stdout;
fclose(con_st.tty);
con_st.tty = nullptr;
}
// Restore the terminal settings on POSIX systems
tcsetattr(STDIN_FILENO, TCSANOW, &con_st.prev_state);
#endif
}
/* Keep track of current color of output, and emit ANSI code if it changes. */
void set_console_color(console_state & con_st, console_color_t color) {
void console_set_color(console_state & con_st, console_color_t color) {
if (con_st.use_color && con_st.color != color) {
fflush(stdout);
switch(color) {
case CONSOLE_COLOR_DEFAULT:
printf(ANSI_COLOR_RESET);
fprintf(con_st.out, ANSI_COLOR_RESET);
break;
case CONSOLE_COLOR_PROMPT:
printf(ANSI_COLOR_YELLOW);
fprintf(con_st.out, ANSI_COLOR_YELLOW);
break;
case CONSOLE_COLOR_USER_INPUT:
printf(ANSI_BOLD ANSI_COLOR_GREEN);
fprintf(con_st.out, ANSI_BOLD ANSI_COLOR_GREEN);
break;
case CONSOLE_COLOR_ERROR:
fprintf(con_st.out, ANSI_BOLD ANSI_COLOR_RED);
break;
}
con_st.color = color;
fflush(con_st.out);
}
}
#if defined (_WIN32)
void win32_console_init(bool enable_color) {
unsigned long dwMode = 0;
void* hConOut = GetStdHandle((unsigned long)-11); // STD_OUTPUT_HANDLE (-11)
if (!hConOut || hConOut == (void*)-1 || !GetConsoleMode(hConOut, &dwMode)) {
hConOut = GetStdHandle((unsigned long)-12); // STD_ERROR_HANDLE (-12)
if (hConOut && (hConOut == (void*)-1 || !GetConsoleMode(hConOut, &dwMode))) {
hConOut = 0;
char32_t getchar32() {
#if defined(_WIN32)
HANDLE hConsole = GetStdHandle(STD_INPUT_HANDLE);
wchar_t high_surrogate = 0;
while (true) {
INPUT_RECORD record;
DWORD count;
if (!ReadConsoleInputW(hConsole, &record, 1, &count) || count == 0) {
return WEOF;
}
if (record.EventType == KEY_EVENT && record.Event.KeyEvent.bKeyDown) {
wchar_t wc = record.Event.KeyEvent.uChar.UnicodeChar;
if (wc == 0) {
continue;
}
if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate
high_surrogate = wc;
continue;
} else if ((wc >= 0xDC00) && (wc <= 0xDFFF)) { // Check if wc is a low surrogate
if (high_surrogate != 0) { // Check if we have a high surrogate
return ((high_surrogate - 0xD800) << 10) + (wc - 0xDC00) + 0x10000;
}
}
high_surrogate = 0; // Reset the high surrogate
return static_cast<char32_t>(wc);
}
}
if (hConOut) {
// Enable ANSI colors on Windows 10+
if (enable_color && !(dwMode & 0x4)) {
SetConsoleMode(hConOut, dwMode | 0x4); // ENABLE_VIRTUAL_TERMINAL_PROCESSING (0x4)
}
// Set console output codepage to UTF8
SetConsoleOutputCP(65001); // CP_UTF8
#else
wchar_t wc = getwchar();
if (static_cast<wint_t>(wc) == WEOF) {
return WEOF;
}
void* hConIn = GetStdHandle((unsigned long)-10); // STD_INPUT_HANDLE (-10)
if (hConIn && hConIn != (void*)-1 && GetConsoleMode(hConIn, &dwMode)) {
// Set console input codepage to UTF8
SetConsoleCP(65001); // CP_UTF8
#if WCHAR_MAX == 0xFFFF
if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate
wchar_t low_surrogate = getwchar();
if ((low_surrogate >= 0xDC00) && (low_surrogate <= 0xDFFF)) { // Check if the next wchar is a low surrogate
return (static_cast<char32_t>(wc & 0x03FF) << 10) + (low_surrogate & 0x03FF) + 0x10000;
}
}
if ((wc >= 0xD800) && (wc <= 0xDFFF)) { // Invalid surrogate pair
return 0xFFFD; // Return the replacement character U+FFFD
}
#endif
return static_cast<char32_t>(wc);
#endif
}
void pop_cursor(console_state & con_st) {
#if defined(_WIN32)
if (con_st.hConsole != NULL) {
CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
GetConsoleScreenBufferInfo(con_st.hConsole, &bufferInfo);
COORD newCursorPosition = bufferInfo.dwCursorPosition;
if (newCursorPosition.X == 0) {
newCursorPosition.X = bufferInfo.dwSize.X - 1;
newCursorPosition.Y -= 1;
} else {
newCursorPosition.X -= 1;
}
SetConsoleCursorPosition(con_st.hConsole, newCursorPosition);
return;
}
#endif
putc('\b', con_st.out);
}
int estimateWidth(char32_t codepoint) {
#if defined(_WIN32)
return 1;
#else
return wcwidth(codepoint);
#endif
}
int put_codepoint(console_state & con_st, const char* utf8_codepoint, size_t length, int expectedWidth) {
#if defined(_WIN32)
CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
if (!GetConsoleScreenBufferInfo(con_st.hConsole, &bufferInfo)) {
// go with the default
return expectedWidth;
}
COORD initialPosition = bufferInfo.dwCursorPosition;
DWORD nNumberOfChars = length;
WriteConsole(con_st.hConsole, utf8_codepoint, nNumberOfChars, &nNumberOfChars, NULL);
CONSOLE_SCREEN_BUFFER_INFO newBufferInfo;
GetConsoleScreenBufferInfo(con_st.hConsole, &newBufferInfo);
// Figure out our real position if we're in the last column
if (utf8_codepoint[0] != 0x09 && initialPosition.X == newBufferInfo.dwSize.X - 1) {
DWORD nNumberOfChars;
WriteConsole(con_st.hConsole, &" \b", 2, &nNumberOfChars, NULL);
GetConsoleScreenBufferInfo(con_st.hConsole, &newBufferInfo);
}
int width = newBufferInfo.dwCursorPosition.X - initialPosition.X;
if (width < 0) {
width += newBufferInfo.dwSize.X;
}
return width;
#else
// we can trust expectedWidth if we've got one
if (expectedWidth >= 0 || con_st.tty == nullptr) {
fwrite(utf8_codepoint, length, 1, con_st.out);
return expectedWidth;
}
fputs("\033[6n", con_st.tty); // Query cursor position
int x1, x2, y1, y2;
int results = 0;
results = fscanf(con_st.tty, "\033[%d;%dR", &y1, &x1);
fwrite(utf8_codepoint, length, 1, con_st.tty);
fputs("\033[6n", con_st.tty); // Query cursor position
results += fscanf(con_st.tty, "\033[%d;%dR", &y2, &x2);
if (results != 4) {
return expectedWidth;
}
int width = x2 - x1;
if (width < 0) {
// Calculate the width considering text wrapping
struct winsize w;
ioctl(STDOUT_FILENO, TIOCGWINSZ, &w);
width += w.ws_col;
}
return width;
#endif
}
void replace_last(console_state & con_st, char ch) {
#if defined(_WIN32)
pop_cursor(con_st);
put_codepoint(con_st, &ch, 1, 1);
#else
fprintf(con_st.out, "\b%c", ch);
#endif
}
void append_utf8(char32_t ch, std::string & out) {
if (ch <= 0x7F) {
out.push_back(static_cast<unsigned char>(ch));
} else if (ch <= 0x7FF) {
out.push_back(static_cast<unsigned char>(0xC0 | ((ch >> 6) & 0x1F)));
out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
} else if (ch <= 0xFFFF) {
out.push_back(static_cast<unsigned char>(0xE0 | ((ch >> 12) & 0x0F)));
out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 6) & 0x3F)));
out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
} else if (ch <= 0x10FFFF) {
out.push_back(static_cast<unsigned char>(0xF0 | ((ch >> 18) & 0x07)));
out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 12) & 0x3F)));
out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 6) & 0x3F)));
out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
} else {
// Invalid Unicode code point
}
}
#endif
// Helper function to remove the last UTF-8 character from a string
void pop_back_utf8_char(std::string & line) {
if (line.empty()) {
return;
}
size_t pos = line.length() - 1;
// Find the start of the last UTF-8 character (checking up to 4 bytes back)
for (size_t i = 0; i < 3 && pos > 0; ++i, --pos) {
if ((line[pos] & 0xC0) != 0x80) break; // Found the start of the character
}
line.erase(pos);
}
bool console_readline(console_state & con_st, std::string & line) {
console_set_color(con_st, CONSOLE_COLOR_USER_INPUT);
if (con_st.out != stdout) {
fflush(stdout);
}
line.clear();
std::vector<int> widths;
bool is_special_char = false;
bool end_of_stream = false;
char32_t input_char;
while (true) {
fflush(con_st.out); // Ensure all output is displayed before waiting for input
input_char = getchar32();
if (input_char == '\r' || input_char == '\n') {
break;
}
if (input_char == (char32_t) WEOF || input_char == 0x04 /* Ctrl+D*/) {
end_of_stream = true;
break;
}
if (is_special_char) {
console_set_color(con_st, CONSOLE_COLOR_USER_INPUT);
replace_last(con_st, line.back());
is_special_char = false;
}
if (input_char == '\033') { // Escape sequence
char32_t code = getchar32();
if (code == '[' || code == 0x1B) {
// Discard the rest of the escape sequence
while ((code = getchar32()) != (char32_t) WEOF) {
if ((code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z') || code == '~') {
break;
}
}
}
} else if (input_char == 0x08 || input_char == 0x7F) { // Backspace
if (!widths.empty()) {
int count;
do {
count = widths.back();
widths.pop_back();
// Move cursor back, print space, and move cursor back again
for (int i = 0; i < count; i++) {
replace_last(con_st, ' ');
pop_cursor(con_st);
}
pop_back_utf8_char(line);
} while (count == 0 && !widths.empty());
}
} else {
int offset = line.length();
append_utf8(input_char, line);
int width = put_codepoint(con_st, line.c_str() + offset, line.length() - offset, estimateWidth(input_char));
if (width < 0) {
width = 0;
}
widths.push_back(width);
}
if (!line.empty() && (line.back() == '\\' || line.back() == '/')) {
console_set_color(con_st, CONSOLE_COLOR_PROMPT);
replace_last(con_st, line.back());
is_special_char = true;
}
}
bool has_more = con_st.multiline_input;
if (is_special_char) {
replace_last(con_st, ' ');
pop_cursor(con_st);
char last = line.back();
line.pop_back();
if (last == '\\') {
line += '\n';
fputc('\n', con_st.out);
has_more = !has_more;
} else {
// llama will just eat the single space, it won't act as a space
if (line.length() == 1 && line.back() == ' ') {
line.clear();
pop_cursor(con_st);
}
has_more = false;
}
} else {
if (end_of_stream) {
has_more = false;
} else {
line += '\n';
fputc('\n', con_st.out);
}
}
fflush(con_st.out);
return has_more;
}

View File

@@ -8,47 +8,77 @@
#include <vector>
#include <random>
#include <thread>
#include <unordered_map>
#include <tuple>
#if !defined (_WIN32)
#include <stdio.h>
#include <termios.h>
#endif
//
// CLI argument parsing
//
int32_t get_num_physical_cores();
struct gpt_params {
int32_t seed = -1; // RNG seed
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
int32_t n_predict = 128; // new tokens to predict
int32_t repeat_last_n = 64; // last n tokens to penalize
int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions)
int32_t n_ctx = 512; // context size
int32_t n_batch = 8; // batch size for prompt processing
int32_t n_keep = 0; // number of tokens to keep from initial prompt
uint32_t seed = -1; // RNG seed
int32_t n_threads = get_num_physical_cores();
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 512; // context size
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_gpu_layers = 0; // number of layers to store in VRAM
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
// sampling parameters
int32_t top_k = 40;
float top_p = 0.95f;
float temp = 0.80f;
float repeat_penalty = 1.10f;
std::string model = "models/lamma-7B/ggml-model.bin"; // model path
std::string prompt = "";
std::string input_prefix = ""; // string to prefix user inputs with
std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
int32_t top_k = 40; // <= 0 to use vocab size
float top_p = 0.95f; // 1.0 = disabled
float tfs_z = 1.00f; // 1.0 = disabled
float typical_p = 1.00f; // 1.0 = disabled
float temp = 0.80f; // 1.0 = disabled
float repeat_penalty = 1.10f; // 1.0 = disabled
int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
float frequency_penalty = 0.00f; // 0.0 = disabled
float presence_penalty = 0.00f; // 0.0 = disabled
int mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
float mirostat_tau = 5.00f; // target entropy
float mirostat_eta = 0.10f; // learning rate
std::string model = "models/7B/ggml-model.bin"; // model path
std::string model_alias = "unknown"; // model alias
std::string prompt = "";
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
std::string input_prefix = ""; // string to prefix user inputs with
std::string input_suffix = ""; // string to suffix user inputs with
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
std::string lora_adapter = ""; // lora adapter path
std::string lora_base = ""; // base model path for the lora adapter
bool low_vram = false; // if true, reduce VRAM usage at the cost of performance
bool memory_f16 = true; // use f16 instead of f32 for memory kv
bool random_prompt = false; // do not randomize prompt if none provided
bool use_color = false; // use color to distinguish generations and inputs
bool interactive = false; // interactive mode
bool prompt_cache_all = false; // save user input and generations to prompt cache
bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
bool embedding = false; // get only sentence embedding
bool interactive_start = false; // wait for user input immediately
bool interactive_first = false; // wait for user input immediately
bool multiline_input = false; // reverse the usage of `\`
bool instruct = false; // instruction mode (used for Alpaca models)
bool ignore_eos = false; // do not stop generating after eos
bool penalize_nl = true; // consider newlines as a repeatable token
bool perplexity = false; // compute perplexity over the prompt
bool use_mmap = true; // use mmap for faster loads
bool use_mlock = false; // use mlock to keep model in memory
bool mem_test = false; // compute maximum memory usage
bool numa = false; // attempt optimizations that help on some NUMA systems
bool export_cgraph = false; // export the computation graph
bool verbose_prompt = false; // print prompt tokens before generation
};
@@ -64,6 +94,12 @@ std::string gpt_random_prompt(std::mt19937 & rng);
std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos);
//
// Model utils
//
std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params);
//
// Console utils
//
@@ -80,16 +116,25 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
enum console_color_t {
CONSOLE_COLOR_DEFAULT=0,
CONSOLE_COLOR_PROMPT,
CONSOLE_COLOR_USER_INPUT
CONSOLE_COLOR_USER_INPUT,
CONSOLE_COLOR_ERROR
};
struct console_state {
bool multiline_input = false;
bool use_color = false;
console_color_t color = CONSOLE_COLOR_DEFAULT;
FILE* out = stdout;
#if defined (_WIN32)
void* hConsole;
#else
FILE* tty = nullptr;
termios prev_state;
#endif
};
void set_console_color(console_state & con_st, console_color_t color);
#if defined (_WIN32)
void win32_console_init(bool enable_color);
#endif
void console_init(console_state & con_st);
void console_cleanup(console_state & con_st);
void console_set_color(console_state & con_st, console_color_t color);
bool console_readline(console_state & con_st, std::string & line);

4
examples/embd-input/.gitignore vendored Normal file
View File

@@ -0,0 +1,4 @@
PandaGPT
MiniGPT-4
*.pth

View File

@@ -0,0 +1,15 @@
set(TARGET embdinput)
add_library(${TARGET} embd-input-lib.cpp embd-input.h)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()
set(TARGET embd-input-test)
add_executable(${TARGET} embd-input-test.cpp)
target_link_libraries(${TARGET} PRIVATE common llama embdinput ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

View File

@@ -0,0 +1,63 @@
### Examples for input embedding directly
## Requirement
build `libembdinput.so`
run the following comman in main dir (../../).
```
make
```
## [LLaVA](https://github.com/haotian-liu/LLaVA/) example (llava.py)
1. Obtian LLaVA model (following https://github.com/haotian-liu/LLaVA/ , use https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/).
2. Convert it to ggml format.
3. `llava_projection.pth` is [pytorch_model-00003-of-00003.bin](https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin).
```
import torch
bin_path = "../LLaVA-13b-delta-v1-1/pytorch_model-00003-of-00003.bin"
pth_path = "./examples/embd_input/llava_projection.pth"
dic = torch.load(bin_path)
used_key = ["model.mm_projector.weight","model.mm_projector.bias"]
torch.save({k: dic[k] for k in used_key}, pth_path)
```
4. Check the path of LLaVA model and `llava_projection.pth` in `llava.py`.
## [PandaGPT](https://github.com/yxuansu/PandaGPT) example (panda_gpt.py)
1. Obtian PandaGPT lora model from https://github.com/yxuansu/PandaGPT. Rename the file to `adapter_model.bin`. Use [convert-lora-to-ggml.py](../../convert-lora-to-ggml.py) to convert it to ggml format.
The `adapter_config.json` is
```
{
"peft_type": "LORA",
"fan_in_fan_out": false,
"bias": null,
"modules_to_save": null,
"r": 32,
"lora_alpha": 32,
"lora_dropout": 0.1,
"target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"]
}
```
2. Papare the `vicuna` v0 model.
3. Obtain the [ImageBind](https://dl.fbaipublicfiles.com/imagebind/imagebind_huge.pth) model.
4. Clone the PandaGPT source.
```
git clone https://github.com/yxuansu/PandaGPT
```
5. Install the requirement of PandaGPT.
6. Check the path of PandaGPT source, ImageBind model, lora model and vicuna model in panda_gpt.py.
## [MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4/) example (minigpt4.py)
1. Obtain MiniGPT-4 model from https://github.com/Vision-CAIR/MiniGPT-4/ and put it in `embd-input`.
2. Clone the MiniGPT-4 source.
```
git clone https://github.com/Vision-CAIR/MiniGPT-4/
```
3. Install the requirement of PandaGPT.
4. Papare the `vicuna` v0 model.
5. Check the path of MiniGPT-4 source, MiniGPT-4 model and vicuna model in `minigpt4.py`.

View File

@@ -0,0 +1,223 @@
// Defines sigaction on msys:
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include "embd-input.h"
#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
static llama_context ** g_ctx;
extern "C" {
struct MyModel* create_mymodel(int argc, char ** argv) {
gpt_params params;
if (gpt_params_parse(argc, argv, params) == false) {
return nullptr;
}
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}
fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
llama_init_backend(params.numa);
llama_model * model;
llama_context * ctx;
g_ctx = &ctx;
// load the model and apply lora adapter, if any
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__);
return nullptr;
}
// print system information
{
fprintf(stderr, "\n");
fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
}
struct MyModel * ret = new MyModel();
ret->ctx = ctx;
ret->params = params;
ret->n_past = 0;
// printf("ctx: %d\n", ret->ctx);
return ret;
}
void free_mymodel(struct MyModel * mymodel) {
llama_context * ctx = mymodel->ctx;
llama_print_timings(ctx);
llama_free(ctx);
delete mymodel;
}
bool eval_float(void * model, float * input, int N){
MyModel * mymodel = (MyModel*)model;
llama_context * ctx = mymodel->ctx;
gpt_params params = mymodel->params;
int n_emb = llama_n_embd(ctx);
int n_past = mymodel->n_past;
int n_batch = N; // params.n_batch;
for (int i = 0; i < (int) N; i += n_batch) {
int n_eval = (int) N - i;
if (n_eval > n_batch) {
n_eval = n_batch;
}
if (llama_eval_embd(ctx, (input+i*n_emb), n_eval, n_past, params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return false;
}
n_past += n_eval;
}
mymodel->n_past = n_past;
return true;
}
bool eval_tokens(void * model, std::vector<llama_token> tokens) {
MyModel * mymodel = (MyModel* )model;
llama_context * ctx;
ctx = mymodel->ctx;
gpt_params params = mymodel->params;
int n_past = mymodel->n_past;
for (int i = 0; i < (int) tokens.size(); i += params.n_batch) {
int n_eval = (int) tokens.size() - i;
if (n_eval > params.n_batch) {
n_eval = params.n_batch;
}
if (llama_eval(ctx, &tokens[i], n_eval, n_past, params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return false;
}
n_past += n_eval;
}
mymodel->n_past = n_past;
return true;
}
bool eval_id(struct MyModel* mymodel, int id) {
std::vector<llama_token> tokens;
tokens.push_back(id);
return eval_tokens(mymodel, tokens);
}
bool eval_string(struct MyModel * mymodel,const char* str){
llama_context * ctx = mymodel->ctx;
std::string str2 = str;
std::vector<llama_token> embd_inp = ::llama_tokenize(ctx, str2, true);
eval_tokens(mymodel, embd_inp);
return true;
}
llama_token sampling_id(struct MyModel* mymodel) {
llama_context* ctx = mymodel->ctx;
gpt_params params = mymodel->params;
// int n_ctx = llama_n_ctx(ctx);
// out of user input, sample next token
const float temp = params.temp;
const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
const float top_p = params.top_p;
const float tfs_z = params.tfs_z;
const float typical_p = params.typical_p;
// const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
// const float repeat_penalty = params.repeat_penalty;
// const float alpha_presence = params.presence_penalty;
// const float alpha_frequency = params.frequency_penalty;
const int mirostat = params.mirostat;
const float mirostat_tau = params.mirostat_tau;
const float mirostat_eta = params.mirostat_eta;
// const bool penalize_nl = params.penalize_nl;
llama_token id = 0;
{
auto logits = llama_get_logits(ctx);
auto n_vocab = llama_n_vocab(ctx);
// Apply params.logit_bias map
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
logits[it->first] += it->second;
}
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
// TODO: Apply penalties
// float nl_logit = logits[llama_token_nl()];
// auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
// llama_sample_repetition_penalty(ctx, &candidates_p,
// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
// last_n_repeat, repeat_penalty);
// llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
// last_n_repeat, alpha_frequency, alpha_presence);
// if (!penalize_nl) {
// logits[llama_token_nl()] = nl_logit;
// }
if (temp <= 0) {
// Greedy sampling
id = llama_sample_token_greedy(ctx, &candidates_p);
} else {
if (mirostat == 1) {
static float mirostat_mu = 2.0f * mirostat_tau;
const int mirostat_m = 100;
llama_sample_temperature(ctx, &candidates_p, temp);
id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
} else if (mirostat == 2) {
static float mirostat_mu = 2.0f * mirostat_tau;
llama_sample_temperature(ctx, &candidates_p, temp);
id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
} else {
// Temperature sampling
llama_sample_top_k(ctx, &candidates_p, top_k, 1);
llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
llama_sample_typical(ctx, &candidates_p, typical_p, 1);
llama_sample_top_p(ctx, &candidates_p, top_p, 1);
llama_sample_temperature(ctx, &candidates_p, temp);
id = llama_sample_token(ctx, &candidates_p);
}
}
}
return id;
}
const char * sampling(struct MyModel * mymodel) {
llama_context * ctx = mymodel->ctx;
int id = sampling_id(mymodel);
static std::string ret;
if (id == llama_token_eos()) {
ret = "</s>";
} else {
ret = llama_token_to_str(ctx, id);
}
eval_id(mymodel, id);
return ret.c_str();
}
}

View File

@@ -0,0 +1,35 @@
#include "embd-input.h"
#include <stdlib.h>
#include <random>
#include <string.h>
int main(int argc, char** argv) {
auto mymodel = create_mymodel(argc, argv);
int N = 10;
int max_tgt_len = 500;
int n_embd = llama_n_embd(mymodel->ctx);
// add random float embd to test evaluation
float * data = new float[N*n_embd];
std::default_random_engine e;
std::uniform_real_distribution<float> u(0,1);
for (int i=0;i<N*n_embd;i++) {
data[i] = u(e);
}
eval_string(mymodel, "user: what is the color of the flag of UN?");
eval_float(mymodel, data, N);
eval_string(mymodel, "assistant:");
eval_string(mymodel, mymodel->params.prompt.c_str());
const char* tmp;
for (int i=0; i<max_tgt_len; i++) {
tmp = sampling(mymodel);
if (strcmp(tmp, "</s>")==0) break;
printf("%s", tmp);
fflush(stdout);
}
printf("\n");
free_mymodel(mymodel);
return 0;
}

View File

@@ -0,0 +1,28 @@
#ifndef _EMBD_INPUT_H_
#define _EMBD_INPUT_H_ 1
#include "common.h"
#include "llama.h"
#include "build-info.h"
extern "C" {
typedef struct MyModel {
llama_context* ctx;
gpt_params params;
int n_past = 0;
} MyModel;
struct MyModel* create_mymodel(int argc, char ** argv);
bool eval_float(void* model, float* input, int N);
bool eval_tokens(void* model, std::vector<llama_token> tokens);
bool eval_id(struct MyModel* mymodel, int id);
bool eval_string(struct MyModel* mymodel, const char* str);
const char * sampling(struct MyModel* mymodel);
llama_token sampling_id(struct MyModel* mymodel);
void free_mymodel(struct MyModel* mymodel);
}
#endif

View File

@@ -0,0 +1,71 @@
import ctypes
from ctypes import cdll, c_char_p, c_void_p, POINTER, c_float, c_int
import numpy as np
import os
libc = cdll.LoadLibrary("./libembdinput.so")
libc.sampling.restype=c_char_p
libc.create_mymodel.restype=c_void_p
libc.eval_string.argtypes=[c_void_p, c_char_p]
libc.sampling.argtypes=[c_void_p]
libc.eval_float.argtypes=[c_void_p, POINTER(c_float), c_int]
class MyModel:
def __init__(self, args):
argc = len(args)
c_str = [c_char_p(i.encode()) for i in args]
args_c = (c_char_p * argc)(*c_str)
self.model = c_void_p(libc.create_mymodel(argc, args_c))
self.max_tgt_len = 512
self.print_string_eval = True
def __del__(self):
libc.free_mymodel(self.model)
def eval_float(self, x):
libc.eval_float(self.model, x.astype(np.float32).ctypes.data_as(POINTER(c_float)), x.shape[1])
def eval_string(self, x):
libc.eval_string(self.model, x.encode()) # c_char_p(x.encode()))
if self.print_string_eval:
print(x)
def eval_token(self, x):
libc.eval_id(self.model, x)
def sampling(self):
s = libc.sampling(self.model)
return s
def stream_generate(self, end="</s>"):
ret = b""
end = end.encode()
for _ in range(self.max_tgt_len):
tmp = self.sampling()
ret += tmp
yield tmp
if ret.endswith(end):
break
def generate_with_print(self, end="</s>"):
ret = b""
for i in self.stream_generate(end=end):
ret += i
print(i.decode(errors="replace"), end="", flush=True)
print("")
return ret.decode(errors="replace")
def generate(self, end="</s>"):
text = b"".join(self.stream_generate(end=end))
return text.decode(errors="replace")
if __name__ == "__main__":
model = MyModel(["main", "--model", "../llama.cpp/models/ggml-vic13b-q4_1.bin", "-c", "2048"])
model.eval_string("""user: what is the color of the flag of UN?""")
x = np.random.random((5120,10))# , dtype=np.float32)
model.eval_float(x)
model.eval_string("""assistant:""")
for i in model.generate():
print(i.decode(errors="replace"), end="", flush=True)

View File

@@ -0,0 +1,70 @@
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))
from embd_input import MyModel
import numpy as np
from torch import nn
import torch
from transformers import CLIPVisionModel, CLIPImageProcessor
from PIL import Image
# model parameters from 'liuhaotian/LLaVA-13b-delta-v1-1'
vision_tower = "openai/clip-vit-large-patch14"
select_hidden_state_layer = -2
# (vision_config.image_size // vision_config.patch_size) ** 2
image_token_len = (224//14)**2
class Llava:
def __init__(self, args):
self.image_processor = CLIPImageProcessor.from_pretrained(vision_tower)
self.vision_tower = CLIPVisionModel.from_pretrained(vision_tower)
self.mm_projector = nn.Linear(1024, 5120)
self.model = MyModel(["main", *args])
def load_projection(self, path):
state = torch.load(path)
self.mm_projector.load_state_dict({
"weight": state["model.mm_projector.weight"],
"bias": state["model.mm_projector.bias"]})
def chat(self, question):
self.model.eval_string("user: ")
self.model.eval_string(question)
self.model.eval_string("\nassistant: ")
return self.model.generate_with_print()
def chat_with_image(self, image, question):
with torch.no_grad():
embd_image = self.image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
image_forward_out = self.vision_tower(embd_image.unsqueeze(0), output_hidden_states=True)
select_hidden_state = image_forward_out.hidden_states[select_hidden_state_layer]
image_feature = select_hidden_state[:, 1:]
embd_image = self.mm_projector(image_feature)
embd_image = embd_image.cpu().numpy()[0]
self.model.eval_string("user: ")
self.model.eval_token(32003-2) # im_start
self.model.eval_float(embd_image.T)
for i in range(image_token_len-embd_image.shape[0]):
self.model.eval_token(32003-3) # im_patch
self.model.eval_token(32003-1) # im_end
self.model.eval_string(question)
self.model.eval_string("\nassistant: ")
return self.model.generate_with_print()
if __name__=="__main__":
# model form liuhaotian/LLaVA-13b-delta-v1-1
a = Llava(["--model", "./models/ggml-llava-13b-v1.1.bin", "-c", "2048"])
# Extract from https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin.
# Also here can use pytorch_model-00003-of-00003.bin directly.
a.load_projection(os.path.join(
os.path.dirname(__file__) ,
"llava_projetion.pth"))
respose = a.chat_with_image(
Image.open("./media/llama1-logo.png").convert('RGB'),
"what is the text in the picture?")
respose
a.chat("what is the color of it?")

View File

@@ -0,0 +1,128 @@
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))
from embd_input import MyModel
import numpy as np
from torch import nn
import torch
from PIL import Image
minigpt4_path = os.path.join(os.path.dirname(__file__), "MiniGPT-4")
sys.path.insert(0, minigpt4_path)
from minigpt4.models.blip2 import Blip2Base
from minigpt4.processors.blip_processors import Blip2ImageEvalProcessor
class MiniGPT4(Blip2Base):
"""
MiniGPT4 model from https://github.com/Vision-CAIR/MiniGPT-4
"""
def __init__(self,
args,
vit_model="eva_clip_g",
q_former_model="https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xxl.pth",
img_size=224,
drop_path_rate=0,
use_grad_checkpoint=False,
vit_precision="fp32",
freeze_vit=True,
freeze_qformer=True,
num_query_token=32,
llama_model="",
prompt_path="",
prompt_template="",
max_txt_len=32,
end_sym='\n',
low_resource=False, # use 8 bit and put vit in cpu
device_8bit=0
):
super().__init__()
self.img_size = img_size
self.low_resource = low_resource
self.preprocessor = Blip2ImageEvalProcessor(img_size)
print('Loading VIT')
self.visual_encoder, self.ln_vision = self.init_vision_encoder(
vit_model, img_size, drop_path_rate, use_grad_checkpoint, vit_precision
)
print('Loading VIT Done')
print('Loading Q-Former')
self.Qformer, self.query_tokens = self.init_Qformer(
num_query_token, self.visual_encoder.num_features
)
self.Qformer.cls = None
self.Qformer.bert.embeddings.word_embeddings = None
self.Qformer.bert.embeddings.position_embeddings = None
for layer in self.Qformer.bert.encoder.layer:
layer.output = None
layer.intermediate = None
self.load_from_pretrained(url_or_filename=q_former_model)
print('Loading Q-Former Done')
self.llama_proj = nn.Linear(
self.Qformer.config.hidden_size, 5120 # self.llama_model.config.hidden_size
)
self.max_txt_len = max_txt_len
self.end_sym = end_sym
self.model = MyModel(["main", *args])
# system promt
self.model.eval_string("Give the following image: <Img>ImageContent</Img>. "
"You will be able to see the image once I provide it to you. Please answer my questions."
"###")
def encode_img(self, image):
image = self.preprocessor(image)
image = image.unsqueeze(0)
device = image.device
if self.low_resource:
self.vit_to_cpu()
image = image.to("cpu")
with self.maybe_autocast():
image_embeds = self.ln_vision(self.visual_encoder(image)).to(device)
image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(device)
query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
query_output = self.Qformer.bert(
query_embeds=query_tokens,
encoder_hidden_states=image_embeds,
encoder_attention_mask=image_atts,
return_dict=True,
)
inputs_llama = self.llama_proj(query_output.last_hidden_state)
# atts_llama = torch.ones(inputs_llama.size()[:-1], dtype=torch.long).to(image.device)
return inputs_llama
def load_projection(self, path):
state = torch.load(path)["model"]
self.llama_proj.load_state_dict({
"weight": state["llama_proj.weight"],
"bias": state["llama_proj.bias"]})
def chat(self, question):
self.model.eval_string("Human: ")
self.model.eval_string(question)
self.model.eval_string("\n### Assistant:")
return self.model.generate_with_print(end="###")
def chat_with_image(self, image, question):
with torch.no_grad():
embd_image = self.encode_img(image)
embd_image = embd_image.cpu().numpy()[0]
self.model.eval_string("Human: <Img>")
self.model.eval_float(embd_image.T)
self.model.eval_string("</Img> ")
self.model.eval_string(question)
self.model.eval_string("\n### Assistant:")
return self.model.generate_with_print(end="###")
if __name__=="__main__":
a = MiniGPT4(["--model", "./models/ggml-vicuna-13b-v0-q4_1.bin", "-c", "2048"])
a.load_projection(os.path.join(
os.path.dirname(__file__) ,
"pretrained_minigpt4.pth"))
respose = a.chat_with_image(
Image.open("./media/llama1-logo.png").convert('RGB'),
"what is the text in the picture?")
a.chat("what is the color of it?")

View File

@@ -0,0 +1,98 @@
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))
from embd_input import MyModel
import numpy as np
from torch import nn
import torch
# use PandaGPT path
panda_gpt_path = os.path.join(os.path.dirname(__file__), "PandaGPT")
imagebind_ckpt_path = "./models/panda_gpt/"
sys.path.insert(0, os.path.join(panda_gpt_path,"code","model"))
from ImageBind.models import imagebind_model
from ImageBind import data
ModalityType = imagebind_model.ModalityType
max_tgt_len = 400
class PandaGPT:
def __init__(self, args):
self.visual_encoder,_ = imagebind_model.imagebind_huge(pretrained=True, store_path=imagebind_ckpt_path)
self.visual_encoder.eval()
self.llama_proj = nn.Linear(1024, 5120) # self.visual_hidden_size, 5120)
self.max_tgt_len = max_tgt_len
self.model = MyModel(["main", *args])
self.generated_text = ""
self.device = "cpu"
def load_projection(self, path):
state = torch.load(path, map_location="cpu")
self.llama_proj.load_state_dict({
"weight": state["llama_proj.weight"],
"bias": state["llama_proj.bias"]})
def eval_inputs(self, inputs):
self.model.eval_string("<Img>")
embds = self.extract_multimoal_feature(inputs)
for i in embds:
self.model.eval_float(i.T)
self.model.eval_string("</Img> ")
def chat(self, question):
return self.chat_with_image(None, question)
def chat_with_image(self, inputs, question):
if self.generated_text == "":
self.model.eval_string("###")
self.model.eval_string(" Human: ")
if inputs:
self.eval_inputs(inputs)
self.model.eval_string(question)
self.model.eval_string("\n### Assistant:")
ret = self.model.generate_with_print(end="###")
self.generated_text += ret
return ret
def extract_multimoal_feature(self, inputs):
features = []
for key in ["image", "audio", "video", "thermal"]:
if key + "_paths" in inputs:
embeds = self.encode_data(key, inputs[key+"_paths"])
features.append(embeds)
return features
def encode_data(self, data_type, data_paths):
type_map = {
"image": ModalityType.VISION,
"audio": ModalityType.AUDIO,
"video": ModalityType.VISION,
"thermal": ModalityType.THERMAL,
}
load_map = {
"image": data.load_and_transform_vision_data,
"audio": data.load_and_transform_audio_data,
"video": data.load_and_transform_video_data,
"thermal": data.load_and_transform_thermal_data
}
load_function = load_map[data_type]
key = type_map[data_type]
inputs = {key: load_function(data_paths, self.device)}
with torch.no_grad():
embeddings = self.visual_encoder(inputs)
embeds = embeddings[key]
embeds = self.llama_proj(embeds).cpu().numpy()
return embeds
if __name__=="__main__":
a = PandaGPT(["--model", "./models/ggml-vicuna-13b-v0-q4_1.bin", "-c", "2048", "--lora", "./models/panda_gpt/ggml-adapter-model.bin","--temp", "0"])
a.load_projection("./models/panda_gpt/adapter_model.bin")
a.chat_with_image(
{"image_paths": ["./media/llama1-logo.png"]},
"what is the text in the picture? 'llama' or 'lambda'?")
a.chat("what is the color of it?")

View File

@@ -1,4 +1,7 @@
set(TARGET embedding)
add_executable(${TARGET} embedding.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ggml ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

View File

@@ -1,3 +1,3 @@
# embedding
TODO
# embedding
TODO

View File

@@ -1,9 +1,15 @@
#include "common.h"
#include "llama.h"
#include "build-info.h"
#include <ctime>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
int main(int argc, char ** argv) {
gpt_params params;
params.model = "models/llama-7B/ggml-model.bin";
if (gpt_params_parse(argc, argv, params) == false) {
return 1;
@@ -16,37 +22,29 @@ int main(int argc, char ** argv) {
"expect poor results\n", __func__, params.n_ctx);
}
if (params.seed <= 0) {
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}
fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed);
if (params.random_prompt) {
params.prompt = gpt_random_prompt(rng);
}
llama_init_backend(params.numa);
llama_model * model;
llama_context * ctx;
// load the model
{
auto lparams = llama_context_default_params();
lparams.n_ctx = params.n_ctx;
lparams.n_parts = params.n_parts;
lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16;
lparams.logits_all = params.perplexity;
lparams.use_mlock = params.use_mlock;
lparams.embedding = params.embedding;
ctx = llama_init_from_file(params.model.c_str(), lparams);
if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
return 1;
}
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__);
return 1;
}
// print system information
@@ -64,9 +62,6 @@ int main(int argc, char ** argv) {
// tokenize the prompt
auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
// determine newline token
auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
if (params.verbose_prompt) {
fprintf(stderr, "\n");
fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
@@ -96,6 +91,7 @@ int main(int argc, char ** argv) {
llama_print_timings(ctx);
llama_free(ctx);
llama_free_model(model);
return 0;
}

15
examples/gpt4all.sh Executable file
View File

@@ -0,0 +1,15 @@
#!/bin/bash
#
# Temporary script - will be removed in the future
#
cd `dirname $0`
cd ..
./main --color --instruct --threads 4 \
--model ./models/gpt4all-7B/gpt4all-lora-quantized.bin \
--file ./prompts/alpaca.txt \
--batch_size 8 --ctx_size 2048 -n -1 \
--repeat_last_n 64 --repeat_penalty 1.3 \
--n_predict 128 --temp 0.1 --top_k 40 --top_p 0.95

View File

@@ -0,0 +1,21 @@
# llama.cpp/example/jeopardy
This is pretty much just a straight port of aigoopy/llm-jeopardy/ with an added graph viewer.
The jeopardy test can be used to compare the fact knowledge of different models and compare them to eachother. This is in contrast to some other tests, which test logical deduction, creativity, writing skills, etc.
Step 1: Open jeopardy.sh and modify the following:
```
MODEL=(path to your model)
MODEL_NAME=(name of your model)
prefix=(basically, if you use vicuna it's Human: , if you use something else it might be User: , etc)
opts=(add -instruct here if needed for your model, or anything else you want to test out)
```
Step 2: Run `jeopardy.sh` from the llama.cpp folder
Step 3: Repeat steps 1 and 2 until you have all the results you need.
Step 4: Run `graph.py`, and follow the instructions. At the end, it will generate your final graph.
Note: The Human bar is based off of the full, original 100 sample questions. If you modify the question count or questions, it will not be valid.

View File

@@ -0,0 +1,57 @@
import matplotlib.pyplot as plt
import os
import csv
labels = []
numbers = []
numEntries = 1
rows = []
def bar_chart(numbers, labels, pos):
plt.bar(pos, numbers, color='blue')
plt.xticks(ticks=pos, labels=labels)
plt.title("Jeopardy Results by Model")
plt.xlabel("Model")
plt.ylabel("Questions Correct")
plt.show()
def calculatecorrect():
directory = os.fsencode("./examples/jeopardy/results/")
csv_reader = csv.reader(open("./examples/jeopardy/qasheet.csv", 'rt'), delimiter=',')
for row in csv_reader:
global rows
rows.append(row)
for listing in os.listdir(directory):
filename = os.fsdecode(listing)
if filename.endswith(".txt"):
file = open("./examples/jeopardy/results/" + filename, "rt")
global labels
global numEntries
global numbers
labels.append(filename[:-4])
numEntries += 1
i = 1
totalcorrect = 0
for line in file.readlines():
if line.strip() != "------":
print(line)
else:
print("Correct answer: " + rows[i][2] + "\n")
i += 1
print("Did the AI get the question right? (y/n)")
if input() == "y":
totalcorrect += 1
numbers.append(totalcorrect)
if __name__ == '__main__':
calculatecorrect()
pos = list(range(numEntries))
labels.append("Human")
numbers.append(48.11)
bar_chart(numbers, labels, pos)
print(labels)
print(numbers)

View File

@@ -0,0 +1,30 @@
#!/bin/bash
set -e
MODEL=./models/ggml-vicuna-13b-1.1-q4_0.bin
MODEL_NAME=Vicuna
# exec options
prefix="Human: " # Ex. Vicuna uses "Human: "
opts="--temp 0 -n 80" # additional flags
nl='
'
introduction="You will be playing a game of Jeopardy. Simply answer the question in the correct format (Ex. What is Paris, or Who is George Washington)."
# file options
question_file=./examples/jeopardy/questions.txt
touch ./examples/jeopardy/results/$MODEL_NAME.txt
output_file=./examples/jeopardy/results/$MODEL_NAME.txt
counter=1
echo 'Running'
while IFS= read -r question
do
exe_cmd="./main -p "\"$prefix$introduction$nl$prefix$question\"" "$opts" -m ""\"$MODEL\""" >> ""\"$output_file\""
echo $counter
echo "Current Question: $question"
eval "$exe_cmd"
echo -e "\n------" >> $output_file
counter=$((counter+1))
done < "$question_file"

View File

@@ -0,0 +1,103 @@
Index,Original Category,Original Correct Question,Model Prompt
1,The Oscars,Who is John Williams?,Which actor Born in 1932 was the son of a percussionist in the CBS radio orchestra has been nominated for 53 Oscars?
2,English Literature,What is Paradise Lost?,"What work in English Literature says: 'The mind is its own place, & in itself can make a heaven of hell, a hell of heaven. What matter where, if I be still the same'?"
3,Writers Lesser-Known Works,Who is Niccolò Machiavelli?,"Known for more philosophical works, he wrote the play 'La Mandragola', in which Florentines are rewarded for immoral actions?"
4,Exploration,What is Easter Island (Rapa Nui)?,"James Cook's account of a 1774 visit where records an object 'near 27 feet long, and upwards of 8 feet over the breast or shoulders'?"
5,The Bill of Rights,What is the Eighth Amendment?,England's 'Bloody Assizes' & a 1685 life sentence for perjury were 2 main origins of which amendment to the U.S. Constitution?
6,Nobel Peace Prize Winners,Who are Nelson Mandela & Desmond Tutu?,"Which nobel peace price winners each lived at times on Vilakazi St. in Soweto , so it claims to be the world's only street home to 2 Nobel Peace Prize winners?"
7,Famous Names,Who is Walt Disney?,"In 1966, the year of who's death did he share plans for an experimental prototype community in Florida?"
8,Geography,What is Colombia?,"Of the 13 nations through which the Equator passes, what is the only one whose coastline borders the Caribbean Sea?"
9,Fashion History,What are rhinestones?,"Which decorative items in fashion history get their name from their origin in the port city of Strasbourg, on the border of France & Germany?"
10,Movies of the 80s,What is Driving Miss Daisy?,What 1980's movie is based on an off-Broadway play with just 3 characters and won the Best Picture Oscar & the actors in all 3 roles were nominated?
11,Novelists,Who is John Grisham?,"A 2012 book review for which novelist noted subjects that 'sparked his ire': capital punishment, big tobacco & 'the plight of the unjustly convicted'?"
12,20th Century Eponyms,What is the Maginot Line?,"A 1940 headline about what 20th Century Eponym included 'failure', 'liability when it came to offense' & 'stout hearts no match for tanks'?"
13,City History,What is Stockholm?,"Over 700 years after its traditional 1252 founding date, what port city became associated with a psychological response?"
14,Brand Names,What is Jacuzzi?,"The success of what brand has its roots with a hydrotherapy pump its cofounder created for his son, who had arthritis?"
15,American Authors,Who is Washington Irving?,"In a periodical in 1807, what American Author called New York City 'Gotham, Gotham! Most enlightened of cities'?"
16,Symbols,What is “less than”?,What symbol is a rotated V in math and a feeling of some marginalized or underrepresented people in society?
17,Movie Theme Songs,Who is James Bond?,"Monty Norman, the composer of what character's theme, said the staccato riff conveyed sexiness, mystery & ruthlessness?"
18,American Novelists,Who is Joseph Heller?,"What American Novelist served with an airman named Yohannan in World War II & despite what readers might think, he said he enjoyed his service?"
19,Medieval Places,"What is Canterbury, England? (Canterbury Cathedral)","In what Medieval place did one of the participants in an 1170 event say, 'Let us away, knights; he will rise no more'?"
20,Countries of Africa,What is Morocco?,"At one time a province of the Roman Empire, what African country kingdom is known to Arabic scholars as Al-Maghrib Al-Aqsa, 'the far west'?"
21,Statehood,What is Wyoming?,Congress relented in 1890 after what prospective state said it would wait 100 years rather than come in without the women?
22,1980s Movies,What is Raiders of the Lost Ark?,"A writer & producer of what movie said he wanted it to be like a Western or James Bond film, 'only it takes place in the 30s'?"
23,Art Exhibitions,Who is Rembrandt?,In 1898 what's been called the first blockbuster art show was devoted to which artist & put on for Queen Wilhelmina's coronation?
24,Countries of the World,What is Mongolia?,"Part of the largest contiguous land empire during the 1200s & 1300s, today what is the world's second-largest landlocked country?"
25,Literature,What is “Howl”?,A 2006 book was titled 'The Poem That Changed America:' What 'Fifty Years Later'?
26,Invasions,Who is William of Orange?,"Backed by 14,000 troops, who invaded England to restore, in his words, its 'religion, laws, and liberties'?"
27,Landmarks,What is the Eiffel Tower?,"After its completion in the late 19th c., what was landmark was called 'a truly tragic street lamp' & a 'high & skinny pyramid of iron ladders'?"
28,Geographic Names the Same,What is Dover?,"The busiest passenger port in the U.K., what shares its name with a capital of one of the original 13 states?"
29,Names in the Bookstore,Who is Peter Mark Roget?,"This man made lists, perhaps to cope with depression; a set of lists he published in 1852 made whose name synonymous with a type of book?"
30,U.S. History,Who is Dr. Samuel Mudd?,"An 1869 presidential pardon was granted to which man, due in part to a plea by the Medical Society of Harford County, Maryland?"
31,American Literature,What is The Things They Carried?,"Letters, pocket knives, C rations & steel helmets are among the tangible items referred to in the title of what American literature modern war classic?"
32,Nonfiction,What is The Communist Manifesto,"What nonfiction book has the line, 'The discovery of America…opened up fresh ground for the rising bourgeoisie'?"
33, a new version was passed 81 years later,Laws in U.S. History,What is the Civil Rights Act?,,,,,,,,,,,,,,,,,,0, 2/3
34,Names of Myth,Who is Helen of Troy?,"Whose brothers, Castor & Pollux, saved her after Theseus stole her away as a kid; a larger force would seek her later in life?"
35,African Countries,What is Sudan?,"Once Africa's largest country in area, what African Country dropped to third in 2011 when a portion of it declared independence?"
36,The Ancient World,What is Alexandria?,"The ancient writer Galen said books on ships arriving to what city's port were seized, originals kept & copies returned?"
37,Famous Names,Who is Andy Warhol?,"For a special 1970s cookbook, who provided one simple recipea can of Campbell's tomato soup & 2 cans of milk?"
38,People & Places,What is Guam?,"Thought to descend from people of Southeast Asia, the Chamorro make up what U.S. territorys largest ethnic group?"
39,Current World Leaders,What is the Philippines?,"In office from 2022, the president of what country has taken so many foreign trips a play on his name is 'Ferdinand Magellan Jr.'?"
40,Writers & The South,Who is Tennessee Williams?,In 1939 which writer lived on Toulouse Street in the French Quarter & chose the professional name that bonded him to the South?
41,National Parks,What is Yellowstone?,"What National Park is named for a river indigenous people called Mi tse a-da-zi, translated by French-speaking trappers as 'Pierre Jaune'?"
42,Sports,Who are the Harlem Globetrotters?,"In 2010 who introduced the 4-point shot, 35 feet from the basket?"
43,The U.S. Military,What is “Top Gun”?,Losses over Asia in the 1960s led to the establishment of the program known as what at a San Diego naval base in 1969?
44,Art & Science,What is Halleys Comet?,"A craft that visited what was named for Giotto, based on the story that 680 years earlier, the painter depicted it as the Star of Bethlehem?"
45,Words From World War I,What is “tank”?,"In World War I, 'Cistern' & 'reservoir' were suggested names for what secret invention, but the British preferred this less clumsy monosyllable?"
46,European History,What is Holy Roman Emperor?,"Until 1806, some German nobles included among their honors the title of 'Elector' for their role in selecting this personage?"
47,Theater History,Who is Peter Pan?,"In 1904, wearing a harness, actress Nina Boucicault became the first to play what character onstage?"
48,European Cities,What is Aachen?,"Alphabetically the first German city in encyclopedias, what was also the first one taken by the Allies in World War II?"
49,Word Origins,What is mantra?,This Sanskrit word referring to a spoken word or phrase comes from a word for 'to think'?
50,Inventions,What is barbed wire?,1917's 'Elements of Trench Warfare' said what Old West invention was 'difficult to destroy' & 'difficult to get through'?
51,World War II,What is Schindlers list?,"Mimi Reinhard, who never learned to type using more than 2 fingers, produced what in World War II with 1,100 names, including hers?"
52, their offspring was the source of this mythical object,Mythology,What is the Golden Fleece?
53,Literature,What is Pride and Prejudice?,"Published in 2011, P.D. James' final novel, 'Death Comes to Pemberley', was a sequel to what novel from 200 years earlier?"
54, only these 2 west of the Mississippi River border each other,U.S. State Names,What are Oregon & Nevada?
55,Word Origins,What is passion?,"Originally relating to a story of suffering, what word now more commonly refers to strong emotion of any kind?"
56,World Cinema,What is La Vie en Rose?,"The 2007 biopic called 'La Môme' in France, meaning 'The Kid', was released in the U.S. under what other French title?"
57,History,What is Santa Maria?,"Returning home in 1493, Columbus stopped in the Azores at an island with what name, also something he'd lost off the Haiti coast?"
58,Landmarks,What is a kremlin?,Pskov & Nizhny Novgorod are 2 of the cities that have a fortress called what?
59,Foreign-Born Authors,Who is Vladimir Nabokov?,In the 1950s the New York Times said what author 'is writing about all lust' & his lecherous narrator 'is all of us'?
60,Astronomy & Geography,What is Capricorn?,"At the winter solstice, the sun is in Sagittarius; it once appeared in what constellation, giving a geographic feature its name?"
61,Television,What is Law & Order?,"Mike Post combined the sound of a slamming jail door, an anvil & 100 men stomping on a floor for what television series that debuted in 1990?"
62,British Landmarks,What is the Tower of London?,"Like Sir Thomas More, 3 16th century English queens are buried at what British location?"
63,Early American History,What are witches?,"In 1692 Increase Mather wrote, 'It were better that ten suspected' of these who 'escape, than that one innocent person … be condemned'?"
64,Geography Mnemonics,What are Arkansas and Louisiana?,"The Geography Mnemonic Mimal, sometimes said to be the silhouette of a chef or elf, stands for Minnesota, Iowa, Missouri, and what other 2 states?"
65,Business Milestones,What is the Ford Model T?,"What was first sold in 1908, at a price equivalent to about $27,000 today?"
66,In The Bookstore,Who is Tom Clancy?,The name of what author dead since 2013 now appears on books written by a former U.S. marshal & a former Apache helicopter pilot?
67,Historic Art,What is the Bayeux Tapestry?,The artwork once known in France as 'la tapisserie de la Reine Mathilde' is better known as what?
68,Pop Stars,Who is Madonna?,In 2022 which pop star became the first woman to have a Billboard Top 10 album in 5 decades starting with the 1980s?
69,Classic Tale Characters,Who is Scheherazade?,"In one 19th century translation, what female classic tale character 'perceived the dawn of day and ceased' speaking nearly 1,000 times?"
70,USA,What is Jack Daniels?,"Ironically, though what company founded in the 1860s is Moore County, Tennessee's largest employer, Moore is a dry county?"
71,Historic People,Who was William Bligh?,"After a 1789 event, who wrote, 'My first determination was to seek a supply of…water at Tofoa, & afterwards to sail for Tongataboo'?"
72,The Movies,What is The Godfather?,Laurence Olivier & Ernest Borgnine were considered for the lead role & Sergio Leone to direct for what film that turned 50 in 2022?
73,Continental Geography,What is Colombia?,"Until a 1903 secession, what country's contiguous territory spanned 2 continents?"
74,Foreign-Born Authors,Who is Isabel Allende?,"Early in her career which foreign-born author translated romance novels into Spanish, often changing the dialogue to make the heroines smarter?"
75,Historic Crimes,What is the Mona Lisa?,"Saying it was stolen by Napoleon, self-styled Italian patriot Vincenzo Peruggia took what in 1911?"
76,U.S. Bodies of Water,What is Lake Mead?,"Continuing a downward trend, in July 2022 what US body of water was at 27% capacity, its lowest level since 1937 when it was first being filled?"
77,Gods & Goddesses,Who is Aurora (or Eos)?,"Each morning which goddess began her ride in her chariot across the sky ahead of her brother Sol, or Helios?"
78,America At War,What is the Battle of New Orleans?,"Until the Civil War, the Jan. 8 date of what American battle of dubious military importance but big morale value was a national holiday?"
79,Childrens Books,What is The Velveteen Rabbit?,"Which children's book title character is told 'By the time you are real, most of your hair has been loved off your eyes drop out & you get shabby'?"
80,TV Finales,What is Grace and Frankie?,"In a TV reunion over 40 years in the making, Dolly Parton appeared as an angel named Agnes in the final episode of what comedy in 2022?"
81,American Poems,Who is Evangeline?,"In an 1847 American poem what character sees her town of Grand-Pré burned, but finally reunites with her beau for a kiss before his death?"
82,Famous Names,Who is Banksy?,"In 2001 who published a book called 'Banging Your Head Against a Brick Wall'; in 2002, 'Existencilism'?"
83,Childrens Lit,What is Charlottes Web?,The title object of what childrens book 'never looked more beautiful each strand held dozens of bright drops of early morning dew'?
84,Classic Songs,What is “Here Comes Santa Claus”?,The shouts of excited children at a 1946 holiday parade are said to have inspired what perennial classic song favorite?
85,Brand Names,What are Milk Duds?,"Unable to make what candies perfectly round, the confectioner embraced this flawed name for the product?"
86,Countries of the World,What is Italy?,"What country is home to 58 UNESCO World Heritage Sites, more than any other country; the sites include a volcano & a lagoon?"
87,Action Movies,What is Die Hard?,"What action movie's last line is 'If this is their idea of Christmas, I gotta be here for New Years'?"
88,Presidential Facts,Who is Woodrow Wilson?,Only 3 presidents have married while in office— John Tyler was the first & which one was the last?
89,19th Century Americans,Who is Frederick Douglass?,"Demonstrating the dignity & humanity of Black Americans, who sat for 160 known photographs, the most of any American in the 19th century?"
90,Latin Phrases,What is “quid pro quo”?,"Originally, which Latin 3-word phrase referred to when a doctor or apothecary substituted one medicine for another?"
91,1970s Movies,What is Monty Python and the Holy Grail?,The 1975 premiere of what movie comedy advertised free coconuts for the first thousand in the audience?
92,Names The Same,What is Manhattan?,"A cocktail, an island & a WWII venture originally called 'Development of Substitute Materials' all bear what name?"
93,U.S. Presidents,Who is Calvin Coolidge?,"Which US President was sworn in twice as President within 2 years, first by his father & then later by a former U.S. President?"
94,Plays,What is The Tempest?,A 1609 story in which an exiled king of Bulgaria creates a sea palace with his magic may have inspired the plot of what play?
95,Landmarks,What is the Berlin Wall?,"In 2009, during a 20th anniversary celebration, what landmark was called 'an edifice of fear. On Nov. 9, it became a place of joy'?"
96,World Capitals,"What is Vienna, Austria?","Among what world capital's nicknames are the 'City of Classical Music' &, possibly in honor of a famous resident from 1860 to 1938, the 'City of Dreams'?"
97,Language & Its Meanings,What is a night owl?,"Now meaning someone with nocturnal habits, what catches a sleeping dove in Shakespeare's 'Lucrece'?"
98,Flags of Our Hemisphere,What is Brazil?,"The stars on what country's flag represent states, 26 of them; unlike the USA's, its 'federal district' gets its own 27th star?"
99,Names in U.S. History,Who is Oliver Brown?,What father was the only man among the 13 plaintiffs in a US class-action case filed in 1951?
100,Childrens Authors,"Who is Sarah? (from Sarah, Plain and Tall)","Reversing the story of what heroine she created, childrens author Patricia Maclachlan was born on the prairie but spent much of her life in New England?"
,,,
TOTALS,,,
1 Index Original Category Original Correct Question Model Prompt
2 1 The Oscars Who is John Williams? Which actor Born in 1932 was the son of a percussionist in the CBS radio orchestra has been nominated for 53 Oscars?
3 2 English Literature What is Paradise Lost? What work in English Literature says: 'The mind is its own place, & in itself can make a heaven of hell, a hell of heaven. What matter where, if I be still the same'?
4 3 Writers’ Lesser-Known Works Who is Niccolò Machiavelli? Known for more philosophical works, he wrote the play 'La Mandragola', in which Florentines are rewarded for immoral actions?
5 4 Exploration What is Easter Island (Rapa Nui)? James Cook's account of a 1774 visit where records an object 'near 27 feet long, and upwards of 8 feet over the breast or shoulders'?
6 5 The Bill of Rights What is the Eighth Amendment? England's 'Bloody Assizes' & a 1685 life sentence for perjury were 2 main origins of which amendment to the U.S. Constitution?
7 6 Nobel Peace Prize Winners Who are Nelson Mandela & Desmond Tutu? Which nobel peace price winners each lived at times on Vilakazi St. in Soweto , so it claims to be the world's only street home to 2 Nobel Peace Prize winners?
8 7 Famous Names Who is Walt Disney? In 1966, the year of who's death did he share plans for an experimental prototype community in Florida?
9 8 Geography What is Colombia? Of the 13 nations through which the Equator passes, what is the only one whose coastline borders the Caribbean Sea?
10 9 Fashion History What are rhinestones? Which decorative items in fashion history get their name from their origin in the port city of Strasbourg, on the border of France & Germany?
11 10 Movies of the ’80s What is Driving Miss Daisy? What 1980's movie is based on an off-Broadway play with just 3 characters and won the Best Picture Oscar & the actors in all 3 roles were nominated?
12 11 Novelists Who is John Grisham? A 2012 book review for which novelist noted subjects that 'sparked his ire': capital punishment, big tobacco & 'the plight of the unjustly convicted'?
13 12 20th Century Eponyms What is the Maginot Line? A 1940 headline about what 20th Century Eponym included 'failure', 'liability when it came to offense' & 'stout hearts no match for tanks'?
14 13 City History What is Stockholm? Over 700 years after its traditional 1252 founding date, what port city became associated with a psychological response?
15 14 Brand Names What is Jacuzzi? The success of what brand has its roots with a hydrotherapy pump its cofounder created for his son, who had arthritis?
16 15 American Authors Who is Washington Irving? In a periodical in 1807, what American Author called New York City 'Gotham, Gotham! Most enlightened of cities'?
17 16 Symbols What is “less than”? What symbol is a rotated V in math and a feeling of some marginalized or underrepresented people in society?
18 17 Movie Theme Songs Who is James Bond? Monty Norman, the composer of what character's theme, said the staccato riff conveyed sexiness, mystery & ruthlessness?
19 18 American Novelists Who is Joseph Heller? What American Novelist served with an airman named Yohannan in World War II & despite what readers might think, he said he enjoyed his service?
20 19 Medieval Places What is Canterbury, England? (Canterbury Cathedral) In what Medieval place did one of the participants in an 1170 event say, 'Let us away, knights; he will rise no more'?
21 20 Countries of Africa What is Morocco? At one time a province of the Roman Empire, what African country kingdom is known to Arabic scholars as Al-Maghrib Al-Aqsa, 'the far west'?
22 21 Statehood What is Wyoming? Congress relented in 1890 after what prospective state said it would wait 100 years rather than come in without the women?
23 22 1980s Movies What is Raiders of the Lost Ark? A writer & producer of what movie said he wanted it to be like a Western or James Bond film, 'only it takes place in the 30s'?
24 23 Art Exhibitions Who is Rembrandt? In 1898 what's been called the first blockbuster art show was devoted to which artist & put on for Queen Wilhelmina's coronation?
25 24 Countries of the World What is Mongolia? Part of the largest contiguous land empire during the 1200s & 1300s, today what is the world's second-largest landlocked country?
26 25 Literature What is “Howl”? A 2006 book was titled 'The Poem That Changed America:' What 'Fifty Years Later'?
27 26 Invasions Who is William of Orange? Backed by 14,000 troops, who invaded England to restore, in his words, its 'religion, laws, and liberties'?
28 27 Landmarks What is the Eiffel Tower? After its completion in the late 19th c., what was landmark was called 'a truly tragic street lamp' & a 'high & skinny pyramid of iron ladders'?
29 28 Geographic Name’s the Same What is Dover? The busiest passenger port in the U.K., what shares its name with a capital of one of the original 13 states?
30 29 Names in the Bookstore Who is Peter Mark Roget? This man made lists, perhaps to cope with depression; a set of lists he published in 1852 made whose name synonymous with a type of book?
31 30 U.S. History Who is Dr. Samuel Mudd? An 1869 presidential pardon was granted to which man, due in part to a plea by the Medical Society of Harford County, Maryland?
32 31 American Literature What is The Things They Carried? Letters, pocket knives, C rations & steel helmets are among the tangible items referred to in the title of what American literature modern war classic?
33 32 Nonfiction What is The Communist Manifesto What nonfiction book has the line, 'The discovery of America…opened up fresh ground for the rising bourgeoisie'?
34 33 a new version was passed 81 years later Laws in U.S. History What is the Civil Rights Act? 0 2/3
35 34 Names of Myth Who is Helen of Troy? Whose brothers, Castor & Pollux, saved her after Theseus stole her away as a kid; a larger force would seek her later in life?
36 35 African Countries What is Sudan? Once Africa's largest country in area, what African Country dropped to third in 2011 when a portion of it declared independence?
37 36 The Ancient World What is Alexandria? The ancient writer Galen said books on ships arriving to what city's port were seized, originals kept & copies returned?
38 37 Famous Names Who is Andy Warhol? For a special 1970s cookbook, who provided one simple recipe–a can of Campbell's tomato soup & 2 cans of milk?
39 38 People & Places What is Guam? Thought to descend from people of Southeast Asia, the Chamorro make up what U.S. territory’s largest ethnic group?
40 39 Current World Leaders What is the Philippines? In office from 2022, the president of what country has taken so many foreign trips a play on his name is 'Ferdinand Magellan Jr.'?
41 40 Writers & The South Who is Tennessee Williams? In 1939 which writer lived on Toulouse Street in the French Quarter & chose the professional name that bonded him to the South?
42 41 National Parks What is Yellowstone? What National Park is named for a river indigenous people called Mi tse a-da-zi, translated by French-speaking trappers as 'Pierre Jaune'?
43 42 Sports Who are the Harlem Globetrotters? In 2010 who introduced the 4-point shot, 35 feet from the basket?
44 43 The U.S. Military What is “Top Gun”? Losses over Asia in the 1960s led to the establishment of the program known as what at a San Diego naval base in 1969?
45 44 Art & Science What is Halley’s Comet? A craft that visited what was named for Giotto, based on the story that 680 years earlier, the painter depicted it as the Star of Bethlehem?
46 45 Words From World War I What is “tank”? In World War I, 'Cistern' & 'reservoir' were suggested names for what secret invention, but the British preferred this less clumsy monosyllable?
47 46 European History What is Holy Roman Emperor? Until 1806, some German nobles included among their honors the title of 'Elector' for their role in selecting this personage?
48 47 Theater History Who is Peter Pan? In 1904, wearing a harness, actress Nina Boucicault became the first to play what character onstage?
49 48 European Cities What is Aachen? Alphabetically the first German city in encyclopedias, what was also the first one taken by the Allies in World War II?
50 49 Word Origins What is mantra? This Sanskrit word referring to a spoken word or phrase comes from a word for 'to think'?
51 50 Inventions What is barbed wire? 1917's 'Elements of Trench Warfare' said what Old West invention was 'difficult to destroy' & 'difficult to get through'?
52 51 World War II What is Schindler’s list? Mimi Reinhard, who never learned to type using more than 2 fingers, produced what in World War II with 1,100 names, including hers?
53 52 their offspring was the source of this mythical object Mythology What is the Golden Fleece?
54 53 Literature What is Pride and Prejudice? Published in 2011, P.D. James' final novel, 'Death Comes to Pemberley', was a sequel to what novel from 200 years earlier?
55 54 only these 2 west of the Mississippi River border each other U.S. State Names What are Oregon & Nevada?
56 55 Word Origins What is passion? Originally relating to a story of suffering, what word now more commonly refers to strong emotion of any kind?
57 56 World Cinema What is La Vie en Rose? The 2007 biopic called 'La Môme' in France, meaning 'The Kid', was released in the U.S. under what other French title?
58 57 History What is Santa Maria? Returning home in 1493, Columbus stopped in the Azores at an island with what name, also something he'd lost off the Haiti coast?
59 58 Landmarks What is a kremlin? Pskov & Nizhny Novgorod are 2 of the cities that have a fortress called what?
60 59 Foreign-Born Authors Who is Vladimir Nabokov? In the 1950s the New York Times said what author 'is writing about all lust' & his lecherous narrator 'is all of us'?
61 60 Astronomy & Geography What is Capricorn? At the winter solstice, the sun is in Sagittarius; it once appeared in what constellation, giving a geographic feature its name?
62 61 Television What is Law & Order? Mike Post combined the sound of a slamming jail door, an anvil & 100 men stomping on a floor for what television series that debuted in 1990?
63 62 British Landmarks What is the Tower of London? Like Sir Thomas More, 3 16th century English queens are buried at what British location?
64 63 Early American History What are witches? In 1692 Increase Mather wrote, 'It were better that ten suspected' of these who 'escape, than that one innocent person … be condemned'?
65 64 Geography Mnemonics What are Arkansas and Louisiana? The Geography Mnemonic Mimal, sometimes said to be the silhouette of a chef or elf, stands for Minnesota, Iowa, Missouri, and what other 2 states?
66 65 Business Milestones What is the Ford Model T? What was first sold in 1908, at a price equivalent to about $27,000 today?
67 66 In The Bookstore Who is Tom Clancy? The name of what author dead since 2013 now appears on books written by a former U.S. marshal & a former Apache helicopter pilot?
68 67 Historic Art What is the Bayeux Tapestry? The artwork once known in France as 'la tapisserie de la Reine Mathilde' is better known as what?
69 68 Pop Stars Who is Madonna? In 2022 which pop star became the first woman to have a Billboard Top 10 album in 5 decades starting with the 1980s?
70 69 Classic Tale Characters Who is Scheherazade? In one 19th century translation, what female classic tale character 'perceived the dawn of day and ceased' speaking nearly 1,000 times?
71 70 USA What is Jack Daniel’s? Ironically, though what company founded in the 1860s is Moore County, Tennessee's largest employer, Moore is a dry county?
72 71 Historic People Who was William Bligh? After a 1789 event, who wrote, 'My first determination was to seek a supply of…water at Tofoa, & afterwards to sail for Tongataboo'?
73 72 The Movies What is The Godfather? Laurence Olivier & Ernest Borgnine were considered for the lead role & Sergio Leone to direct for what film that turned 50 in 2022?
74 73 Continental Geography What is Colombia? Until a 1903 secession, what country's contiguous territory spanned 2 continents?
75 74 Foreign-Born Authors Who is Isabel Allende? Early in her career which foreign-born author translated romance novels into Spanish, often changing the dialogue to make the heroines smarter?
76 75 Historic Crimes What is the Mona Lisa? Saying it was stolen by Napoleon, self-styled Italian patriot Vincenzo Peruggia took what in 1911?
77 76 U.S. Bodies of Water What is Lake Mead? Continuing a downward trend, in July 2022 what US body of water was at 27% capacity, its lowest level since 1937 when it was first being filled?
78 77 Gods & Goddesses Who is Aurora (or Eos)? Each morning which goddess began her ride in her chariot across the sky ahead of her brother Sol, or Helios?
79 78 America At War What is the Battle of New Orleans? Until the Civil War, the Jan. 8 date of what American battle of dubious military importance but big morale value was a national holiday?
80 79 Children’s Books What is The Velveteen Rabbit? Which children's book title character is told 'By the time you are real, most of your hair has been loved off your eyes drop out & you get shabby'?
81 80 TV Finales What is Grace and Frankie? In a TV reunion over 40 years in the making, Dolly Parton appeared as an angel named Agnes in the final episode of what comedy in 2022?
82 81 American Poems Who is Evangeline? In an 1847 American poem what character sees her town of Grand-Pré burned, but finally reunites with her beau for a kiss before his death?
83 82 Famous Names Who is Banksy? In 2001 who published a book called 'Banging Your Head Against a Brick Wall'; in 2002, 'Existencilism'?
84 83 Children’s Lit What is Charlotte’s Web? The title object of what childrens book 'never looked more beautiful each strand held dozens of bright drops of early morning dew'?
85 84 Classic Songs What is “Here Comes Santa Claus”? The shouts of excited children at a 1946 holiday parade are said to have inspired what perennial classic song favorite?
86 85 Brand Names What are Milk Duds? Unable to make what candies perfectly round, the confectioner embraced this flawed name for the product?
87 86 Countries of the World What is Italy? What country is home to 58 UNESCO World Heritage Sites, more than any other country; the sites include a volcano & a lagoon?
88 87 Action Movies What is Die Hard? What action movie's last line is 'If this is their idea of Christmas, I gotta be here for New Years'?
89 88 Presidential Facts Who is Woodrow Wilson? Only 3 presidents have married while in office— John Tyler was the first & which one was the last?
90 89 19th Century Americans Who is Frederick Douglass? Demonstrating the dignity & humanity of Black Americans, who sat for 160 known photographs, the most of any American in the 19th century?
91 90 Latin Phrases What is “quid pro quo”? Originally, which Latin 3-word phrase referred to when a doctor or apothecary substituted one medicine for another?
92 91 1970s Movies What is Monty Python and the Holy Grail? The 1975 premiere of what movie comedy advertised free coconuts for the first thousand in the audience?
93 92 Name’s The Same What is Manhattan? A cocktail, an island & a WWII venture originally called 'Development of Substitute Materials' all bear what name?
94 93 U.S. Presidents Who is Calvin Coolidge? Which US President was sworn in twice as President within 2 years, first by his father & then later by a former U.S. President?
95 94 Plays What is The Tempest? A 1609 story in which an exiled king of Bulgaria creates a sea palace with his magic may have inspired the plot of what play?
96 95 Landmarks What is the Berlin Wall? In 2009, during a 20th anniversary celebration, what landmark was called 'an edifice of fear. On Nov. 9, it became a place of joy'?
97 96 World Capitals What is Vienna, Austria? Among what world capital's nicknames are the 'City of Classical Music' &, possibly in honor of a famous resident from 1860 to 1938, the 'City of Dreams'?
98 97 Language & Its Meanings What is a night owl? Now meaning someone with nocturnal habits, what catches a sleeping dove in Shakespeare's 'Lucrece'?
99 98 Flags of Our Hemisphere What is Brazil? The stars on what country's flag represent states, 26 of them; unlike the USA's, its 'federal district' gets its own 27th star?
100 99 Names in U.S. History Who is Oliver Brown? What father was the only man among the 13 plaintiffs in a US class-action case filed in 1951?
101 100 Children’s Authors Who is Sarah? (from Sarah, Plain and Tall) Reversing the story of what heroine she created, childrens author Patricia Maclachlan was born on the prairie but spent much of her life in New England?
102
103 TOTALS

View File

@@ -0,0 +1,100 @@
Which man born in 1932 was the son of a percussionist in the CBS radio orchestra has been nominated for 53 Oscars?
What work in English Literature says: 'The mind is its own place, & in itself can make a heaven of hell, a hell of heaven. What matter where, if I be still the same'?
Known for more philosophical works, he wrote the play 'La Mandragola', in which Florentines are rewarded for immoral actions?
James Cook's account of a 1774 visit where records an object 'near 27 feet long, and upwards of 8 feet over the breast or shoulders'?
England's 'Bloody Assizes' & a 1685 life sentence for perjury were 2 main origins of which amendment to the U.S. Constitution?
Which nobel peace price winners each lived at times on Vilakazi St. in Soweto , so it claims to be the world's only street home to 2 Nobel Peace Prize winners?
In 1966, the year of who's death did he share plans for an experimental prototype community in Florida?
Of the 13 nations through which the Equator passes, what is the only one whose coastline borders the Caribbean Sea?
Which decorative items in fashion history get their name from their origin in the port city of Strasbourg, on the border of France & Germany?
What 1980's movie is based on an off-Broadway play with just 3 characters and won the Best Picture Oscar & the actors in all 3 roles were nominated?
A 2012 book review for which novelist noted subjects that 'sparked his ire': capital punishment, big tobacco & 'the plight of the unjustly convicted'?
A 1940 headline about what 20th Century Eponym included 'failure', 'liability when it came to offense' & 'stout hearts no match for tanks'?
Over 700 years after its traditional 1252 founding date, what port city became associated with a psychological response?
The success of what brand has its roots with a hydrotherapy pump its cofounder created for his son, who had arthritis?
In a periodical in 1807, what American Author called New York City 'Gotham, Gotham! Most enlightened of cities'?
What symbol is a rotated V in math and a feeling of some marginalized or underrepresented people in society?
Monty Norman, the composer of what character's theme, said the staccato riff conveyed sexiness, mystery & ruthlessness?
What American Novelist served with an airman named Yohannan in World War II & despite what readers might think, he said he enjoyed his service?
In what Medieval place did one of the participants in an 1170 event say, 'Let us away, knights; he will rise no more'?
At one time a province of the Roman Empire, what African country kingdom is known to Arabic scholars as Al-Maghrib Al-Aqsa, 'the far west'?
Congress relented in 1890 after what prospective state said it would wait 100 years rather than come in without the women?
A writer & producer of what movie said he wanted it to be like a Western or James Bond film, 'only it takes place in the 30s'?
In 1898 what's been called the first blockbuster art show was devoted to which artist & put on for Queen Wilhelmina's coronation?
Part of the largest contiguous land empire during the 1200s & 1300s, today what is the world's second-largest landlocked country?
A 2006 book was titled 'The Poem That Changed America:' What 'Fifty Years Later'?
Backed by 14,000 troops, who invaded England to restore, in his words, its 'religion, laws, and liberties'?
After its completion in the late 19th c., what was landmark was called 'a truly tragic street lamp' & a 'high & skinny pyramid of iron ladders'?
The busiest passenger port in the U.K., what shares its name with a capital of one of the original 13 states?
This man made lists, perhaps to cope with depression; a set of lists he published in 1852 made whose name synonymous with a type of book?
An 1869 presidential pardon was granted to which man, due in part to a plea by the Medical Society of Harford County, Maryland?
Letters, pocket knives, C rations & steel helmets are among the tangible items referred to in the title of what American literature modern war classic?
What nonfiction book has the line, 'The discovery of America…opened up fresh ground for the rising bourgeoisie'?
A radical Republican championed what 1875 act but the Supreme Court struck it down in 1883; a new version was passed 81 years later?
Whose brothers, Castor & Pollux, saved her after Theseus stole her away as a kid; a larger force would seek her later in life?
Once Africa's largest country in area, what African Country dropped to third in 2011 when a portion of it declared independence?
The ancient writer Galen said books on ships arriving to what city's port were seized, originals kept & copies returned?
For a special 1970s cookbook, who provided one simple recipea can of Campbell's tomato soup & 2 cans of milk?
Thought to descend from people of Southeast Asia, the Chamorro make up what U.S. territorys largest ethnic group?
In office from 2022, the president of what country has taken so many foreign trips a play on his name is 'Ferdinand Magellan Jr.'?
In 1939 which writer lived on Toulouse Street in the French Quarter & chose the professional name that bonded him to the South?
What National Park is named for a river indigenous people called Mi tse a-da-zi, translated by French-speaking trappers as 'Pierre Jaune'?
In 2010 who introduced the 4-point shot, 35 feet from the basket?
Losses over Asia in the 1960s led to the establishment of the program known as what at a San Diego naval base in 1969?
A craft that visited what was named for Giotto, based on the story that 680 years earlier, the painter depicted it as the Star of Bethlehem?
In World War I, 'Cistern' & 'reservoir' were suggested names for what secret invention, but the British preferred this less clumsy monosyllable?
Until 1806, some German nobles included among their honors the title of 'Elector' for their role in selecting this personage?
In 1904, wearing a harness, actress Nina Boucicault became the first to play what character onstage?
Alphabetically the first German city in encyclopedias, what was also the first one taken by the Allies in World War II?
This Sanskrit word referring to a spoken word or phrase comes from a word for 'to think'?
1917's 'Elements of Trench Warfare' said what Old West invention was 'difficult to destroy' & 'difficult to get through'?
Mimi Reinhard, who never learned to type using more than 2 fingers, produced what in World War II with 1,100 names, including hers?
Poseidon carried off the maiden Theophane & turned her into a ewe; their offspring was the source of what mythical object?
Published in 2011, P.D. James' final novel, 'Death Comes to Pemberley', was a sequel to what novel from 200 years earlier?
5 U.S. states have 6-letter names; only which 2 west of the Mississippi River border each other?
Originally relating to a story of suffering, what word now more commonly refers to strong emotion of any kind?
The 2007 biopic called 'La Môme' in France, meaning 'The Kid', was released in the U.S. under what other French title?
Returning home in 1493, Columbus stopped in the Azores at an island with what name, also something he'd lost off the Haiti coast?
Pskov & Nizhny Novgorod are 2 of the cities that have a fortress called what?
In the 1950s the New York Times said what author 'is writing about all lust' & his lecherous narrator 'is all of us'?
At the winter solstice, the sun is in Sagittarius; it once appeared in what constellation, giving a geographic feature its name?
Mike Post combined the sound of a slamming jail door, an anvil & 100 men stomping on a floor for what television series that debuted in 1990?
Like Sir Thomas More, 3 16th century English queens are buried at what British location?
In 1692 Increase Mather wrote, 'It were better that ten suspected' of these who 'escape, than that one innocent person be condemned'?
The Geography Mnemonic Mimal, sometimes said to be the silhouette of a chef or elf, stands for Minnesota, Iowa, Missouri, and what other 2 states?
What was first sold in 1908, at a price equivalent to about $27,000 today?
The name of what author dead since 2013 now appears on books written by a former U.S. marshal & a former Apache helicopter pilot?
The artwork once known in France as 'la tapisserie de la Reine Mathilde' is better known as what?
In 2022 which pop star became the first woman to have a Billboard Top 10 album in 5 decades starting with the 1980s?
In one 19th century translation, what female classic tale character 'perceived the dawn of day and ceased' speaking nearly 1,000 times?
Ironically, though what company founded in the 1860s is Moore County, Tennessee's largest employer, Moore is a dry county?
After a 1789 event, who wrote, 'My first determination was to seek a supply of…water at Tofoa, & afterwards to sail for Tongataboo'?
Laurence Olivier & Ernest Borgnine were considered for the lead role & Sergio Leone to direct for what film that turned 50 in 2022?
Until a 1903 secession, what country's contiguous territory spanned 2 continents?
Early in her career which foreign-born author translated romance novels into Spanish, often changing the dialogue to make the heroines smarter?
Saying it was stolen by Napoleon, self-styled Italian patriot Vincenzo Peruggia took what in 1911?
Continuing a downward trend, in July 2022 what US body of water was at 27% capacity, its lowest level since 1937 when it was first being filled?
Each morning which goddess began her ride in her chariot across the sky ahead of her brother Sol, or Helios?
Until the Civil War, the Jan. 8 date of what American battle of dubious military importance but big morale value was a national holiday?
Which children's book title character is told 'By the time you are real, most of your hair has been loved off your eyes drop out & you get shabby'?
In a TV reunion over 40 years in the making, Dolly Parton appeared as an angel named Agnes in the final episode of what comedy in 2022?
In an 1847 American poem what character sees her town of Grand-Pré burned, but finally reunites with her beau for a kiss before his death?
In 2001 who published a book called 'Banging Your Head Against a Brick Wall'; in 2002, 'Existencilism'?
The title object of what childrens book 'never looked more beautiful each strand held dozens of bright drops of early morning dew'?
The shouts of excited children at a 1946 holiday parade are said to have inspired what perennial classic song favorite?
Unable to make what candies perfectly round, the confectioner embraced this flawed name for the product?
What country is home to 58 UNESCO World Heritage Sites, more than any other country; the sites include a volcano & a lagoon?
What action movie's last line is 'If this is their idea of Christmas, I gotta be here for New Years'?
Only 3 presidents have married while in office— John Tyler was the first & which one was the last?
Demonstrating the dignity & humanity of Black Americans, who sat for 160 known photographs, the most of any American in the 19th century?
Originally, which Latin 3-word phrase referred to when a doctor or apothecary substituted one medicine for another?
The 1975 premiere of what movie comedy advertised free coconuts for the first thousand in the audience?
A cocktail, an island & a WWII venture originally called 'Development of Substitute Materials' all bear what name?
Which US President was sworn in twice as President within 2 years, first by his father & then later by a former U.S. President?
A 1609 story in which an exiled king of Bulgaria creates a sea palace with his magic may have inspired the plot of what play?
In 2009, during a 20th anniversary celebration, what landmark was called 'an edifice of fear. On Nov. 9, it became a place of joy'?
Among what world capital's nicknames are the 'City of Classical Music' &, possibly in honor of a famous resident from 1860 to 1938, the 'City of Dreams'?
Now meaning someone with nocturnal habits, what catches a sleeping dove in Shakespeare's 'Lucrece'?
The stars on what country's flag represent states, 26 of them; unlike the USA's, its 'federal district' gets its own 27th star?
What father was the only man among the 13 plaintiffs in a US class-action case filed in 1951?
Reversing the story of what heroine she created, childrens author Patricia Maclachlan was born on the prairie but spent much of her life in New England?

View File

@@ -1,4 +1,7 @@
set(TARGET main)
add_executable(${TARGET} main.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ggml ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

View File

@@ -1,3 +1,297 @@
# main
TODO
# llama.cpp/example/main
This example program allows you to use various LLaMA language models in an easy and efficient way. It is specifically designed to work with the [llama.cpp](https://github.com/ggerganov/llama.cpp) project, which provides a plain C/C++ implementation with optional 4-bit quantization support for faster, lower memory inference, and is optimized for desktop CPUs. This program can be used to perform various inference tasks with LLaMA models, including generating text based on user-provided prompts and chat-like interactions with reverse prompts.
## Table of Contents
1. [Quick Start](#quick-start)
2. [Common Options](#common-options)
3. [Input Prompts](#input-prompts)
4. [Interaction](#interaction)
5. [Context Management](#context-management)
6. [Generation Flags](#generation-flags)
7. [Performance Tuning and Memory Options](#performance-tuning-and-memory-options)
8. [Additional Options](#additional-options)
## Quick Start
To get started right away, run the following command, making sure to use the correct path for the model you have:
#### Unix-based systems (Linux, macOS, etc.):
```bash
./main -m models/7B/ggml-model.bin --prompt "Once upon a time"
```
#### Windows:
```powershell
main.exe -m models\7B\ggml-model.bin --prompt "Once upon a time"
```
For an interactive experience, try this command:
#### Unix-based systems (Linux, macOS, etc.):
```bash
./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " \
'User: Hi
AI: Hello. I am an AI chatbot. Would you like to talk?
User: Sure!
AI: What would you like to talk about?
User:'
```
#### Windows:
```powershell
main.exe -m models\7B\ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -e --prompt "User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:"
```
The following command generates "infinite" text from a starting prompt (you can use `Ctrl-C` to stop it):
#### Unix-based systems (Linux, macOS, etc.):
```bash
./main -m models/7B/ggml-model.bin --ignore-eos -n -1 --random-prompt
```
#### Windows:
```powershell
main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1 --random-prompt
```
## Common Options
In this section, we cover the most commonly used options for running the `main` program with the LLaMA models:
- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
- `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models.
- `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
## Input Prompts
The `main` program provides several ways to interact with the LLaMA models using input prompts:
- `--prompt PROMPT`: Provide a prompt directly as a command-line option.
- `--file FNAME`: Provide a file containing a prompt or multiple prompts.
- `--interactive-first`: Run the program in interactive mode and wait for input right away. (More on this below.)
- `--random-prompt`: Start with a randomized prompt.
## Interaction
The `main` program offers a seamless way to interact with LLaMA models, allowing users to engage in real-time conversations or provide instructions for specific tasks. The interactive mode can be triggered using various options, including `--interactive`, `--interactive-first`, and `--instruct`.
In interactive mode, users can participate in text generation by injecting their input during the process. Users can press `Ctrl+C` at any time to interject and type their input, followed by pressing `Return` to submit it to the LLaMA model. To submit additional lines without finalizing input, users can end the current line with a backslash (`\`) and continue typing.
### Interaction Options
- `-i, --interactive`: Run the program in interactive mode, allowing users to engage in real-time conversations or provide specific instructions to the model.
- `--interactive-first`: Run the program in interactive mode and immediately wait for user input before starting the text generation.
- `-ins, --instruct`: Run the program in instruction mode, which is specifically designed to work with Alpaca models that excel in completing tasks based on user instructions.
- `--color`: Enable colorized output to differentiate visually distinguishing between prompts, user input, and generated text.
By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs.
### Reverse Prompts
Reverse prompts are a powerful way to create a chat-like experience with a LLaMA model by pausing the text generation when specific text strings are encountered:
- `-r PROMPT, --reverse-prompt PROMPT`: Specify one or multiple reverse prompts to pause text generation and switch to interactive mode. For example, `-r "User:"` can be used to jump back into the conversation whenever it's the user's turn to speak. This helps create a more interactive and conversational experience. However, the reverse prompt doesn't work when it ends with a space.
To overcome this limitation, you can use the `--in-prefix` flag to add a space or any other characters after the reverse prompt.
### In-Prefix
The `--in-prefix` flag is used to add a prefix to your input, primarily, this is used to insert a space after the reverse prompt. Here's an example of how to use the `--in-prefix` flag in conjunction with the `--reverse-prompt` flag:
```sh
./main -r "User:" --in-prefix " "
```
### In-Suffix
The `--in-suffix` flag is used to add a suffix after your input. This is useful for adding an "Assistant:" prompt after the user's input. It's added after the new-line character (`\n`) that's automatically added to the end of the user's input. Here's an example of how to use the `--in-suffix` flag in conjunction with the `--reverse-prompt` flag:
```sh
./main -r "User:" --in-prefix " " --in-suffix "Assistant:"
```
### Instruction Mode
Instruction mode is particularly useful when working with Alpaca models, which are designed to follow user instructions for specific tasks:
- `-ins, --instruct`: Enable instruction mode to leverage the capabilities of Alpaca models in completing tasks based on user-provided instructions.
Technical detail: the user's input is internally prefixed with the reverse prompt (or `### Instruction:` as the default), and followed by `### Response:` (except if you just press Return without any input, to keep generating a longer response).
By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs.
## Context Management
During text generation, LLaMA models have a limited context size, which means they can only consider a certain number of tokens from the input and generated text. When the context fills up, the model resets internally, potentially losing some information from the beginning of the conversation or instructions. Context management options help maintain continuity and coherence in these situations.
### Context Size
The `--ctx-size` option allows you to set the size of the prompt context used by the LLaMA models during text generation. A larger context size helps the model to better comprehend and generate responses for longer input or conversations.
- `-c N, --ctx-size N`: Set the size of the prompt context (default: 512). The LLaMA models were built with a context of 2048, which will yield the best results on longer input/inference. However, increasing the context size beyond 2048 may lead to unpredictable results.
### Keep Prompt
The `--keep` option allows users to retain the original prompt when the model runs out of context, ensuring a connection to the initial instruction or conversation topic is maintained.
- `--keep N`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context. By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt.
By utilizing context management options like `--ctx-size` and `--keep`, you can maintain a more coherent and consistent interaction with the LLaMA models, ensuring that the generated text remains relevant to the original prompt or conversation.
## Generation Flags
The following options allow you to control the text generation process and fine-tune the diversity, creativity, and quality of the generated text according to your needs. By adjusting these options and experimenting with different combinations of values, you can find the best settings for your specific use case.
### Number of Tokens to Predict
- `-n N, --n-predict N`: Set the number of tokens to predict when generating text (default: 128, -1 = infinity).
The `--n-predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text. A value of -1 will cause text to be generated without limit.
It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `n-predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the `--ignore-eos` parameter.
### Temperature
- `--temp N`: Adjust the randomness of the generated text (default: 0.8).
Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.
Example usage: `--temp 0.5`
### Repeat Penalty
- `--repeat-penalty N`: Control the repetition of token sequences in the generated text (default: 1.1).
- `--repeat-last-n N`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size).
- `--no-penalize-nl`: Disable penalization for newline tokens when applying the repeat penalty.
The `repeat-penalty` option helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. The default value is 1.1.
The `repeat-last-n` option controls the number of tokens in the history to consider for penalizing repetition. A larger value will look further back in the generated text to prevent repetitions, while a smaller value will only consider recent tokens. A value of 0 disables the penalty, and a value of -1 sets the number of tokens considered equal to the context size (`ctx-size`).
Use the `--no-penalize-nl` option to disable newline penalization when applying the repeat penalty. This option is particularly useful for generating chat conversations, dialogues, code, poetry, or any text where newline tokens play a significant role in structure and formatting. Disabling newline penalization helps maintain the natural flow and intended formatting in these specific use cases.
Example usage: `--repeat-penalty 1.15 --repeat-last-n 128 --no-penalize-nl`
### Top-K Sampling
- `--top-k N`: Limit the next token selection to the K most probable tokens (default: 40).
Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top-k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text. The default value is 40.
Example usage: `--top-k 30`
### Top-P Sampling
- `--top-p N`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9).
Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top-p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. The default value is 0.9.
Example usage: `--top-p 0.95`
### Tail Free Sampling (TFS)
- `--tfs N`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled).
Tail free sampling (TFS) is a text generation technique that aims to reduce the impact of less likely tokens, which may be less relevant, less coherent, or nonsensical, on the output. The method adjusts the logits (token probabilities) by raising them to the power of the parameter z. A higher value of z (e.g., 2.0) will further suppress less likely tokens from the tail of the distribution, while a value of 1.0 disables the effect of TFS. By setting the parameter z, you can control how much the probabilities of less likely tokens are reduced.
Example usage: `--tfs 2.0`
### Locally Typical Sampling
- `--typical N`: Enable locally typical sampling with parameter p (default: 1.0, 1.0 = disabled).
Locally typical sampling promotes the generation of contextually coherent and diverse text by sampling tokens that are typical or expected based on the surrounding context. By setting the parameter p between 0 and 1, you can control the balance between producing text that is locally coherent and diverse. A value closer to 1 will promote more contextually coherent tokens, while a value closer to 0 will promote more diverse tokens. A value equal to 1 disables locally typical sampling.
Example usage: `--typical 0.9`
### Mirostat Sampling
- `--mirostat N`: Enable Mirostat sampling, controlling perplexity during text generation (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0).
- `--mirostat-lr N`: Set the Mirostat learning rate, parameter eta (default: 0.1).
- `--mirostat-ent N`: Set the Mirostat target entropy, parameter tau (default: 5.0).
Mirostat is an algorithm that actively maintains the quality of generated text within a desired range during text generation. It aims to strike a balance between coherence and diversity, avoiding low-quality output caused by excessive repetition (boredom traps) or incoherence (confusion traps).
The `--mirostat-lr` option sets the Mirostat learning rate (eta). The learning rate influences how quickly the algorithm responds to feedback from the generated text. A lower learning rate will result in slower adjustments, while a higher learning rate will make the algorithm more responsive. The default value is `0.1`.
The `--mirostat-ent` option sets the Mirostat target entropy (tau), which represents the desired perplexity value for the generated text. Adjusting the target entropy allows you to control the balance between coherence and diversity in the generated text. A lower value will result in more focused and coherent text, while a higher value will lead to more diverse and potentially less coherent text. The default value is `5.0`.
Example usage: `--mirostat 2 --mirostat-lr 0.05 --mirostat-ent 3.0`
### Logit Bias
- `-l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS`: Modify the likelihood of a token appearing in the generated text completion.
The logit bias option allows you to manually adjust the likelihood of specific tokens appearing in the generated text. By providing a token ID and a positive or negative bias value, you can increase or decrease the probability of that token being generated.
For example, use `--logit-bias 15043+1` to increase the likelihood of the token 'Hello', or `--logit-bias 15043-1` to decrease its likelihood. Using a value of negative infinity, `--logit-bias 15043-inf` ensures that the token `Hello` is never produced.
A more practical use case might be to prevent the generation of `\code{begin}` and `\code{end}` by setting the `\` token (29905) to negative infinity with `-l 29905-inf`. (This is due to the prevalence of LaTeX codes that show up in LLaMA model inference.)
Example usage: `--logit-bias 29905-inf`
### RNG Seed
- `-s SEED, --seed SEED`: Set the random number generator (RNG) seed (default: -1, -1 = random seed).
The RNG seed is used to initialize the random number generator that influences the text generation process. By setting a specific seed value, you can obtain consistent and reproducible results across multiple runs with the same input and settings. This can be helpful for testing, debugging, or comparing the effects of different options on the generated text to see when they diverge. If the seed is set to a value less than 0, a random seed will be used, which will result in different outputs on each run.
## Performance Tuning and Memory Options
These options help improve the performance and memory usage of the LLaMA models. By adjusting these settings, you can fine-tune the model's behavior to better suit your system's capabilities and achieve optimal performance for your specific use case.
### Number of Threads
- `-t N, --threads N`: Set the number of threads to use during computation. For optimal performance, it is recommended to set this value to the number of physical CPU cores your system has (as opposed to the logical number of cores). Using the correct number of threads can greatly improve performance.
### Mlock
- `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped. This can improve performance but trades away some of the advantages of memory-mapping by requiring more RAM to run and potentially slowing down load times as the model loads into RAM.
### No Memory Mapping
- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. However, if the model is larger than your total amount of RAM or if your system is low on available memory, using mmap might increase the risk of pageouts, negatively impacting performance. Disabling mmap results in slower load times but may reduce pageouts if you're not using `--mlock`. Note that if the model is larger than the total amount of RAM, turning off mmap would prevent the model from loading at all.
### NUMA support
- `--numa`: Attempt optimizations that help on some systems with non-uniform memory access. This currently consists of pinning an equal proportion of the threads to the cores on each NUMA node, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop\_caches' as root.
### Memory Float 32
- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. This doubles the context memory requirement and cached prompt file size but does not appear to increase generation quality in a measurable way. Not recommended.
### Batch Size
- `-b N, --batch-size N`: Set the batch size for prompt processing (default: 512). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations.
### Prompt Caching
- `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs. **Note**: Restoring a cached prompt does not imply restoring the exact state of the session at the point it was saved. So even when specifying a specific seed, you are not guaranteed to get the same sequence of tokens as the original generation.
### Quantization
For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-data--run).
## Additional Options
These options provide extra functionality and customization when running the LLaMA models:
- `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated.
- `--verbose-prompt`: Print the prompt before generating text.
- `--mtest`: Test the model's functionality by running a series of tests to ensure it's working properly.
- `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
- `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS.
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.

View File

@@ -1,11 +1,18 @@
// Defines sigaction on msys:
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include "common.h"
#include "llama.h"
#include "build-info.h"
#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <string>
@@ -15,21 +22,32 @@
#include <signal.h>
#include <unistd.h>
#elif defined (_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#include <signal.h>
#endif
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
static console_state con_st;
static llama_context ** g_ctx;
static bool is_interacting = false;
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
void sigint_handler(int signo) {
set_console_color(con_st, CONSOLE_COLOR_DEFAULT);
printf("\n"); // this also force flush stdout.
if (signo == SIGINT) {
if (!is_interacting) {
is_interacting=true;
} else {
console_cleanup(con_st);
printf("\n");
llama_print_timings(*g_ctx);
_exit(130);
}
}
@@ -38,7 +56,6 @@ void sigint_handler(int signo) {
int main(int argc, char ** argv) {
gpt_params params;
params.model = "models/llama-7B/ggml-model.bin";
if (gpt_params_parse(argc, argv, params) == false) {
return 1;
@@ -47,10 +64,9 @@ int main(int argc, char ** argv) {
// save choice to use color for later
// (note for later: this is a slightly awkward choice)
con_st.use_color = params.use_color;
#if defined (_WIN32)
win32_console_init(params.use_color);
#endif
con_st.multiline_input = params.multiline_input;
console_init(con_st);
atexit([]() { console_cleanup(con_st); });
if (params.perplexity) {
printf("\n************\n");
@@ -71,40 +87,35 @@ int main(int argc, char ** argv) {
if (params.n_ctx > 2048) {
fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
"expect poor results\n", __func__, params.n_ctx);
} else if (params.n_ctx < 8) {
fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__);
params.n_ctx = 8;
}
if (params.seed <= 0) {
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}
fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed);
if (params.random_prompt) {
params.prompt = gpt_random_prompt(rng);
}
// params.prompt = R"(// this function checks if the number n is prime
//bool is_prime(int n) {)";
llama_init_backend(params.numa);
llama_model * model;
llama_context * ctx;
g_ctx = &ctx;
// load the model
{
auto lparams = llama_context_default_params();
lparams.n_ctx = params.n_ctx;
lparams.n_parts = params.n_parts;
lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16;
lparams.use_mlock = params.use_mlock;
ctx = llama_init_from_file(params.model.c_str(), lparams);
if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
return 1;
}
// load the model and apply lora adapter, if any
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__);
return 1;
}
// print system information
@@ -118,7 +129,7 @@ int main(int argc, char ** argv) {
// uncomment the "used_mem" line in llama.cpp to see the results
if (params.mem_test) {
{
const std::vector<llama_token> tmp(params.n_batch, 0);
const std::vector<llama_token> tmp(params.n_batch, llama_token_bos());
llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
}
@@ -129,15 +140,57 @@ int main(int argc, char ** argv) {
llama_print_timings(ctx);
llama_free(ctx);
llama_free_model(model);
return 0;
}
// Add a space in front of the first character to match OG llama tokenizer behavior
params.prompt.insert(0, 1, ' ');
// export the cgraph and exit
if (params.export_cgraph) {
llama_eval_export(ctx, "llama.ggml");
llama_free(ctx);
llama_free_model(model);
return 0;
}
std::string path_session = params.path_prompt_cache;
std::vector<llama_token> session_tokens;
if (!path_session.empty()) {
fprintf(stderr, "%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
// fopen to check for existing session
FILE * fp = std::fopen(path_session.c_str(), "rb");
if (fp != NULL) {
std::fclose(fp);
session_tokens.resize(params.n_ctx);
size_t n_token_count_out = 0;
if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
fprintf(stderr, "%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
return 1;
}
session_tokens.resize(n_token_count_out);
llama_set_rng_seed(ctx, params.seed);
fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size());
} else {
fprintf(stderr, "%s: session file does not exist, will create\n", __func__);
}
}
// tokenize the prompt
auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
std::vector<llama_token> embd_inp;
if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
// Add a space in front of the first character to match OG llama tokenizer behavior
params.prompt.insert(0, 1, ' ');
embd_inp = ::llama_tokenize(ctx, params.prompt, true);
} else {
embd_inp = session_tokens;
}
const int n_ctx = llama_n_ctx(ctx);
@@ -146,8 +199,37 @@ int main(int argc, char ** argv) {
return 1;
}
// debug message about similarity of saved session, if applicable
size_t n_matching_session_tokens = 0;
if (session_tokens.size()) {
for (llama_token id : session_tokens) {
if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) {
break;
}
n_matching_session_tokens++;
}
if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) {
fprintf(stderr, "%s: using full prompt from session file\n", __func__);
} else if (n_matching_session_tokens >= embd_inp.size()) {
fprintf(stderr, "%s: session file has exact match for prompt!\n", __func__);
} else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
fprintf(stderr, "%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
__func__, n_matching_session_tokens, embd_inp.size());
} else {
fprintf(stderr, "%s: session file matches %zu / %zu tokens of prompt\n",
__func__, n_matching_session_tokens, embd_inp.size());
}
}
// if we will use the cache for the full prompt without reaching the end of the cache, force
// reevaluation of the last token token to recalculate the cached logits
if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() &&
session_tokens.size() > embd_inp.size()) {
session_tokens.resize(embd_inp.size() - 1);
}
// number of tokens to keep when resetting context
if (params.n_keep < 0 || params.n_keep > (int)embd_inp.size() || params.instruct) {
if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct) {
params.n_keep = (int)embd_inp.size();
}
@@ -157,12 +239,12 @@ int main(int argc, char ** argv) {
// in instruct mode, we inject a prefix and a suffix to each input by the user
if (params.instruct) {
params.interactive_start = true;
params.interactive_first = true;
params.antiprompt.push_back("### Instruction:\n\n");
}
// enable interactive mode if reverse prompt or interactive start is specified
if (params.antiprompt.size() != 0 || params.interactive_start) {
// enable interactive mode if interactive start is specified
if (params.interactive_first) {
params.interactive = true;
}
@@ -194,7 +276,10 @@ int main(int argc, char ** argv) {
sigint_action.sa_flags = 0;
sigaction(SIGINT, &sigint_action, NULL);
#elif defined (_WIN32)
signal(SIGINT, sigint_handler);
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
};
SetConsoleCtrlHandler(static_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
#endif
fprintf(stderr, "%s: interactive mode on.\n", __func__);
@@ -208,9 +293,13 @@ int main(int argc, char ** argv) {
if (!params.input_prefix.empty()) {
fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str());
}
if (!params.input_suffix.empty()) {
fprintf(stderr, "Input suffix: '%s'\n", params.input_suffix.c_str());
}
}
fprintf(stderr, "sampling: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n",
params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
fprintf(stderr, "sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau);
fprintf(stderr, "generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
fprintf(stderr, "\n\n");
@@ -219,42 +308,77 @@ int main(int argc, char ** argv) {
std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
if (params.interactive) {
const char *control_message;
if (con_st.multiline_input) {
control_message = " - To return control to LLaMa, end your input with '\\'.\n"
" - To return control without starting a new line, end your input with '/'.\n";
} else {
control_message = " - Press Return to return control to LLaMa.\n"
" - To return control without starting a new line, end your input with '/'.\n"
" - If you want to submit another line, end your input with '\\'.\n";
}
fprintf(stderr, "== Running in interactive mode. ==\n"
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
" - Press Ctrl+C to interject at any time.\n"
#endif
" - Press Return to return control to LLaMa.\n"
" - If you want to submit another line, end your input in '\\'.\n\n");
is_interacting = params.interactive_start;
"%s\n", control_message);
is_interacting = params.interactive_first;
}
bool is_antiprompt = false;
bool input_noecho = false;
bool is_antiprompt = false;
bool input_echo = true;
bool need_to_save_session = !path_session.empty() && n_matching_session_tokens < embd_inp.size();
int n_past = 0;
int n_remain = params.n_predict;
int n_consumed = 0;
int n_past = 0;
int n_remain = params.n_predict;
int n_consumed = 0;
int n_session_consumed = 0;
// the first thing we will do is to output the prompt, so set color accordingly
set_console_color(con_st, CONSOLE_COLOR_PROMPT);
console_set_color(con_st, CONSOLE_COLOR_PROMPT);
std::vector<llama_token> embd;
while (n_remain != 0 || params.interactive) {
// do one empty run to warm up the model
{
const std::vector<llama_token> tmp = { llama_token_bos(), };
llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
llama_reset_timings(ctx);
}
while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
// predict
if (embd.size() > 0) {
// Note: n_ctx - 4 here is to match the logic for commandline prompt handling via
// --prompt or --file which uses the same value.
auto max_embd_size = n_ctx - 4;
// Ensure the input doesn't exceed the context size by truncating embd if necessary.
if ((int)embd.size() > max_embd_size) {
auto skipped_tokens = embd.size() - max_embd_size;
console_set_color(con_st, CONSOLE_COLOR_ERROR);
printf("<<input too long: skipped %zu token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
fflush(stdout);
embd.resize(max_embd_size);
}
// infinite text generation via context swapping
// if we run out of context:
// - take the n_keep first tokens from the original prompt (via n_past)
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in a batch
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
if (n_past + (int) embd.size() > n_ctx) {
const int n_left = n_past - params.n_keep;
n_past = params.n_keep;
// always keep the first token - BOS
n_past = std::max(1, params.n_keep);
// insert n_left/2 tokens at the start of embd from last_n_tokens
embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());
// stop saving session if we run out of context
path_session.clear();
//printf("\n---\n");
//printf("resetting: '");
//for (int i = 0; i < (int) embd.size(); i++) {
@@ -264,34 +388,128 @@ int main(int argc, char ** argv) {
//printf("\n---\n");
}
if (llama_eval(ctx, embd.data(), embd.size(), n_past, params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return 1;
// try to reuse a matching prefix from the loaded session instead of re-eval (via n_past)
if (n_session_consumed < (int) session_tokens.size()) {
size_t i = 0;
for ( ; i < embd.size(); i++) {
if (embd[i] != session_tokens[n_session_consumed]) {
session_tokens.resize(n_session_consumed);
break;
}
n_past++;
n_session_consumed++;
if (n_session_consumed >= (int) session_tokens.size()) {
++i;
break;
}
}
if (i > 0) {
embd.erase(embd.begin(), embd.begin() + i);
}
}
// evaluate tokens in batches
// embd is typically prepared beforehand to fit within a batch, but not always
for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
int n_eval = (int) embd.size() - i;
if (n_eval > params.n_batch) {
n_eval = params.n_batch;
}
if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return 1;
}
n_past += n_eval;
}
if (embd.size() > 0 && !path_session.empty()) {
session_tokens.insert(session_tokens.end(), embd.begin(), embd.end());
n_session_consumed = session_tokens.size();
}
}
n_past += embd.size();
embd.clear();
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
// out of user input, sample next token
const int32_t top_k = params.top_k;
const float top_p = params.top_p;
const float temp = params.temp;
const float repeat_penalty = params.repeat_penalty;
const float temp = params.temp;
const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
const float top_p = params.top_p;
const float tfs_z = params.tfs_z;
const float typical_p = params.typical_p;
const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
const float repeat_penalty = params.repeat_penalty;
const float alpha_presence = params.presence_penalty;
const float alpha_frequency = params.frequency_penalty;
const int mirostat = params.mirostat;
const float mirostat_tau = params.mirostat_tau;
const float mirostat_eta = params.mirostat_eta;
const bool penalize_nl = params.penalize_nl;
// optionally save the session on first sample (for faster prompt loading next time)
if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) {
need_to_save_session = false;
llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
}
llama_token id = 0;
{
auto logits = llama_get_logits(ctx);
auto logits = llama_get_logits(ctx);
auto n_vocab = llama_n_vocab(ctx);
if (params.ignore_eos) {
logits[llama_token_eos()] = 0;
// Apply params.logit_bias map
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
logits[it->first] += it->second;
}
id = llama_sample_top_p_top_k(ctx,
last_n_tokens.data() + n_ctx - params.repeat_last_n,
params.repeat_last_n, top_k, top_p, temp, repeat_penalty);
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
// Apply penalties
float nl_logit = logits[llama_token_nl()];
auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
llama_sample_repetition_penalty(ctx, &candidates_p,
last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
last_n_repeat, repeat_penalty);
llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
last_n_repeat, alpha_frequency, alpha_presence);
if (!penalize_nl) {
logits[llama_token_nl()] = nl_logit;
}
if (temp <= 0) {
// Greedy sampling
id = llama_sample_token_greedy(ctx, &candidates_p);
} else {
if (mirostat == 1) {
static float mirostat_mu = 2.0f * mirostat_tau;
const int mirostat_m = 100;
llama_sample_temperature(ctx, &candidates_p, temp);
id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
} else if (mirostat == 2) {
static float mirostat_mu = 2.0f * mirostat_tau;
llama_sample_temperature(ctx, &candidates_p, temp);
id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
} else {
// Temperature sampling
llama_sample_top_k(ctx, &candidates_p, top_k, 1);
llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
llama_sample_typical(ctx, &candidates_p, typical_p, 1);
llama_sample_top_p(ctx, &candidates_p, top_p, 1);
llama_sample_temperature(ctx, &candidates_p, temp);
id = llama_sample_token(ctx, &candidates_p);
}
}
// printf("`%d`", candidates_p.size);
last_n_tokens.erase(last_n_tokens.begin());
last_n_tokens.push_back(id);
@@ -311,7 +529,7 @@ int main(int argc, char ** argv) {
embd.push_back(id);
// echo this to console
input_noecho = false;
input_echo = true;
// decrement remaining sampling budget
--n_remain;
@@ -329,20 +547,19 @@ int main(int argc, char ** argv) {
}
// display text
if (!input_noecho) {
if (input_echo) {
for (auto id : embd) {
printf("%s", llama_token_to_str(ctx, id));
}
fflush(stdout);
}
// reset color to default if we there is no pending user input
if (!input_noecho && (int)embd_inp.size() == n_consumed) {
set_console_color(con_st, CONSOLE_COLOR_DEFAULT);
if (input_echo && (int)embd_inp.size() == n_consumed) {
console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
}
// in interactive mode, and not currently processing queued inputs;
// check if we should prompt the user for more
if (params.interactive && (int) embd_inp.size() <= n_consumed) {
// if not currently processing queued inputs;
if ((int) embd_inp.size() <= n_consumed) {
// check for reverse prompt
if (params.antiprompt.size()) {
@@ -353,11 +570,20 @@ int main(int argc, char ** argv) {
is_antiprompt = false;
// Check if each of the reverse prompts appears at the end of the output.
// If we're not running interactively, the reverse prompt might be tokenized with some following characters
// so we'll compensate for that by widening the search window a bit.
for (std::string & antiprompt : params.antiprompt) {
if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
is_interacting = true;
size_t extra_padding = params.interactive ? 0 : 2;
size_t search_start_pos = last_output.length() > static_cast<size_t>(antiprompt.length() + extra_padding)
? last_output.length() - static_cast<size_t>(antiprompt.length() + extra_padding)
: 0;
if (last_output.find(antiprompt.c_str(), search_start_pos) != std::string::npos) {
if (params.interactive) {
is_interacting = true;
console_set_color(con_st, CONSOLE_COLOR_USER_INPUT);
}
is_antiprompt = true;
set_console_color(con_st, CONSOLE_COLOR_USER_INPUT);
fflush(stdout);
break;
}
@@ -365,9 +591,6 @@ int main(int argc, char ** argv) {
}
if (n_past > 0 && is_interacting) {
// potentially set color to indicate we are taking user input
set_console_color(con_st, CONSOLE_COLOR_USER_INPUT);
if (params.instruct) {
printf("\n> ");
}
@@ -381,24 +604,21 @@ int main(int argc, char ** argv) {
std::string line;
bool another_line = true;
do {
if (!std::getline(std::cin, line)) {
// input stream is bad or EOF received
return 0;
}
if (line.empty() || line.back() != '\\') {
another_line = false;
} else {
line.pop_back(); // Remove the continue character
}
buffer += line + '\n'; // Append the line to the result
another_line = console_readline(con_st, line);
buffer += line;
} while (another_line);
// done taking input, reset color
set_console_color(con_st, CONSOLE_COLOR_DEFAULT);
console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
// Add tokens to embd only if the input buffer is non-empty
// Entering a empty line lets the user pass control back
if (buffer.length() > 1) {
// append input suffix if any
if (!params.input_suffix.empty()) {
buffer += params.input_suffix;
printf("%s", params.input_suffix.c_str());
}
// instruct mode: insert instruction prefix
if (params.instruct && !is_antiprompt) {
@@ -417,7 +637,7 @@ int main(int argc, char ** argv) {
n_remain -= line_inp.size();
}
input_noecho = true; // do not echo this again
input_echo = false; // do not echo this again
}
if (n_past > 0) {
@@ -426,7 +646,7 @@ int main(int argc, char ** argv) {
}
// end of text token
if (embd.back() == llama_token_eos()) {
if (!embd.empty() && embd.back() == llama_token_eos()) {
if (params.instruct) {
is_interacting = true;
} else {
@@ -442,14 +662,14 @@ int main(int argc, char ** argv) {
}
}
#if defined (_WIN32)
signal(SIGINT, SIG_DFL);
#endif
if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) {
fprintf(stderr, "\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
}
llama_print_timings(ctx);
llama_free(ctx);
set_console_color(con_st, CONSOLE_COLOR_DEFAULT);
llama_free_model(model);
return 0;
}

View File

@@ -0,0 +1,3 @@
set(TEST_TARGET metal)
add_executable(${TEST_TARGET} metal.cpp)
target_link_libraries(${TEST_TARGET} PRIVATE ggml)

104
examples/metal/metal.cpp Normal file
View File

@@ -0,0 +1,104 @@
// Evaluate a statically exported ggml computation graph with Metal
//
// - First, export a LLaMA graph:
//
// $ ./bin/main -m ../models/7B/ggml-model-q4_0.bin --export
//
// - Run this tool to evaluate the exported graph:
//
// $ ./bin/metal llama.ggml
//
// The purpose of this tool is mostly for debugging and demonstration purposes.
// The main limitation of exporting computation graphs is that their sizes are static which often
// can be a problem for real-world applications.
//
#include "ggml.h"
#include "ggml-metal.h"
#include <cstdio>
#include <cstring>
#include <cstdlib>
int main(int argc, char ** argv) {
ggml_time_init();
if (argc != 2) {
fprintf(stderr, "Usage: %s llama.ggml\n", argv[0]);
return -1;
}
const char * fname_cgraph = argv[1];
// load the compute graph
struct ggml_context * ctx_data = NULL;
struct ggml_context * ctx_eval = NULL;
struct ggml_cgraph gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval);
gf.n_threads = 1;
// this allocates all Metal resources and memory buffers
auto * ctx_metal = ggml_metal_init();
const size_t max_size_data = ggml_get_max_tensor_size(ctx_data);
const size_t max_size_eval = ggml_get_max_tensor_size(ctx_eval);
ggml_metal_add_buffer(ctx_metal, "data", ggml_get_mem_buffer(ctx_data), ggml_get_mem_size(ctx_data), max_size_data);
ggml_metal_add_buffer(ctx_metal, "eval", ggml_get_mem_buffer(ctx_eval), ggml_get_mem_size(ctx_eval), max_size_eval);
// main
{
struct ggml_tensor * input = ggml_graph_get_tensor(&gf, "embd");
*(int32_t *) input->data = 1; // BOS
ggml_metal_set_tensor(ctx_metal, input);
// warmup
ggml_metal_graph_compute(ctx_metal, &gf);
const int n_iter = 16;
const int64_t t0 = ggml_time_us();
// the actual inference happens here
for (int i = 0; i < n_iter; ++i) {
ggml_metal_graph_compute(ctx_metal, &gf);
}
const int64_t t1 = ggml_time_us();
printf("time: %.2f ms, %.2f ms/tok\n", (t1 - t0) / 1000.0, (t1 - t0) / 1000.0 / n_iter);
}
// debug output
{
struct ggml_tensor * logits = gf.nodes[gf.n_nodes - 1];
ggml_metal_get_tensor(ctx_metal, logits);
float * ptr = (float *) ggml_get_data(logits);
printf("logits: ");
for (int i = 0; i < 10; i++) {
printf("%8.4f ", ptr[i]);
}
printf("\n");
int imax = 0;
double sum = 0.0;
double vmax = -1e9;
for (int i = 0; i < 32000; i++) {
sum += (double) ptr[i];
if (ptr[i] > vmax) {
vmax = ptr[i];
imax = i;
}
}
printf("sum: %f, imax = %d, vmax = %f\n", sum, imax, vmax);
}
ggml_metal_free(ctx_metal);
ggml_free(ctx_data);
ggml_free(ctx_eval);
return 0;
}

View File

@@ -1,4 +1,7 @@
set(TARGET perplexity)
add_executable(${TARGET} perplexity.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ggml ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

View File

@@ -1,3 +1,3 @@
# perplexity
TODO
# perplexity
TODO

View File

@@ -1,7 +1,13 @@
#include "common.h"
#include "llama.h"
#include "build-info.h"
#include <cmath>
#include <ctime>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
std::vector<float> softmax(const std::vector<float>& logits) {
std::vector<float> probs(logits.size());
@@ -23,33 +29,68 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
// Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
// Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
// Output: `perplexity: 13.5106 [114/114]`
// BOS tokens will be added for each chunk before eval
auto tokens = ::llama_tokenize(ctx, params.prompt, true);
int count = 0;
int seq_count = tokens.size() / params.n_ctx;
int count = 0;
const int n_chunk = tokens.size() / params.n_ctx;
const int n_vocab = llama_n_vocab(ctx);
const int n_batch = params.n_batch;
double nll = 0.0;
fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
fprintf(stderr, "%s : calculating perplexity over %d chunks\n", __func__, seq_count);
for (int i = 0; i < n_chunk; ++i) {
const int start = i * params.n_ctx;
const int end = start + params.n_ctx;
for (int i = 0; i < seq_count; ++i) {
int start = i * params.n_ctx;
int end = start + params.n_ctx - 1; // TODO: this is not optimal, e.g. it makes the batch 511 instead of 512
// it is better to always be power of 2 for better performance
std::vector<llama_token> embd(tokens.begin() + start, tokens.begin() + end);
auto start_t = std::chrono::high_resolution_clock::now();
if (llama_eval(ctx, embd.data(), embd.size(), 0, params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return;
const int num_batches = (params.n_ctx + n_batch - 1) / n_batch;
std::vector<float> logits;
const auto t_start = std::chrono::high_resolution_clock::now();
for (int j = 0; j < num_batches; ++j) {
const int batch_start = start + j * n_batch;
const int batch_size = std::min(end - batch_start, n_batch);
// save original token and restore it after eval
const auto token_org = tokens[batch_start];
// add BOS token for the first batch of each chunk
if (j == 0) {
tokens[batch_start] = llama_token_bos();
}
if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return;
}
// restore the original token in case it was set to BOS
tokens[batch_start] = token_org;
const auto batch_logits = llama_get_logits(ctx);
logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
}
auto end_t = std::chrono::high_resolution_clock::now();
const auto t_end = std::chrono::high_resolution_clock::now();
if (i == 0) {
const float seconds = std::chrono::duration<float>(end_t - start_t).count();
printf("%.2f seconds per pass - ETA %.2f hours\n", seconds, (seconds * seq_count) / (60.0*60.0));
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
int total_seconds = (int)(t_total * n_chunk);
if (total_seconds >= 60*60) {
fprintf(stderr, "%d hours ", total_seconds / (60*60));
total_seconds = total_seconds % (60*60);
}
fprintf(stderr, "%d minutes\n", total_seconds / 60);
}
// We get the logits for all the tokens in the context window (params.n_ctx)
// from llama_eval above. Now, based on https://huggingface.co/docs/transformers/perplexity,
// calculate the perplexity over the last half the window (so the model always has
// calculate the perplexity over the last half of the window (so the model always has
// some context to predict the token).
//
// We rely on the fact that attention in the forward pass only looks at previous
@@ -59,15 +100,14 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
// Example, we have a context window of 512, we will compute perplexity for each of the
// last 256 tokens. Then, we split the input up into context window size chunks to
// process the entire prompt.
auto logits = llama_get_logits(ctx);
for (int j = params.n_ctx / 2; j < params.n_ctx - 1; ++j) {
for (int j = std::min(512, params.n_ctx / 2); j < params.n_ctx - 1; ++j) {
// Calculate probability of next token, given the previous ones.
int n_vocab = llama_n_vocab(ctx);
std::vector<float> tok_logits(
logits + j * n_vocab,
logits + (j + 1) * n_vocab);
const std::vector<float> tok_logits(
logits.begin() + (j + 0) * n_vocab,
logits.begin() + (j + 1) * n_vocab);
const float prob = softmax(tok_logits)[tokens[start + j + 1]];
nll += -std::log(prob);
++count;
}
@@ -80,50 +120,43 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
int main(int argc, char ** argv) {
gpt_params params;
params.model = "models/llama-7B/ggml-model.bin";
params.n_batch = 512;
if (gpt_params_parse(argc, argv, params) == false) {
return 1;
}
params.perplexity = true;
params.n_batch = std::min(params.n_batch, params.n_ctx);
if (params.n_ctx > 2048) {
fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
"expect poor results\n", __func__, params.n_ctx);
}
if (params.seed <= 0) {
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}
fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed);
if (params.random_prompt) {
params.prompt = gpt_random_prompt(rng);
}
llama_init_backend(params.numa);
llama_model * model;
llama_context * ctx;
// load the model
{
auto lparams = llama_context_default_params();
lparams.n_ctx = params.n_ctx;
lparams.n_parts = params.n_parts;
lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16;
lparams.logits_all = params.perplexity;
lparams.use_mlock = params.use_mlock;
lparams.embedding = params.embedding;
ctx = llama_init_from_file(params.model.c_str(), lparams);
if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
return 1;
}
// load the model and apply lora adapter, if any
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__);
return 1;
}
// print system information
@@ -137,6 +170,7 @@ int main(int argc, char ** argv) {
llama_print_timings(ctx);
llama_free(ctx);
llama_free_model(model);
return 0;
}

View File

@@ -0,0 +1,4 @@
set(TARGET quantize-stats)
add_executable(${TARGET} quantize-stats.cpp)
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View File

@@ -0,0 +1,438 @@
#include "ggml.h"
#include "build-info.h"
#define LLAMA_API_INTERNAL
#include "llama.h"
#include <algorithm>
#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <map>
#include <numeric>
#include <regex>
#include <string>
#include <unordered_map>
#include <vector>
#include <thread>
#include <mutex>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
struct quantize_stats_params {
std::string model = "models/7B/ggml-model-f16.bin";
bool verbose = false;
bool per_layer_stats = false;
bool print_histogram = false;
bool reference = false;
std::vector<std::string> include_layers;
std::vector<std::string> exclude_layers;
std::vector<enum ggml_type> include_types;
};
const size_t HISTOGRAM_BUCKETS = 150;
const double HISTOGRAM_RANGE = 0.03;
struct error_stats {
size_t num_samples;
double total_error;
double max_error;
uint64_t error_histogram[HISTOGRAM_BUCKETS];
};
void quantize_stats_print_usage(int /*argc*/, char ** argv) {
quantize_stats_params params;
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help show this help message and exit\n");
fprintf(stderr, " -m FNAME, --model FNAME\n");
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
fprintf(stderr, " -r, --reference\n");
fprintf(stderr, " use reference implementation (default: false)\n");
fprintf(stderr, " -v, --verbose\n");
fprintf(stderr, " verbose output (default: false)\n");
fprintf(stderr, " -p, --per-layer-stats\n");
fprintf(stderr, " print stats per layer (default: false)\n");
fprintf(stderr, " --histogram\n");
fprintf(stderr, " print error histogram (default: false)\n");
fprintf(stderr, " -l LAYER, --include-layer LAYER\n");
fprintf(stderr, " only test layers matching pattern\n");
fprintf(stderr, " -L LAYER, --exclude-layer LAYER\n");
fprintf(stderr, " exclude layers matching pattern\n");
fprintf(stderr, " -t TYPE, --type TYPE\n");
fprintf(stderr, " only test given type (q4_0, q4_1)\n");
fprintf(stderr, "\n");
}
// Check if a layer is included/excluded by command line
bool layer_included(const quantize_stats_params params, const std::string & layer) {
for (const auto& excluded : params.exclude_layers) {
if (std::regex_search(layer, std::regex(excluded))) {
return false;
}
}
for (const auto& included : params.include_layers) {
if (std::regex_search(layer, std::regex(included))) {
return true;
}
}
return params.include_layers.empty();
}
// Update error statistics given vectors with the before/after result of quantization
void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) {
for (int64_t i = 0; i < nelements; i++) {
double diff = input[i] - output[i];
stats.total_error += diff * diff;
stats.max_error = fmax(fabs(diff), stats.max_error);
stats.error_histogram[std::max(std::min((size_t) floor(fabs(diff) / HISTOGRAM_RANGE * HISTOGRAM_BUCKETS), HISTOGRAM_BUCKETS-1), (size_t) 0)]++;
}
stats.num_samples += nelements;
}
void combine_error_stats(error_stats & into, const error_stats & from) {
into.num_samples += from.num_samples;
into.total_error += from.total_error;
if (from.max_error > into.max_error) into.max_error = from.max_error;
for (size_t i=0; i<HISTOGRAM_BUCKETS; ++i) into.error_histogram[i] += from.error_histogram[i];
}
double find_quantile(const error_stats & stats, double quantile) {
double sum = std::accumulate(std::begin(stats.error_histogram), std::end(stats.error_histogram), 0.0);
double accum = 0;
for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) {
accum += stats.error_histogram[i];
if (accum >= sum*quantile) {
return (i+1) * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
}
}
return INFINITY;
}
void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) {
double rmse = sqrt(stats.total_error / (double) stats.num_samples);
double median = find_quantile(stats, .5);
double pct95 = find_quantile(stats, .95);
printf("%-50s: rmse %.8f, maxerr %.8f, 95pct<%.4f, median<%.4f\n", name.c_str(), rmse, stats.max_error, pct95, median);
if (print_histogram) {
printf("Error distribution:\n");
for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) {
double lower = i * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
double upper = (i+1) * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
if (i == HISTOGRAM_BUCKETS -1) upper = INFINITY;
printf("[%3.4f, %3.4f): %11" PRIu64 "\n", lower, upper, stats.error_histogram[i]);
}
}
}
// copied from ggml.h - verify that we can access this as a flat array
static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return
tensor->nb[0] == ggml_type_size(tensor->type) &&
tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
}
void test_roundtrip_on_chunk(
const ggml_tensor * layer,
int64_t offset,
int64_t chunk_size,
const quantize_fns_t & qfns,
bool use_reference,
float * input_scratch,
char * quantized_scratch,
float * output_scratch,
error_stats & stats) {
if (layer->type == GGML_TYPE_F16) {
for (int i = 0; i < chunk_size; i++) {
input_scratch[i] = ggml_get_f32_1d(layer, i + offset);
}
} else {
input_scratch = ggml_get_data_f32(layer) + offset;
}
if (use_reference) {
qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size);
} else {
qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size);
}
qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size);
update_error_stats(chunk_size, input_scratch, output_scratch, stats);
}
// Run quantization function for a single layer and update error stats
void test_roundtrip_on_layer(
std::string & name,
bool print_layer_stats,
const quantize_fns_t & qfns,
bool use_reference,
const ggml_tensor * layer,
std::vector<float> & input_scratch,
std::vector<char> & quantized_scratch,
std::vector<float> & output_scratch,
error_stats & total_error,
int max_thread = 0) {
assert(tensor_is_contiguous(layer));
error_stats layer_error {};
uint64_t nelements = ggml_nelements(layer);
float* input_scratch_ptr = nullptr;
if (layer->type == GGML_TYPE_F16) {
if (input_scratch.size() < nelements) input_scratch.resize(nelements);
input_scratch_ptr = input_scratch.data();
}
if (quantized_scratch.size() < 4*nelements) quantized_scratch.resize(4*nelements);
if (output_scratch.size() < nelements) output_scratch.resize(nelements);
if (max_thread < 1) max_thread = std::thread::hardware_concurrency();
int chunk_size = 32*512;
int num_chunks = (nelements + chunk_size - 1)/chunk_size;
if (num_chunks < 2 || max_thread < 2) {
test_roundtrip_on_chunk(layer, 0, nelements, qfns, use_reference, input_scratch_ptr, quantized_scratch.data(),
output_scratch.data(), print_layer_stats ? layer_error : total_error);
} else {
auto & stats = print_layer_stats ? layer_error : total_error;
std::mutex mutex;
uint64_t counter = 0;
auto compute = [&mutex, &counter, &stats, &qfns, nelements, layer, use_reference, input_scratch_ptr,
&quantized_scratch, &output_scratch, chunk_size] () {
error_stats local_stats {};
while (true) {
std::unique_lock<std::mutex> lock(mutex);
uint64_t offset = counter; counter += chunk_size;
if (offset >= nelements) {
combine_error_stats(stats, local_stats);
break;
}
lock.unlock();
uint64_t chunk = offset + chunk_size < nelements ? chunk_size : nelements - offset;
test_roundtrip_on_chunk(layer, offset, chunk, qfns, use_reference, input_scratch_ptr + offset,
quantized_scratch.data() + 4*offset, output_scratch.data() + offset, local_stats);
}
};
int nthread = std::min(num_chunks, max_thread);
std::vector<std::thread> workers(nthread-1);
for (auto& w : workers) w = std::thread(compute);
compute();
for (auto& w : workers) w.join();
}
if (print_layer_stats) {
print_error_stats(name, layer_error, false);
combine_error_stats(total_error, layer_error);
}
}
int main(int argc, char ** argv) {
ggml_time_init();
quantize_stats_params params;
// read command line
int max_thread = 0;
bool invalid_param = false;
std::string arg;
for (int i = 1; i < argc; i++) {
arg = argv[i];
if (arg == "-h" || arg == "--help") {
quantize_stats_print_usage(argc, argv);
exit(0);
} else if (arg == "-r" || arg == "--reference") {
params.reference = true;
} else if (arg == "-v") {
params.verbose = true;
} else if (arg == "-p" || arg == "--per-layer-stats") {
params.per_layer_stats = true;
} else if (arg == "--histogram") {
params.print_histogram = true;
} else if (arg == "-m" || arg == "--model") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.model = argv[i];
} else if (arg == "-l" || arg == "--include-layer") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.include_layers.push_back(argv[i]);
} else if (arg == "-L" || arg == "--exclude-layer") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.exclude_layers.push_back(argv[i]);
} else if (arg == "-t" || arg == "--type") {
if (++i >= argc) {
invalid_param = true;
break;
}
int j;
for (j = 0; j < GGML_TYPE_COUNT; ++j) {
const auto * name = ggml_type_name((ggml_type) j);
if (name && strcmp(argv[i], name) == 0) break;
}
if (j < GGML_TYPE_COUNT) {
params.include_types.push_back((ggml_type) j);
} else {
fprintf(stderr, "error: %s not in list of types\n", argv[i]);
invalid_param = true;
}
} else if (arg == "-n" || arg == "--num-threads") {
if (++i >= argc) {
invalid_param = true;
break;
}
max_thread = atoi(argv[i]);
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
quantize_stats_print_usage(argc, argv);
return 1;
}
}
if (invalid_param) {
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
quantize_stats_print_usage(argc, argv);
return 1;
}
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
// load the model
fprintf(stderr, "Loading model\n");
const int64_t t_main_start_us = ggml_time_us();
llama_model * model;
llama_context * ctx;
{
auto lparams = llama_context_default_params();
lparams.n_ctx = 256;
lparams.seed = 1;
lparams.f16_kv = false;
lparams.use_mlock = false;
model = llama_load_model_from_file(params.model.c_str(), lparams);
if (model == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
return 1;
}
ctx = llama_new_context_with_model(model, lparams);
if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
llama_free_model(model);
return 1;
}
}
const auto &tensors = llama_internal_get_tensor_map(ctx);
// check layer tensors
int included_layers = 0;
int64_t max_nelements = 0;
bool is_f16 = false;
for (const auto& kv_tensor : tensors) {
if (!layer_included(params, kv_tensor.first)) {
continue;
}
if (params.verbose) {
printf("%s: type %s, size %" PRId64 "\n", kv_tensor.first.c_str(), ggml_type_name(kv_tensor.second->type), ggml_nelements(kv_tensor.second));
}
if (kv_tensor.second->type == GGML_TYPE_F16) {
is_f16 = true;
} else if (kv_tensor.second->type != GGML_TYPE_F32) {
fprintf(stderr, "%s: error: Quantization should be tested with a float model, "
"this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type);
llama_free(ctx);
llama_free_model(model);
return 1;
}
included_layers++;
max_nelements = std::max(max_nelements, ggml_nelements(kv_tensor.second));
}
if (is_f16) {
printf("note: source model is f16\n");
}
printf("testing %d layers with max size %" PRId64 "\n", included_layers, max_nelements);
// allocate scratch space
std::vector<float> input_scratch;
std::vector<char> quantized_scratch;
std::vector<float> output_scratch;
// loop throught quantization types
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
const ggml_type type = (ggml_type) i;
if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
continue;
}
quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
if (qfns.quantize_row_q && qfns.dequantize_row_q) {
if (params.verbose) {
printf("testing %s ...\n", ggml_type_name(type));
}
error_stats global_stats {};
for (const auto& kv_tensor : tensors) {
if (!layer_included(params, kv_tensor.first)) {
continue;
}
if (params.verbose) {
printf(" %s ...\n", kv_tensor.first.c_str());
}
std::string layer_name { ggml_type_name(type) };
layer_name += "::" + kv_tensor.first;
test_roundtrip_on_layer(
layer_name,
params.per_layer_stats,
qfns,
params.reference,
kv_tensor.second,
input_scratch,
quantized_scratch,
output_scratch,
global_stats,
max_thread
);
}
print_error_stats(ggml_type_name(type), global_stats, params.print_histogram);
}
}
llama_free(ctx);
llama_free_model(model);
// report timing
{
const int64_t t_main_end_us = ggml_time_us();
printf("\n");
printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0);
}
return 0;
}

View File

@@ -1,4 +1,7 @@
set(TARGET quantize)
add_executable(${TARGET} quantize.cpp)
target_link_libraries(${TARGET} PRIVATE llama ggml ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

View File

@@ -1,53 +1,256 @@
#include "ggml.h"
#include "build-info.h"
#include "llama.h"
#include <cstdio>
#include <cstring>
#include <vector>
#include <string>
// usage:
// ./llama-quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type
//
int main(int argc, char ** argv) {
ggml_time_init();
struct quant_option {
std::string name;
llama_ftype ftype;
std::string desc;
};
if (argc != 4) {
fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
fprintf(stderr, " type = 2 - q4_0\n");
fprintf(stderr, " type = 3 - q4_1\n");
return 1;
}
// needed to initialize f16 tables
static const std::vector<struct quant_option> QUANT_OPTIONS = {
{
struct ggml_init_params params = { 0, NULL };
struct ggml_context * ctx = ggml_init(params);
ggml_free(ctx);
"Q4_0",
LLAMA_FTYPE_MOSTLY_Q4_0,
" 3.50G, +0.2499 ppl @ 7B - small, very high quality loss - legacy, prefer using Q3_K_M",
},
{
"Q4_1",
LLAMA_FTYPE_MOSTLY_Q4_1,
" 3.90G, +0.1846 ppl @ 7B - small, substantial quality loss - legacy, prefer using Q3_K_L",
},
{
"Q5_0",
LLAMA_FTYPE_MOSTLY_Q5_0,
" 4.30G, +0.0796 ppl @ 7B - medium, balanced quality - legacy, prefer using Q4_K_M",
},
{
"Q5_1",
LLAMA_FTYPE_MOSTLY_Q5_1,
" 4.70G, +0.0415 ppl @ 7B - medium, low quality loss - legacy, prefer using Q5_K_M",
},
#ifdef GGML_USE_K_QUANTS
{
"Q2_K",
LLAMA_FTYPE_MOSTLY_Q2_K,
" 2.67G, +0.8698 ppl @ 7B - smallest, extreme quality loss - not recommended",
},
{
"Q3_K",
LLAMA_FTYPE_MOSTLY_Q3_K_M,
"alias for Q3_K_M"
},
{
"Q3_K_S",
LLAMA_FTYPE_MOSTLY_Q3_K_S,
" 2.75G, +0.5505 ppl @ 7B - very small, very high quality loss",
},
{
"Q3_K_M",
LLAMA_FTYPE_MOSTLY_Q3_K_M,
" 3.06G, +0.2437 ppl @ 7B - very small, very high quality loss",
},
{
"Q3_K_L",
LLAMA_FTYPE_MOSTLY_Q3_K_L,
" 3.35G, +0.1803 ppl @ 7B - small, substantial quality loss",
},
{
"Q4_K",
LLAMA_FTYPE_MOSTLY_Q4_K_M,
"alias for Q4_K_M",
},
{
"Q4_K_S",
LLAMA_FTYPE_MOSTLY_Q4_K_S,
" 3.56G, +0.1149 ppl @ 7B - small, significant quality loss",
},
{
"Q4_K_M",
LLAMA_FTYPE_MOSTLY_Q4_K_M,
" 3.80G, +0.0535 ppl @ 7B - medium, balanced quality - *recommended*",
},
{
"Q5_K",
LLAMA_FTYPE_MOSTLY_Q5_K_M,
"alias for Q5_K_M",
},
{
"Q5_K_S",
LLAMA_FTYPE_MOSTLY_Q5_K_S,
" 4.33G, +0.0353 ppl @ 7B - large, low quality loss - *recommended*",
},
{
"Q5_K_M",
LLAMA_FTYPE_MOSTLY_Q5_K_M,
" 4.45G, +0.0142 ppl @ 7B - large, very low quality loss - *recommended*",
},
{
"Q6_K",
LLAMA_FTYPE_MOSTLY_Q6_K,
" 5.15G, +0.0044 ppl @ 7B - very large, extremely low quality loss",
},
#endif
{
"Q8_0",
LLAMA_FTYPE_MOSTLY_Q8_0,
" 6.70G, +0.0004 ppl @ 7B - very large, extremely low quality loss - not recommended",
},
{
"F16",
LLAMA_FTYPE_MOSTLY_F16,
"13.00G @ 7B - extremely large, virtually no quality loss - not recommended",
},
{
"F32",
LLAMA_FTYPE_ALL_F32,
"26.00G @ 7B - absolutely huge, lossless - not recommended",
},
};
bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
std::string ftype_str;
for (auto ch : ftype_str_in) {
ftype_str.push_back(std::toupper(ch));
}
for (auto & it : QUANT_OPTIONS) {
if (it.name == ftype_str) {
ftype = it.ftype;
ftype_str_out = it.name;
return true;
}
}
try {
int ftype_int = std::stoi(ftype_str);
for (auto & it : QUANT_OPTIONS) {
if (it.ftype == ftype_int) {
ftype = it.ftype;
ftype_str_out = it.name;
return true;
}
}
}
catch (...) {
// stoi failed
}
return false;
}
// usage:
// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
//
void usage(const char * executable) {
fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.bin [model-quant.bin] type [nthreads]\n\n", executable);
fprintf(stderr, " --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
fprintf(stderr, " --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
fprintf(stderr, "\nAllowed quantization types:\n");
for (auto & it : QUANT_OPTIONS) {
printf(" %2d or %-6s : %s\n", it.ftype, it.name.c_str(), it.desc.c_str());
}
exit(1);
}
int main(int argc, char ** argv) {
if (argc < 3) {
usage(argv[0]);
}
const std::string fname_inp = argv[1];
const std::string fname_out = argv[2];
llama_model_quantize_params params = llama_model_quantize_default_params();
const int itype = atoi(argv[3]);
int arg_idx = 1;
const int64_t t_main_start_us = ggml_time_us();
for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
params.quantize_output_tensor = false;
} else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
params.allow_requantize = true;
} else {
usage(argv[0]);
}
}
if (argc - arg_idx < 3) {
usage(argv[0]);
}
llama_init_backend(false);
// parse command line arguments
const std::string fname_inp = argv[arg_idx];
arg_idx++;
std::string fname_out;
std::string ftype_str;
if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
std::string fpath;
const size_t pos = fname_inp.find_last_of('/');
if (pos != std::string::npos) {
fpath = fname_inp.substr(0, pos + 1);
}
// export as [inp path]/ggml-model-[ftype].bin
fname_out = fpath + "ggml-model-" + ftype_str + ".bin";
arg_idx++;
}
else {
fname_out = argv[arg_idx];
arg_idx++;
if (argc <= arg_idx) {
fprintf(stderr, "%s: missing ftype\n", __func__);
return 1;
}
if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]);
return 1;
}
arg_idx++;
}
// parse nthreads
if (argc > arg_idx) {
try {
params.nthread = std::stoi(argv[arg_idx]);
}
catch (const std::exception & e) {
fprintf(stderr, "%s: invalid nthread '%s' (%s)\n", __func__, argv[arg_idx], e.what());
return 1;
}
}
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str());
if (params.nthread > 0) {
fprintf(stderr, " using %d threads", params.nthread);
}
fprintf(stderr, "\n");
const int64_t t_main_start_us = llama_time_us();
int64_t t_quantize_us = 0;
// load the model
{
const int64_t t_start_us = ggml_time_us();
const int64_t t_start_us = llama_time_us();
if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), itype)) {
if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), &params)) {
fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
return 1;
}
t_quantize_us = ggml_time_us() - t_start_us;
t_quantize_us = llama_time_us() - t_start_us;
}
// report timing
{
const int64_t t_main_end_us = ggml_time_us();
const int64_t t_main_end_us = llama_time_us();
printf("\n");
printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0);

17
examples/reason-act.sh Executable file
View File

@@ -0,0 +1,17 @@
#!/bin/bash
cd `dirname $0`
cd ..
# get -m model parameter otherwise defer to default
if [ "$1" == "-m" ]; then
MODEL="-m $2 "
fi
./main $MODEL --color \
-f ./prompts/reason-act.txt \
-i --interactive-first \
--top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7 -c 2048 \
-r "Question:" -r "Observation:" --in-prefix " " \
-n -1

View File

@@ -0,0 +1,7 @@
set(TARGET save-load-state)
add_executable(${TARGET} save-load-state.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

View File

@@ -0,0 +1,170 @@
#include "common.h"
#include "llama.h"
#include "build-info.h"
#include <vector>
#include <cstdio>
#include <chrono>
int main(int argc, char ** argv) {
gpt_params params;
params.seed = 42;
params.n_threads = 4;
params.repeat_last_n = 64;
params.prompt = "The quick brown fox";
if (gpt_params_parse(argc, argv, params) == false) {
return 1;
}
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
if (params.n_predict < 0) {
params.n_predict = 16;
}
auto lparams = llama_context_default_params();
lparams.n_ctx = params.n_ctx;
lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16;
lparams.use_mmap = params.use_mmap;
lparams.use_mlock = params.use_mlock;
auto n_past = 0;
auto last_n_tokens_data = std::vector<llama_token>(params.repeat_last_n, 0);
// init
auto model = llama_load_model_from_file(params.model.c_str(), lparams);
if (model == nullptr) {
return 1;
}
auto ctx = llama_new_context_with_model(model, lparams);
if (ctx == nullptr) {
llama_free_model(model);
return 1;
}
auto tokens = std::vector<llama_token>(params.n_ctx);
auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), int(tokens.size()), true);
if (n_prompt_tokens < 1) {
fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
llama_free(ctx);
llama_free_model(model);
return 1;
}
// evaluate prompt
llama_eval(ctx, tokens.data(), n_prompt_tokens, n_past, params.n_threads);
last_n_tokens_data.insert(last_n_tokens_data.end(), tokens.data(), tokens.data() + n_prompt_tokens);
n_past += n_prompt_tokens;
const size_t state_size = llama_get_state_size(ctx);
uint8_t * state_mem = new uint8_t[state_size];
// Save state (rng, logits, embedding and kv_cache) to file
{
FILE *fp_write = fopen("dump_state.bin", "wb");
llama_copy_state_data(ctx, state_mem); // could also copy directly to memory mapped file
fwrite(state_mem, 1, state_size, fp_write);
fclose(fp_write);
}
// save state (last tokens)
const auto last_n_tokens_data_saved = std::vector<llama_token>(last_n_tokens_data);
const auto n_past_saved = n_past;
// first run
printf("\n%s", params.prompt.c_str());
for (auto i = 0; i < params.n_predict; i++) {
auto logits = llama_get_logits(ctx);
auto n_vocab = llama_n_vocab(ctx);
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
auto next_token = llama_sample_token(ctx, &candidates_p);
auto next_token_str = llama_token_to_str(ctx, next_token);
last_n_tokens_data.push_back(next_token);
printf("%s", next_token_str);
if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
llama_free(ctx);
llama_free_model(model);
return 1;
}
n_past += 1;
}
printf("\n\n");
// free old context
llama_free(ctx);
// make new context
auto ctx2 = llama_new_context_with_model(model, lparams);
// Load state (rng, logits, embedding and kv_cache) from file
{
FILE *fp_read = fopen("dump_state.bin", "rb");
if (state_size != llama_get_state_size(ctx2)) {
fprintf(stderr, "\n%s : failed to validate state size\n", __func__);
llama_free(ctx2);
llama_free_model(model);
return 1;
}
const size_t ret = fread(state_mem, 1, state_size, fp_read);
if (ret != state_size) {
fprintf(stderr, "\n%s : failed to read state\n", __func__);
llama_free(ctx2);
llama_free_model(model);
return 1;
}
llama_set_state_data(ctx2, state_mem); // could also read directly from memory mapped file
fclose(fp_read);
}
delete[] state_mem;
// restore state (last tokens)
last_n_tokens_data = last_n_tokens_data_saved;
n_past = n_past_saved;
// second run
for (auto i = 0; i < params.n_predict; i++) {
auto logits = llama_get_logits(ctx2);
auto n_vocab = llama_n_vocab(ctx2);
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
auto next_token = llama_sample_token(ctx2, &candidates_p);
auto next_token_str = llama_token_to_str(ctx2, next_token);
last_n_tokens_data.push_back(next_token);
printf("%s", next_token_str);
if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
llama_free(ctx2);
llama_free_model(model);
return 1;
}
n_past += 1;
}
printf("\n\n");
llama_free(ctx2);
llama_free_model(model);
return 0;
}

View File

@@ -0,0 +1,12 @@
set(TARGET server)
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
add_executable(${TARGET} server.cpp json.hpp httplib.h)
target_compile_definitions(${TARGET} PRIVATE
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

208
examples/server/README.md Normal file
View File

@@ -0,0 +1,208 @@
# llama.cpp/example/server
This example demonstrates a simple HTTP API server to interact with llama.cpp.
Command line options:
- `--threads N`, `-t N`: Set the number of threads to use during computation.
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
- `-m ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
- `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS.
- `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `512`.
- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
- `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`.
- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`.
- `--port`: Set the port to listen. Default: `8080`.
- `--embedding`: Enable embedding extraction, Default: disabled.
## Build
Build llama.cpp with server from repository root with either make or CMake.
- Using `make`:
```bash
LLAMA_BUILD_SERVER=1 make
```
- Using `CMake`:
```bash
mkdir build-server
cd build-server
cmake -DLLAMA_BUILD_SERVER=ON ..
cmake --build . --config Release
```
## Quick Start
To get started right away, run the following command, making sure to use the correct path for the model you have:
### Unix-based systems (Linux, macOS, etc.):
```bash
./server -m models/7B/ggml-model.bin -c 2048
```
### Windows:
```powershell
server.exe -m models\7B\ggml-model.bin -c 2048
```
The above command will start a server that by default listens on `127.0.0.1:8080`.
You can consume the endpoints with Postman or NodeJS with axios library.
## Testing with CURL
Using [curl](https://curl.se/). On Windows `curl.exe` should be available in the base OS.
```sh
curl --request POST \
--url http://localhost:8080/completion \
--data '{"prompt": "Building a website can be done in 10 simple steps:","n_predict": 128}'
```
## Node JS Test
You need to have [Node.js](https://nodejs.org/en) installed.
```bash
mkdir llama-client
cd llama-client
npm init
npm install axios
```
Create a index.js file and put inside this:
```javascript
const axios = require("axios");
const prompt = `Building a website can be done in 10 simple steps:`;
async function Test() {
let result = await axios.post("http://127.0.0.1:8080/completion", {
prompt,
n_predict: 512,
});
// the response is received until completion finish
console.log(result.data.content);
}
Test();
```
And run it:
```bash
node .
```
## API Endpoints
- **POST** `/completion`: Given a prompt, it returns the predicted completion.
*Options:*
`temperature`: Adjust the randomness of the generated text (default: 0.8).
`top_k`: Limit the next token selection to the K most probable tokens (default: 40).
`top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9).
`n_predict`: Set the number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. (default: 128, -1 = infinity).
`n_keep`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context.
By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt.
`stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.
`prompt`: Provide a prompt. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluate. A space is inserted in the front like main.cpp does.
`stop`: Specify a JSON array of stopping strings.
These words will not be included in the completion, so make sure to add them to the prompt for the next iteration (default: []).
`tfs_z`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled).
`typical_p`: Enable locally typical sampling with parameter p (default: 1.0, 1.0 = disabled).
`repeat_penalty`: Control the repetition of token sequences in the generated text (default: 1.1).
`repeat_last_n`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size).
`penalize_nl`: Penalize newline tokens when applying the repeat penalty (default: true).
`presence_penalty`: Repeat alpha presence penalty (default: 0.0, 0.0 = disabled).
`frequency_penalty`: Repeat alpha frequency penalty (default: 0.0, 0.0 = disabled);
`mirostat`: Enable Mirostat sampling, controlling perplexity during text generation (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0).
`mirostat_tau`: Set the Mirostat target entropy, parameter tau (default: 5.0).
`mirostat_eta`: Set the Mirostat learning rate, parameter eta (default: 0.1).
`seed`: Set the random number generator (RNG) seed (default: -1, -1 = random seed).
`ignore_eos`: Ignore end of stream token and continue generating (default: false).
`logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced (default: []).
- **POST** `/tokenize`: Tokenize a given text.
*Options:*
`content`: Set the text to tokenize.
Note that the special `BOS` token is not added in fron of the text and also a space character is not inserted automatically as it is for `/completion`.
- **POST** `/embedding`: Generate embedding of a given text just as [the embedding example](../embedding) does.
*Options:*
`content`: Set the text to process.
## More examples
### Interactive mode
Check the sample in [chat.mjs](chat.mjs).
Run with NodeJS version 16 or later:
```sh
node chat.mjs
```
Another sample in [chat.sh](chat.sh).
Requires [bash](https://www.gnu.org/software/bash/), [curl](https://curl.se) and [jq](https://jqlang.github.io/jq/).
Run with bash:
```sh
bash chat.sh
```
### API like OAI
API example using Python Flask: [api_like_OAI.py](api_like_OAI.py)
This example must be used with server.cpp
```sh
python api_like_OAI.py
```
After running the API server, you can use it in Python by setting the API base URL.
```python
openai.api_base = "http://<Your api-server IP>:port"
```
Then you can utilize llama.cpp as an OpenAI's **chat.completion** or **text_completion** API

219
examples/server/api_like_OAI.py Executable file
View File

@@ -0,0 +1,219 @@
import argparse
from flask import Flask, jsonify, request, Response
import urllib.parse
import requests
import time
import json
app = Flask(__name__)
parser = argparse.ArgumentParser(description="An example of using server.cpp with a similar API to OAI. It must be used together with server.cpp.")
parser.add_argument("--chat-prompt", type=str, help="the top prompt in chat completions(default: 'A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n')", default='A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n')
parser.add_argument("--user-name", type=str, help="USER name in chat completions(default: '\\nUSER: ')", default="\\nUSER: ")
parser.add_argument("--ai-name", type=str, help="ASSISTANT name in chat completions(default: '\\nASSISTANT: ')", default="\\nASSISTANT: ")
parser.add_argument("--system-name", type=str, help="SYSTEM name in chat completions(default: '\\nASSISTANT's RULE: ')", default="\\nASSISTANT's RULE: ")
parser.add_argument("--stop", type=str, help="the end of response in chat completions(default: '</s>')", default="</s>")
parser.add_argument("--llama-api", type=str, help="Set the address of server.cpp in llama.cpp(default: http://127.0.0.1:8080)", default='http://127.0.0.1:8080')
parser.add_argument("--api-key", type=str, help="Set the api key to allow only few user(default: NULL)", default="")
parser.add_argument("--host", type=str, help="Set the ip address to listen.(default: 127.0.0.1)", default='127.0.0.1')
parser.add_argument("--port", type=int, help="Set the port to listen.(default: 8081)", default=8081)
args = parser.parse_args()
def is_present(json, key):
try:
buf = json[key]
except KeyError:
return False
return True
#convert chat to prompt
def convert_chat(messages):
prompt = "" + args.chat_prompt.replace("\\n", "\n")
system_n = args.system_name.replace("\\n", "\n")
user_n = args.user_name.replace("\\n", "\n")
ai_n = args.ai_name.replace("\\n", "\n")
stop = args.stop.replace("\\n", "\n")
for line in messages:
if (line["role"] == "system"):
prompt += f"{system_n}{line['content']}"
if (line["role"] == "user"):
prompt += f"{user_n}{line['content']}"
if (line["role"] == "assistant"):
prompt += f"{ai_n}{line['content']}{stop}"
prompt += ai_n.rstrip()
return prompt
def make_postData(body, chat=False, stream=False):
postData = {}
if (chat):
postData["prompt"] = convert_chat(body["messages"])
else:
postData["prompt"] = body["prompt"]
if(is_present(body, "temperature")): postData["temperature"] = body["temperature"]
if(is_present(body, "top_k")): postData["top_k"] = body["top_k"]
if(is_present(body, "top_p")): postData["top_p"] = body["top_p"]
if(is_present(body, "max_tokens")): postData["n_predict"] = body["max_tokens"]
if(is_present(body, "presence_penalty")): postData["presence_penalty"] = body["presence_penalty"]
if(is_present(body, "frequency_penalty")): postData["frequency_penalty"] = body["frequency_penalty"]
if(is_present(body, "repeat_penalty")): postData["repeat_penalty"] = body["repeat_penalty"]
if(is_present(body, "mirostat")): postData["mirostat"] = body["mirostat"]
if(is_present(body, "mirostat_tau")): postData["mirostat_tau"] = body["mirostat_tau"]
if(is_present(body, "mirostat_eta")): postData["mirostat_eta"] = body["mirostat_eta"]
if(is_present(body, "seed")): postData["seed"] = body["seed"]
if(is_present(body, "logit_bias")): postData["logit_bias"] = [[int(token), body["logit_bias"][token]] for token in body["logit_bias"].keys()]
if (args.stop != ""):
postData["stop"] = [args.stop]
else:
postData["stop"] = []
if(is_present(body, "stop")): postData["stop"] += body["stop"]
postData["n_keep"] = -1
postData["stream"] = stream
return postData
def make_resData(data, chat=False, promptToken=[]):
resData = {
"id": "chatcmpl" if (chat) else "cmpl",
"object": "chat.completion" if (chat) else "text_completion",
"created": int(time.time()),
"truncated": data["truncated"],
"model": "LLaMA_CPP",
"usage": {
"prompt_tokens": data["tokens_evaluated"],
"completion_tokens": data["tokens_predicted"],
"total_tokens": data["tokens_evaluated"] + data["tokens_predicted"]
}
}
if (len(promptToken) != 0):
resData["promptToken"] = promptToken
if (chat):
#only one choice is supported
resData["choices"] = [{
"index": 0,
"message": {
"role": "assistant",
"content": data["content"],
},
"finish_reason": "stop" if (data["stopped_eos"] or data["stopped_word"]) else "length"
}]
else:
#only one choice is supported
resData["choices"] = [{
"text": data["content"],
"index": 0,
"logprobs": None,
"finish_reason": "stop" if (data["stopped_eos"] or data["stopped_word"]) else "length"
}]
return resData
def make_resData_stream(data, chat=False, time_now = 0, start=False):
resData = {
"id": "chatcmpl" if (chat) else "cmpl",
"object": "chat.completion.chunk" if (chat) else "text_completion.chunk",
"created": time_now,
"model": "LLaMA_CPP",
"choices": [
{
"finish_reason": None,
"index": 0
}
]
}
if (chat):
if (start):
resData["choices"][0]["delta"] = {
"role": "assistant"
}
else:
resData["choices"][0]["delta"] = {
"content": data["content"]
}
if (data["stop"]):
resData["choices"][0]["finish_reason"] = "stop" if (data["stopped_eos"] or data["stopped_word"]) else "length"
else:
resData["choices"][0]["text"] = data["content"]
if (data["stop"]):
resData["choices"][0]["finish_reason"] = "stop" if (data["stopped_eos"] or data["stopped_word"]) else "length"
return resData
@app.route('/chat/completions', methods=['POST'])
@app.route('/v1/chat/completions', methods=['POST'])
def chat_completions():
if (args.api_key != "" and request.headers["Authorization"].split()[1] != args.api_key):
return Response(status=403)
body = request.get_json()
stream = False
tokenize = False
if(is_present(body, "stream")): stream = body["stream"]
if(is_present(body, "tokenize")): tokenize = body["tokenize"]
postData = make_postData(body, chat=True, stream=stream)
promptToken = []
if (tokenize):
tokenData = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/tokenize"), data=json.dumps({"content": postData["prompt"]})).json()
promptToken = tokenData["tokens"]
if (not stream):
data = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/completion"), data=json.dumps(postData))
print(data.json())
resData = make_resData(data.json(), chat=True, promptToken=promptToken)
return jsonify(resData)
else:
def generate():
data = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/completion"), data=json.dumps(postData), stream=True)
time_now = int(time.time())
resData = make_resData_stream({}, chat=True, time_now=time_now, start=True)
yield 'data: {}\n'.format(json.dumps(resData))
for line in data.iter_lines():
if line:
decoded_line = line.decode('utf-8')
resData = make_resData_stream(json.loads(decoded_line[6:]), chat=True, time_now=time_now)
yield 'data: {}\n'.format(json.dumps(resData))
return Response(generate(), mimetype='text/event-stream')
@app.route('/completions', methods=['POST'])
@app.route('/v1/completions', methods=['POST'])
def completion():
if (args.api_key != "" and request.headers["Authorization"].split()[1] != args.api_key):
return Response(status=403)
body = request.get_json()
stream = False
tokenize = False
if(is_present(body, "stream")): stream = body["stream"]
if(is_present(body, "tokenize")): tokenize = body["tokenize"]
postData = make_postData(body, chat=False, stream=stream)
promptToken = []
if (tokenize):
tokenData = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/tokenize"), data=json.dumps({"content": postData["prompt"]})).json()
promptToken = tokenData["tokens"]
if (not stream):
data = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/completion"), data=json.dumps(postData))
print(data.json())
resData = make_resData(data.json(), chat=False, promptToken=promptToken)
return jsonify(resData)
else:
def generate():
data = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/completion"), data=json.dumps(postData), stream=True)
time_now = int(time.time())
for line in data.iter_lines():
if line:
decoded_line = line.decode('utf-8')
resData = make_resData_stream(json.loads(decoded_line[6:]), chat=False, time_now=time_now)
yield 'data: {}\n'.format(json.dumps(resData))
return Response(generate(), mimetype='text/event-stream')
if __name__ == '__main__':
app.run(args.host, port=args.port)

89
examples/server/chat.mjs Normal file
View File

@@ -0,0 +1,89 @@
import * as readline from 'node:readline'
import { stdin, stdout } from 'node:process'
const API_URL = 'http://127.0.0.1:8080'
const chat = [
{
human: "Hello, Assistant.",
assistant: "Hello. How may I help you today?"
},
{
human: "Please tell me the largest city in Europe.",
assistant: "Sure. The largest city in Europe is Moscow, the capital of Russia."
},
]
const instruction = `A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.`
function format_prompt(question) {
return `${instruction}\n${
chat.map(m =>`### Human: ${m.human}\n### Assistant: ${m.assistant}`).join("\n")
}\n### Human: ${question}\n### Assistant:`
}
async function tokenize(content) {
const result = await fetch(`${API_URL}/tokenize`, {
method: 'POST',
body: JSON.stringify({ content })
})
if (!result.ok) {
return []
}
return await result.json().tokens
}
const n_keep = await tokenize(instruction).length
async function chat_completion(question) {
const result = await fetch(`${API_URL}/completion`, {
method: 'POST',
body: JSON.stringify({
prompt: format_prompt(question),
temperature: 0.2,
top_k: 40,
top_p: 0.9,
n_keep: n_keep,
n_predict: 256,
stop: ["\n### Human:"], // stop completion after generating this
stream: true,
})
})
if (!result.ok) {
return
}
let answer = ''
for await (var chunk of result.body) {
const t = Buffer.from(chunk).toString('utf8')
if (t.startsWith('data: ')) {
const message = JSON.parse(t.substring(6))
answer += message.content
process.stdout.write(message.content)
if (message.stop) {
if (message.truncated) {
chat.shift()
}
break
}
}
}
process.stdout.write('\n')
chat.push({ human: question, assistant: answer.trimStart() })
}
const rl = readline.createInterface({ input: stdin, output: stdout });
const readlineQuestion = (rl, query, options) => new Promise((resolve, reject) => {
rl.question(query, options, resolve)
});
while(true) {
const question = await readlineQuestion(rl, '> ')
await chat_completion(question)
}

77
examples/server/chat.sh Normal file
View File

@@ -0,0 +1,77 @@
#!/bin/bash
API_URL="${API_URL:-http://127.0.0.1:8080}"
CHAT=(
"Hello, Assistant."
"Hello. How may I help you today?"
"Please tell me the largest city in Europe."
"Sure. The largest city in Europe is Moscow, the capital of Russia."
)
INSTRUCTION="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."
trim() {
shopt -s extglob
set -- "${1##+([[:space:]])}"
printf "%s" "${1%%+([[:space:]])}"
}
trim_trailing() {
shopt -s extglob
printf "%s" "${1%%+([[:space:]])}"
}
format_prompt() {
echo -n "${INSTRUCTION}"
printf "\n### Human: %s\n### Assistant: %s" "${CHAT[@]}" "$1"
}
tokenize() {
curl \
--silent \
--request POST \
--url "${API_URL}/tokenize" \
--data-raw "$(jq -ns --arg content "$1" '{content:$content}')" \
| jq '.tokens[]'
}
N_KEEP=$(tokenize "${INSTRUCTION}" | wc -l)
chat_completion() {
PROMPT="$(trim_trailing "$(format_prompt "$1")")"
DATA="$(echo -n "$PROMPT" | jq -Rs --argjson n_keep $N_KEEP '{
prompt: .,
temperature: 0.2,
top_k: 40,
top_p: 0.9,
n_keep: $n_keep,
n_predict: 256,
stop: ["\n### Human:"],
stream: true
}')"
ANSWER=''
while IFS= read -r LINE; do
if [[ $LINE = data:* ]]; then
CONTENT="$(echo "${LINE:5}" | jq -r '.content')"
printf "%s" "${CONTENT}"
ANSWER+="${CONTENT}"
fi
done < <(curl \
--silent \
--no-buffer \
--request POST \
--url "${API_URL}/completion" \
--data-raw "${DATA}")
printf "\n"
CHAT+=("$1" "$(trim "$ANSWER")")
}
while true; do
read -r -e -p "> " QUESTION
chat_completion "${QUESTION}"
done

View File

@@ -0,0 +1,193 @@
unsigned char completion_js[] = {
0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x44,
0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x73, 0x20, 0x3d, 0x20, 0x7b, 0x0a,
0x20, 0x20, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x3a, 0x20, 0x74, 0x72,
0x75, 0x65, 0x2c, 0x0a, 0x20, 0x20, 0x6e, 0x5f, 0x70, 0x72, 0x65, 0x64,
0x69, 0x63, 0x74, 0x3a, 0x20, 0x35, 0x30, 0x30, 0x2c, 0x0a, 0x20, 0x20,
0x74, 0x65, 0x6d, 0x70, 0x65, 0x72, 0x61, 0x74, 0x75, 0x72, 0x65, 0x3a,
0x20, 0x30, 0x2e, 0x32, 0x2c, 0x0a, 0x20, 0x20, 0x73, 0x74, 0x6f, 0x70,
0x3a, 0x20, 0x5b, 0x22, 0x3c, 0x2f, 0x73, 0x3e, 0x22, 0x5d, 0x0a, 0x7d,
0x3b, 0x0a, 0x0a, 0x2f, 0x2a, 0x2a, 0x0a, 0x20, 0x2a, 0x20, 0x54, 0x68,
0x69, 0x73, 0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20,
0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x65, 0x73, 0x20, 0x74, 0x68,
0x65, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, 0x65, 0x78, 0x74,
0x20, 0x75, 0x73, 0x69, 0x6e, 0x67, 0x20, 0x61, 0x20, 0x6c, 0x6c, 0x61,
0x6d, 0x61, 0x20, 0x64, 0x69, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x61, 0x72,
0x79, 0x2e, 0x0a, 0x20, 0x2a, 0x20, 0x40, 0x70, 0x61, 0x72, 0x61, 0x6d,
0x20, 0x7b, 0x6f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x7d, 0x20, 0x70, 0x61,
0x72, 0x61, 0x6d, 0x73, 0x20, 0x2d, 0x20, 0x54, 0x68, 0x65, 0x20, 0x70,
0x61, 0x72, 0x61, 0x6d, 0x65, 0x74, 0x65, 0x72, 0x73, 0x20, 0x66, 0x6f,
0x72, 0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65,
0x74, 0x69, 0x6f, 0x6e, 0x20, 0x72, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74,
0x2e, 0x0a, 0x20, 0x2a, 0x20, 0x40, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x20,
0x7b, 0x6f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x7d, 0x20, 0x63, 0x6f, 0x6e,
0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x20, 0x2d, 0x20, 0x61, 0x6e,
0x20, 0x69, 0x6e, 0x73, 0x74, 0x61, 0x6e, 0x63, 0x65, 0x20, 0x6f, 0x66,
0x20, 0x41, 0x62, 0x6f, 0x72, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x72, 0x6f,
0x6c, 0x6c, 0x65, 0x72, 0x20, 0x69, 0x66, 0x20, 0x79, 0x6f, 0x75, 0x20,
0x6e, 0x65, 0x65, 0x64, 0x20, 0x6f, 0x6e, 0x65, 0x2c, 0x20, 0x6f, 0x72,
0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x2e, 0x0a, 0x20, 0x2a, 0x20, 0x40, 0x70,
0x61, 0x72, 0x61, 0x6d, 0x20, 0x7b, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69,
0x6f, 0x6e, 0x7d, 0x20, 0x63, 0x61, 0x6c, 0x6c, 0x62, 0x61, 0x63, 0x6b,
0x20, 0x2d, 0x20, 0x54, 0x68, 0x65, 0x20, 0x63, 0x61, 0x6c, 0x6c, 0x62,
0x61, 0x63, 0x6b, 0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e,
0x20, 0x74, 0x6f, 0x20, 0x63, 0x61, 0x6c, 0x6c, 0x20, 0x77, 0x68, 0x65,
0x6e, 0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65,
0x74, 0x69, 0x6f, 0x6e, 0x20, 0x69, 0x73, 0x20, 0x64, 0x6f, 0x6e, 0x65,
0x2e, 0x0a, 0x20, 0x2a, 0x20, 0x40, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e,
0x73, 0x20, 0x7b, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x7d, 0x20, 0x74,
0x68, 0x65, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x65, 0x64,
0x20, 0x74, 0x65, 0x78, 0x74, 0x20, 0x61, 0x73, 0x20, 0x61, 0x20, 0x73,
0x74, 0x72, 0x69, 0x6e, 0x67, 0x2e, 0x20, 0x49, 0x64, 0x65, 0x61, 0x6c,
0x6c, 0x79, 0x20, 0x69, 0x67, 0x6e, 0x6f, 0x72, 0x65, 0x64, 0x2c, 0x20,
0x61, 0x6e, 0x64, 0x20, 0x79, 0x6f, 0x75, 0x20, 0x67, 0x65, 0x74, 0x20,
0x61, 0x74, 0x20, 0x69, 0x74, 0x20, 0x76, 0x69, 0x61, 0x20, 0x74, 0x68,
0x65, 0x20, 0x63, 0x61, 0x6c, 0x6c, 0x62, 0x61, 0x63, 0x6b, 0x2e, 0x0a,
0x20, 0x2a, 0x2f, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63,
0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x43, 0x6f,
0x6d, 0x70, 0x6c, 0x65, 0x74, 0x65, 0x20, 0x3d, 0x20, 0x61, 0x73, 0x79,
0x6e, 0x63, 0x20, 0x28, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20,
0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2c, 0x20,
0x63, 0x61, 0x6c, 0x6c, 0x62, 0x61, 0x63, 0x6b, 0x29, 0x20, 0x3d, 0x3e,
0x20, 0x7b, 0x0a, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x63, 0x6f,
0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x29, 0x20, 0x7b, 0x0a,
0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c,
0x65, 0x72, 0x20, 0x3d, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x41, 0x62, 0x6f,
0x72, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72,
0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x63, 0x6f,
0x6e, 0x73, 0x74, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69,
0x6f, 0x6e, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x7b,
0x20, 0x2e, 0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x44, 0x65, 0x66,
0x61, 0x75, 0x6c, 0x74, 0x73, 0x2c, 0x20, 0x2e, 0x2e, 0x2e, 0x70, 0x61,
0x72, 0x61, 0x6d, 0x73, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x2f,
0x2f, 0x20, 0x77, 0x65, 0x20, 0x75, 0x73, 0x65, 0x20, 0x66, 0x65, 0x74,
0x63, 0x68, 0x20, 0x64, 0x69, 0x72, 0x65, 0x63, 0x74, 0x6c, 0x79, 0x20,
0x68, 0x65, 0x72, 0x65, 0x20, 0x62, 0x65, 0x63, 0x61, 0x73, 0x75, 0x65,
0x20, 0x74, 0x68, 0x65, 0x20, 0x62, 0x75, 0x69, 0x6c, 0x74, 0x20, 0x69,
0x6e, 0x20, 0x66, 0x65, 0x74, 0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74,
0x53, 0x6f, 0x75, 0x72, 0x63, 0x65, 0x20, 0x64, 0x6f, 0x65, 0x73, 0x20,
0x6e, 0x6f, 0x74, 0x20, 0x73, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, 0x20,
0x50, 0x4f, 0x53, 0x54, 0x0a, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
0x20, 0x72, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x20, 0x3d, 0x20,
0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x66, 0x65, 0x74, 0x63, 0x68, 0x28,
0x22, 0x2f, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e,
0x22, 0x2c, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x65, 0x74,
0x68, 0x6f, 0x64, 0x3a, 0x20, 0x27, 0x50, 0x4f, 0x53, 0x54, 0x27, 0x2c,
0x0a, 0x20, 0x20, 0x20, 0x20, 0x62, 0x6f, 0x64, 0x79, 0x3a, 0x20, 0x4a,
0x53, 0x4f, 0x4e, 0x2e, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x69, 0x66,
0x79, 0x28, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e,
0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x29, 0x2c, 0x0a, 0x20, 0x20, 0x20,
0x20, 0x68, 0x65, 0x61, 0x64, 0x65, 0x72, 0x73, 0x3a, 0x20, 0x7b, 0x0a,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x27, 0x43, 0x6f, 0x6e, 0x6e, 0x65,
0x63, 0x74, 0x69, 0x6f, 0x6e, 0x27, 0x3a, 0x20, 0x27, 0x6b, 0x65, 0x65,
0x70, 0x2d, 0x61, 0x6c, 0x69, 0x76, 0x65, 0x27, 0x2c, 0x0a, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x27, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74,
0x2d, 0x54, 0x79, 0x70, 0x65, 0x27, 0x3a, 0x20, 0x27, 0x61, 0x70, 0x70,
0x6c, 0x69, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x2f, 0x6a, 0x73, 0x6f,
0x6e, 0x27, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x27, 0x41,
0x63, 0x63, 0x65, 0x70, 0x74, 0x27, 0x3a, 0x20, 0x27, 0x74, 0x65, 0x78,
0x74, 0x2f, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x2d, 0x73, 0x74, 0x72, 0x65,
0x61, 0x6d, 0x27, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x2c, 0x0a, 0x20,
0x20, 0x20, 0x20, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x3a, 0x20, 0x63,
0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x73, 0x69,
0x67, 0x6e, 0x61, 0x6c, 0x2c, 0x0a, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a,
0x0a, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x61,
0x64, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x72, 0x65, 0x73, 0x70, 0x6f, 0x6e,
0x73, 0x65, 0x2e, 0x62, 0x6f, 0x64, 0x79, 0x2e, 0x67, 0x65, 0x74, 0x52,
0x65, 0x61, 0x64, 0x65, 0x72, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x63,
0x6f, 0x6e, 0x73, 0x74, 0x20, 0x64, 0x65, 0x63, 0x6f, 0x64, 0x65, 0x72,
0x20, 0x3d, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x54, 0x65, 0x78, 0x74, 0x44,
0x65, 0x63, 0x6f, 0x64, 0x65, 0x72, 0x28, 0x29, 0x3b, 0x0a, 0x0a, 0x20,
0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74,
0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x74, 0x72,
0x79, 0x20, 0x7b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74,
0x20, 0x63, 0x6f, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x74, 0x72, 0x75, 0x65,
0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65,
0x20, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65,
0x73, 0x75, 0x6c, 0x74, 0x20, 0x3d, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74,
0x20, 0x72, 0x65, 0x61, 0x64, 0x65, 0x72, 0x2e, 0x72, 0x65, 0x61, 0x64,
0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66,
0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x6f, 0x6e,
0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f,
0x2f, 0x20, 0x73, 0x73, 0x65, 0x20, 0x61, 0x6e, 0x73, 0x77, 0x65, 0x72,
0x73, 0x20, 0x69, 0x6e, 0x20, 0x74, 0x68, 0x65, 0x20, 0x66, 0x6f, 0x72,
0x6d, 0x20, 0x6d, 0x75, 0x6c, 0x74, 0x69, 0x70, 0x6c, 0x65, 0x20, 0x6c,
0x69, 0x6e, 0x65, 0x73, 0x20, 0x6f, 0x66, 0x3a, 0x20, 0x76, 0x61, 0x6c,
0x75, 0x65, 0x5c, 0x6e, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x64, 0x61,
0x74, 0x61, 0x20, 0x61, 0x6c, 0x77, 0x61, 0x79, 0x73, 0x20, 0x70, 0x72,
0x65, 0x73, 0x65, 0x6e, 0x74, 0x20, 0x61, 0x73, 0x20, 0x61, 0x20, 0x6b,
0x65, 0x79, 0x2e, 0x20, 0x69, 0x6e, 0x20, 0x6f, 0x75, 0x72, 0x20, 0x63,
0x61, 0x73, 0x65, 0x20, 0x77, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x2f, 0x2f, 0x20, 0x6d, 0x61, 0x69, 0x6e, 0x6c, 0x79, 0x20, 0x63,
0x61, 0x72, 0x65, 0x20, 0x61, 0x62, 0x6f, 0x75, 0x74, 0x20, 0x74, 0x68,
0x65, 0x20, 0x64, 0x61, 0x74, 0x61, 0x3a, 0x20, 0x6b, 0x65, 0x79, 0x20,
0x68, 0x65, 0x72, 0x65, 0x2c, 0x20, 0x77, 0x68, 0x69, 0x63, 0x68, 0x20,
0x77, 0x65, 0x20, 0x65, 0x78, 0x70, 0x65, 0x63, 0x74, 0x20, 0x61, 0x73,
0x20, 0x6a, 0x73, 0x6f, 0x6e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x74, 0x65, 0x78, 0x74, 0x20, 0x3d,
0x20, 0x64, 0x65, 0x63, 0x6f, 0x64, 0x65, 0x72, 0x2e, 0x64, 0x65, 0x63,
0x6f, 0x64, 0x65, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x76,
0x61, 0x6c, 0x75, 0x65, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x2f, 0x2f, 0x20, 0x70, 0x61, 0x72, 0x73, 0x65, 0x20, 0x61,
0x6c, 0x6c, 0x20, 0x73, 0x73, 0x65, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74,
0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x61, 0x64, 0x64, 0x20, 0x74, 0x68,
0x65, 0x6d, 0x20, 0x74, 0x6f, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74,
0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
0x20, 0x72, 0x65, 0x67, 0x65, 0x78, 0x20, 0x3d, 0x20, 0x2f, 0x5e, 0x28,
0x5c, 0x53, 0x2b, 0x29, 0x3a, 0x5c, 0x73, 0x28, 0x2e, 0x2a, 0x29, 0x24,
0x2f, 0x67, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66,
0x6f, 0x72, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6d, 0x61,
0x74, 0x63, 0x68, 0x20, 0x6f, 0x66, 0x20, 0x74, 0x65, 0x78, 0x74, 0x2e,
0x6d, 0x61, 0x74, 0x63, 0x68, 0x41, 0x6c, 0x6c, 0x28, 0x72, 0x65, 0x67,
0x65, 0x78, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x5b, 0x6d, 0x61,
0x74, 0x63, 0x68, 0x5b, 0x31, 0x5d, 0x5d, 0x20, 0x3d, 0x20, 0x6d, 0x61,
0x74, 0x63, 0x68, 0x5b, 0x32, 0x5d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f,
0x20, 0x73, 0x69, 0x6e, 0x63, 0x65, 0x20, 0x77, 0x65, 0x20, 0x6b, 0x6e,
0x6f, 0x77, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x6c,
0x6c, 0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70, 0x70, 0x2c, 0x20, 0x6c, 0x65,
0x74, 0x27, 0x73, 0x20, 0x6a, 0x75, 0x73, 0x74, 0x20, 0x64, 0x65, 0x63,
0x6f, 0x64, 0x65, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6a, 0x73, 0x6f, 0x6e,
0x20, 0x69, 0x6e, 0x20, 0x64, 0x61, 0x74, 0x61, 0x0a, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61,
0x74, 0x61, 0x20, 0x3d, 0x20, 0x4a, 0x53, 0x4f, 0x4e, 0x2e, 0x70, 0x61,
0x72, 0x73, 0x65, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64,
0x61, 0x74, 0x61, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x2b, 0x3d, 0x20, 0x72,
0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x63,
0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x63, 0x61, 0x6c, 0x6c, 0x61, 0x63,
0x6b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28,
0x63, 0x61, 0x6c, 0x6c, 0x62, 0x61, 0x63, 0x6b, 0x29, 0x20, 0x7b, 0x0a,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74,
0x20, 0x3d, 0x20, 0x63, 0x61, 0x6c, 0x6c, 0x62, 0x61, 0x63, 0x6b, 0x28,
0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x29, 0x20, 0x21, 0x3d, 0x20, 0x66,
0x61, 0x6c, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20,
0x69, 0x66, 0x20, 0x77, 0x65, 0x20, 0x67, 0x6f, 0x74, 0x20, 0x61, 0x20,
0x73, 0x74, 0x6f, 0x70, 0x20, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x20, 0x66,
0x72, 0x6f, 0x6d, 0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2c, 0x20,
0x77, 0x65, 0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x62, 0x72, 0x65, 0x61,
0x6b, 0x20, 0x68, 0x65, 0x72, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e,
0x64, 0x61, 0x74, 0x61, 0x2e, 0x73, 0x74, 0x6f, 0x70, 0x29, 0x20, 0x7b,
0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x72, 0x65,
0x61, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a,
0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x63, 0x61,
0x74, 0x63, 0x68, 0x20, 0x28, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x65, 0x72,
0x72, 0x6f, 0x72, 0x28, 0x22, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x20, 0x65,
0x72, 0x72, 0x6f, 0x72, 0x3a, 0x20, 0x22, 0x2c, 0x20, 0x65, 0x29, 0x3b,
0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x65,
0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x66, 0x69, 0x6e, 0x61,
0x6c, 0x6c, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f,
0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x61, 0x62, 0x6f,
0x72, 0x74, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20,
0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x63, 0x6f, 0x6e, 0x74,
0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x7d, 0x0a
};
unsigned int completion_js_len = 2275;

22
examples/server/deps.sh Executable file
View File

@@ -0,0 +1,22 @@
#!/bin/bash
# Download and update deps for binary
# get the directory of this script file
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
PUBLIC=$DIR/public
OUTPUT=$DIR/templats.hpp
echo "// Generated file, do not edit" > $OUTPUT
echo "" > $OUTPUT
echo "download js bundle files"
curl https://npm.reversehttp.com/@preact/signals-core,@preact/signals,htm/preact,preact,preact/hooks > $PUBLIC/index.js
echo >> $PUBLIC/index.js # add newline
FILES=$(ls $PUBLIC)
for FILE in $FILES; do
func=$(echo $FILE | tr '.' '_')
echo "generate $FILE.hpp ($func)"
xxd -n $func -i $PUBLIC/$FILE > $DIR/$FILE.hpp
done

8794
examples/server/httplib.h Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,846 @@
unsigned char index_html[] = {
0x3c, 0x68, 0x74, 0x6d, 0x6c, 0x3e, 0x0a, 0x0a, 0x3c, 0x68, 0x65, 0x61,
0x64, 0x3e, 0x0a, 0x20, 0x20, 0x3c, 0x6d, 0x65, 0x74, 0x61, 0x20, 0x63,
0x68, 0x61, 0x72, 0x73, 0x65, 0x74, 0x3d, 0x22, 0x55, 0x54, 0x46, 0x2d,
0x38, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x3c, 0x6d, 0x65, 0x74, 0x61, 0x20,
0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x76, 0x69, 0x65, 0x77, 0x70, 0x6f,
0x72, 0x74, 0x22, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3d,
0x22, 0x77, 0x69, 0x64, 0x74, 0x68, 0x3d, 0x64, 0x65, 0x76, 0x69, 0x63,
0x65, 0x2d, 0x77, 0x69, 0x64, 0x74, 0x68, 0x2c, 0x20, 0x69, 0x6e, 0x69,
0x74, 0x69, 0x61, 0x6c, 0x2d, 0x73, 0x63, 0x61, 0x6c, 0x65, 0x3d, 0x31,
0x2c, 0x20, 0x6d, 0x61, 0x78, 0x69, 0x6d, 0x75, 0x6d, 0x2d, 0x73, 0x63,
0x61, 0x6c, 0x65, 0x3d, 0x31, 0x22, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20,
0x3c, 0x74, 0x69, 0x74, 0x6c, 0x65, 0x3e, 0x6c, 0x6c, 0x61, 0x6d, 0x61,
0x2e, 0x63, 0x70, 0x70, 0x20, 0x2d, 0x20, 0x63, 0x68, 0x61, 0x74, 0x3c,
0x2f, 0x74, 0x69, 0x74, 0x6c, 0x65, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x3c,
0x73, 0x74, 0x79, 0x6c, 0x65, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x62, 0x6f, 0x64, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x62, 0x61, 0x63, 0x6b, 0x67, 0x72, 0x6f, 0x75, 0x6e, 0x64, 0x2d,
0x63, 0x6f, 0x6c, 0x6f, 0x72, 0x3a, 0x20, 0x23, 0x66, 0x66, 0x66, 0x3b,
0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6c, 0x6f, 0x72,
0x3a, 0x20, 0x23, 0x30, 0x30, 0x30, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x66, 0x6f, 0x6e, 0x74, 0x2d, 0x66, 0x61, 0x6d, 0x69, 0x6c,
0x79, 0x3a, 0x20, 0x73, 0x79, 0x73, 0x74, 0x65, 0x6d, 0x2d, 0x75, 0x69,
0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x6e, 0x74,
0x2d, 0x73, 0x69, 0x7a, 0x65, 0x3a, 0x20, 0x39, 0x30, 0x25, 0x3b, 0x0a,
0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x23,
0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x20, 0x7b, 0x0a,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x61, 0x72, 0x67, 0x69, 0x6e,
0x3a, 0x20, 0x30, 0x65, 0x6d, 0x20, 0x61, 0x75, 0x74, 0x6f, 0x3b, 0x0a,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x69, 0x73, 0x70, 0x6c, 0x61,
0x79, 0x3a, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x3b, 0x0a, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x2d, 0x64, 0x69, 0x72, 0x65,
0x63, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x63, 0x6f, 0x6c, 0x75, 0x6d,
0x6e, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6a, 0x75, 0x73,
0x74, 0x69, 0x66, 0x79, 0x2d, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74,
0x3a, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x2d, 0x62, 0x65, 0x74, 0x77,
0x65, 0x65, 0x6e, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x68,
0x65, 0x69, 0x67, 0x68, 0x74, 0x3a, 0x20, 0x31, 0x30, 0x30, 0x25, 0x3b,
0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x68, 0x65, 0x61, 0x64, 0x65, 0x72, 0x2c, 0x20, 0x66, 0x6f, 0x6f, 0x74,
0x65, 0x72, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74,
0x65, 0x78, 0x74, 0x2d, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x3a, 0x20, 0x63,
0x65, 0x6e, 0x74, 0x65, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d,
0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x61, 0x69, 0x6e, 0x20, 0x7b,
0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x61, 0x72, 0x67, 0x69,
0x6e, 0x3a, 0x20, 0x33, 0x70, 0x78, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x64, 0x69, 0x73, 0x70, 0x6c, 0x61, 0x79, 0x3a, 0x20, 0x66,
0x6c, 0x65, 0x78, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66,
0x6c, 0x65, 0x78, 0x2d, 0x64, 0x69, 0x72, 0x65, 0x63, 0x74, 0x69, 0x6f,
0x6e, 0x3a, 0x20, 0x63, 0x6f, 0x6c, 0x75, 0x6d, 0x6e, 0x3b, 0x0a, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x6a, 0x75, 0x73, 0x74, 0x69, 0x66, 0x79,
0x2d, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3a, 0x20, 0x73, 0x70,
0x61, 0x63, 0x65, 0x2d, 0x62, 0x65, 0x74, 0x77, 0x65, 0x65, 0x6e, 0x3b,
0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x61, 0x70, 0x3a, 0x20,
0x31, 0x65, 0x6d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x66, 0x6c, 0x65, 0x78, 0x2d, 0x67, 0x72, 0x6f, 0x77, 0x3a, 0x20, 0x31,
0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x76, 0x65, 0x72,
0x66, 0x6c, 0x6f, 0x77, 0x2d, 0x79, 0x3a, 0x20, 0x61, 0x75, 0x74, 0x6f,
0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x6f, 0x72,
0x64, 0x65, 0x72, 0x3a, 0x20, 0x31, 0x70, 0x78, 0x20, 0x73, 0x6f, 0x6c,
0x69, 0x64, 0x20, 0x23, 0x63, 0x63, 0x63, 0x3b, 0x0a, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x62, 0x6f, 0x72, 0x64, 0x65, 0x72, 0x2d, 0x72, 0x61,
0x64, 0x69, 0x75, 0x73, 0x3a, 0x20, 0x35, 0x70, 0x78, 0x3b, 0x0a, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x61, 0x64, 0x64, 0x69, 0x6e, 0x67,
0x3a, 0x20, 0x30, 0x2e, 0x35, 0x65, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20,
0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x62, 0x6f, 0x64, 0x79,
0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x61, 0x78,
0x2d, 0x77, 0x69, 0x64, 0x74, 0x68, 0x3a, 0x20, 0x36, 0x30, 0x30, 0x70,
0x78, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x69, 0x6e,
0x2d, 0x77, 0x69, 0x64, 0x74, 0x68, 0x3a, 0x20, 0x33, 0x30, 0x30, 0x70,
0x78, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x69, 0x6e,
0x65, 0x2d, 0x68, 0x65, 0x69, 0x67, 0x68, 0x74, 0x3a, 0x20, 0x31, 0x2e,
0x32, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x61, 0x72,
0x67, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x20, 0x61, 0x75, 0x74, 0x6f, 0x3b,
0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x61, 0x64, 0x64, 0x69,
0x6e, 0x67, 0x3a, 0x20, 0x30, 0x20, 0x30, 0x2e, 0x35, 0x65, 0x6d, 0x3b,
0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x70, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x76,
0x65, 0x72, 0x66, 0x6c, 0x6f, 0x77, 0x2d, 0x77, 0x72, 0x61, 0x70, 0x3a,
0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x2d, 0x77, 0x6f, 0x72, 0x64, 0x3b,
0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x77, 0x6f, 0x72, 0x64, 0x2d,
0x77, 0x72, 0x61, 0x70, 0x3a, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x2d,
0x77, 0x6f, 0x72, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x68, 0x79, 0x70, 0x68, 0x65, 0x6e, 0x73, 0x3a, 0x20, 0x61, 0x75, 0x74,
0x6f, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x61, 0x72,
0x67, 0x69, 0x6e, 0x2d, 0x74, 0x6f, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x35,
0x65, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x61,
0x72, 0x67, 0x69, 0x6e, 0x2d, 0x62, 0x6f, 0x74, 0x74, 0x6f, 0x6d, 0x3a,
0x20, 0x30, 0x2e, 0x35, 0x65, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x23, 0x77, 0x72, 0x69, 0x74,
0x65, 0x20, 0x66, 0x6f, 0x72, 0x6d, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x6d, 0x61, 0x72, 0x67, 0x69, 0x6e, 0x3a, 0x20, 0x31,
0x65, 0x6d, 0x20, 0x30, 0x20, 0x30, 0x20, 0x30, 0x3b, 0x0a, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x64, 0x69, 0x73, 0x70, 0x6c, 0x61, 0x79, 0x3a,
0x20, 0x66, 0x6c, 0x65, 0x78, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x66, 0x6c, 0x65, 0x78, 0x2d, 0x64, 0x69, 0x72, 0x65, 0x63, 0x74,
0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x63, 0x6f, 0x6c, 0x75, 0x6d, 0x6e, 0x3b,
0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x61, 0x70, 0x3a, 0x20,
0x30, 0x2e, 0x35, 0x65, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x2d, 0x69, 0x74, 0x65, 0x6d, 0x73,
0x3a, 0x20, 0x73, 0x74, 0x72, 0x65, 0x74, 0x63, 0x68, 0x3b, 0x0a, 0x20,
0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72,
0x69, 0x67, 0x68, 0x74, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x64, 0x69, 0x73, 0x70, 0x6c, 0x61, 0x79, 0x3a, 0x20, 0x66, 0x6c,
0x65, 0x78, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c,
0x65, 0x78, 0x2d, 0x64, 0x69, 0x72, 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e,
0x3a, 0x20, 0x72, 0x6f, 0x77, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x67, 0x61, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x35, 0x65, 0x6d, 0x3b,
0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6a, 0x75, 0x73, 0x74, 0x69,
0x66, 0x79, 0x2d, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3a, 0x20,
0x66, 0x6c, 0x65, 0x78, 0x2d, 0x65, 0x6e, 0x64, 0x3b, 0x0a, 0x20, 0x20,
0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x69, 0x65,
0x6c, 0x64, 0x73, 0x65, 0x74, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x62, 0x6f, 0x72, 0x64, 0x65, 0x72, 0x3a, 0x20, 0x6e, 0x6f,
0x6e, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x61,
0x64, 0x64, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x30, 0x3b, 0x0a, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x6d, 0x61, 0x72, 0x67, 0x69, 0x6e, 0x3a, 0x20,
0x30, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20,
0x20, 0x20, 0x74, 0x65, 0x78, 0x74, 0x61, 0x72, 0x65, 0x61, 0x20, 0x7b,
0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x61, 0x64, 0x64, 0x69,
0x6e, 0x67, 0x3a, 0x20, 0x35, 0x70, 0x78, 0x3b, 0x0a, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x2d, 0x67, 0x72, 0x6f, 0x77,
0x3a, 0x20, 0x31, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x77,
0x69, 0x64, 0x74, 0x68, 0x3a, 0x20, 0x31, 0x30, 0x30, 0x25, 0x3b, 0x0a,
0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x70,
0x72, 0x65, 0x20, 0x63, 0x6f, 0x64, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x64, 0x69, 0x73, 0x70, 0x6c, 0x61, 0x79, 0x3a,
0x20, 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x62, 0x61, 0x63, 0x6b, 0x67, 0x72, 0x6f, 0x75, 0x6e, 0x64,
0x2d, 0x63, 0x6f, 0x6c, 0x6f, 0x72, 0x3a, 0x20, 0x23, 0x32, 0x32, 0x32,
0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6c, 0x6f,
0x72, 0x3a, 0x20, 0x23, 0x64, 0x64, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20,
0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x64, 0x65, 0x20,
0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x6e, 0x74,
0x2d, 0x66, 0x61, 0x6d, 0x69, 0x6c, 0x79, 0x3a, 0x20, 0x6d, 0x6f, 0x6e,
0x6f, 0x73, 0x70, 0x61, 0x63, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x70, 0x61, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x30,
0x2e, 0x31, 0x65, 0x6d, 0x20, 0x30, 0x2e, 0x33, 0x65, 0x6d, 0x3b, 0x0a,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x6f, 0x72, 0x64, 0x65, 0x72,
0x2d, 0x72, 0x61, 0x64, 0x69, 0x75, 0x73, 0x3a, 0x20, 0x33, 0x70, 0x78,
0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20,
0x20, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x20, 0x6c, 0x61,
0x62, 0x65, 0x6c, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x6d, 0x61, 0x72, 0x67, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x35, 0x65,
0x6d, 0x20, 0x30, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64,
0x69, 0x73, 0x70, 0x6c, 0x61, 0x79, 0x3a, 0x20, 0x62, 0x6c, 0x6f, 0x63,
0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x3c,
0x2f, 0x73, 0x74, 0x79, 0x6c, 0x65, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x3c,
0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d,
0x22, 0x6d, 0x6f, 0x64, 0x75, 0x6c, 0x65, 0x22, 0x3e, 0x0a, 0x20, 0x20,
0x20, 0x20, 0x69, 0x6d, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x7b, 0x0a, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x2c, 0x20, 0x68,
0x2c, 0x20, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x2c, 0x20, 0x65, 0x66,
0x66, 0x65, 0x63, 0x74, 0x2c, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x75, 0x74,
0x65, 0x64, 0x2c, 0x20, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x2c, 0x20,
0x75, 0x73, 0x65, 0x53, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x2c, 0x20, 0x75,
0x73, 0x65, 0x45, 0x66, 0x66, 0x65, 0x63, 0x74, 0x2c, 0x20, 0x75, 0x73,
0x65, 0x52, 0x65, 0x66, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x66,
0x72, 0x6f, 0x6d, 0x20, 0x27, 0x2f, 0x69, 0x6e, 0x64, 0x65, 0x78, 0x2e,
0x6a, 0x73, 0x27, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d,
0x70, 0x6f, 0x72, 0x74, 0x20, 0x7b, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61,
0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x65, 0x20, 0x7d, 0x20, 0x66,
0x72, 0x6f, 0x6d, 0x20, 0x27, 0x2f, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65,
0x74, 0x69, 0x6f, 0x6e, 0x2e, 0x6a, 0x73, 0x27, 0x3b, 0x0a, 0x0a, 0x20,
0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x73, 0x65, 0x73,
0x73, 0x69, 0x6f, 0x6e, 0x20, 0x3d, 0x20, 0x73, 0x69, 0x67, 0x6e, 0x61,
0x6c, 0x28, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x72,
0x6f, 0x6d, 0x70, 0x74, 0x3a, 0x20, 0x22, 0x54, 0x68, 0x69, 0x73, 0x20,
0x69, 0x73, 0x20, 0x61, 0x20, 0x63, 0x6f, 0x6e, 0x76, 0x65, 0x72, 0x73,
0x61, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x62, 0x65, 0x74, 0x77, 0x65, 0x65,
0x6e, 0x20, 0x75, 0x73, 0x65, 0x72, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x6c,
0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x61, 0x20, 0x66, 0x72, 0x69, 0x65,
0x6e, 0x64, 0x6c, 0x79, 0x20, 0x63, 0x68, 0x61, 0x74, 0x62, 0x6f, 0x74,
0x2e, 0x20, 0x72, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x64, 0x20, 0x69, 0x6e,
0x20, 0x6d, 0x61, 0x72, 0x6b, 0x64, 0x6f, 0x77, 0x6e, 0x2e, 0x22, 0x2c,
0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c,
0x61, 0x74, 0x65, 0x3a, 0x20, 0x22, 0x7b, 0x7b, 0x70, 0x72, 0x6f, 0x6d,
0x70, 0x74, 0x7d, 0x7d, 0x5c, 0x6e, 0x5c, 0x6e, 0x7b, 0x7b, 0x68, 0x69,
0x73, 0x74, 0x6f, 0x72, 0x79, 0x7d, 0x7d, 0x5c, 0x6e, 0x7b, 0x7b, 0x63,
0x68, 0x61, 0x72, 0x7d, 0x7d, 0x3a, 0x22, 0x2c, 0x0a, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x68, 0x69, 0x73, 0x74, 0x6f, 0x72, 0x79, 0x54, 0x65,
0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x3a, 0x20, 0x22, 0x7b, 0x7b, 0x6e,
0x61, 0x6d, 0x65, 0x7d, 0x7d, 0x3a, 0x20, 0x7b, 0x7b, 0x6d, 0x65, 0x73,
0x73, 0x61, 0x67, 0x65, 0x7d, 0x7d, 0x22, 0x2c, 0x0a, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70,
0x74, 0x3a, 0x20, 0x5b, 0x5d, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x74, 0x79, 0x70, 0x65, 0x3a, 0x20, 0x22, 0x63, 0x68, 0x61, 0x74,
0x22, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x68, 0x61,
0x72, 0x3a, 0x20, 0x22, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x22, 0x2c, 0x0a,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x75, 0x73, 0x65, 0x72, 0x3a, 0x20,
0x22, 0x55, 0x73, 0x65, 0x72, 0x22, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x7d, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73,
0x74, 0x20, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74,
0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x74, 0x72,
0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x29, 0x20, 0x3d, 0x3e,
0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x65, 0x73,
0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d,
0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e,
0x2e, 0x2e, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61,
0x6c, 0x75, 0x65, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x0a,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
0x20, 0x63, 0x68, 0x61, 0x74, 0x53, 0x74, 0x61, 0x72, 0x74, 0x65, 0x64,
0x20, 0x3d, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x75, 0x74, 0x65, 0x64, 0x28,
0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f,
0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x72, 0x61, 0x6e,
0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74,
0x68, 0x20, 0x3e, 0x20, 0x30, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73,
0x20, 0x3d, 0x20, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x28, 0x7b, 0x0a,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x5f, 0x70, 0x72, 0x65, 0x64,
0x69, 0x63, 0x74, 0x3a, 0x20, 0x34, 0x30, 0x30, 0x2c, 0x0a, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x65, 0x72, 0x61, 0x74,
0x75, 0x72, 0x65, 0x3a, 0x20, 0x30, 0x2e, 0x37, 0x2c, 0x0a, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x5f, 0x6c,
0x61, 0x73, 0x74, 0x5f, 0x6e, 0x3a, 0x20, 0x32, 0x35, 0x36, 0x2c, 0x0a,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74,
0x5f, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x3a, 0x20, 0x31, 0x2e,
0x31, 0x38, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x6f,
0x70, 0x5f, 0x6b, 0x3a, 0x20, 0x34, 0x30, 0x2c, 0x0a, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x74, 0x6f, 0x70, 0x5f, 0x70, 0x3a, 0x20, 0x30, 0x2e,
0x35, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x0a, 0x20,
0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x6f, 0x6e,
0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x73, 0x69,
0x67, 0x6e, 0x61, 0x6c, 0x28, 0x6e, 0x75, 0x6c, 0x6c, 0x29, 0x0a, 0x20,
0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x67, 0x65, 0x6e,
0x65, 0x72, 0x61, 0x74, 0x69, 0x6e, 0x67, 0x20, 0x3d, 0x20, 0x63, 0x6f,
0x6d, 0x70, 0x75, 0x74, 0x65, 0x64, 0x28, 0x28, 0x29, 0x20, 0x3d, 0x3e,
0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e,
0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x3d, 0x20, 0x6e, 0x75, 0x6c,
0x6c, 0x20, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20,
0x73, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c,
0x61, 0x74, 0x65, 0x20, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x0a,
0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x74, 0x65,
0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x73, 0x74,
0x72, 0x2c, 0x20, 0x65, 0x78, 0x74, 0x72, 0x61, 0x53, 0x65, 0x74, 0x74,
0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x73, 0x65, 0x74,
0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x3d, 0x20, 0x73, 0x65, 0x73, 0x73,
0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3b, 0x0a, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x65, 0x78, 0x74,
0x72, 0x61, 0x53, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20,
0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x65,
0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x3d, 0x20, 0x7b, 0x20, 0x2e,
0x2e, 0x2e, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x2c, 0x20,
0x2e, 0x2e, 0x2e, 0x65, 0x78, 0x74, 0x72, 0x61, 0x53, 0x65, 0x74, 0x74,
0x69, 0x6e, 0x67, 0x73, 0x20, 0x7d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65,
0x74, 0x75, 0x72, 0x6e, 0x20, 0x53, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x28,
0x73, 0x74, 0x72, 0x29, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65,
0x41, 0x6c, 0x6c, 0x28, 0x2f, 0x5c, 0x7b, 0x5c, 0x7b, 0x28, 0x2e, 0x2a,
0x3f, 0x29, 0x5c, 0x7d, 0x5c, 0x7d, 0x2f, 0x67, 0x2c, 0x20, 0x28, 0x5f,
0x2c, 0x20, 0x6b, 0x65, 0x79, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x74, 0x65,
0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x28, 0x73, 0x65, 0x74, 0x74, 0x69,
0x6e, 0x67, 0x73, 0x5b, 0x6b, 0x65, 0x79, 0x5d, 0x29, 0x29, 0x3b, 0x0a,
0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f,
0x2f, 0x20, 0x73, 0x65, 0x6e, 0x64, 0x20, 0x6d, 0x65, 0x73, 0x73, 0x61,
0x67, 0x65, 0x20, 0x74, 0x6f, 0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72,
0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63,
0x68, 0x61, 0x74, 0x20, 0x3d, 0x20, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20,
0x28, 0x6d, 0x73, 0x67, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x6f, 0x6e,
0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x76, 0x61, 0x6c, 0x75,
0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x6c, 0x6f, 0x67,
0x28, 0x27, 0x61, 0x6c, 0x72, 0x65, 0x61, 0x64, 0x79, 0x20, 0x72, 0x75,
0x6e, 0x6e, 0x69, 0x6e, 0x67, 0x2e, 0x2e, 0x2e, 0x27, 0x29, 0x3b, 0x0a,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75,
0x72, 0x6e, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f,
0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d,
0x20, 0x6e, 0x65, 0x77, 0x20, 0x41, 0x62, 0x6f, 0x72, 0x74, 0x43, 0x6f,
0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x28, 0x29, 0x3b, 0x0a,
0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x72, 0x61, 0x6e, 0x73,
0x63, 0x72, 0x69, 0x70, 0x74, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x28,
0x5b, 0x2e, 0x2e, 0x2e, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e,
0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63,
0x72, 0x69, 0x70, 0x74, 0x2c, 0x20, 0x5b, 0x22, 0x7b, 0x7b, 0x75, 0x73,
0x65, 0x72, 0x7d, 0x7d, 0x22, 0x2c, 0x20, 0x6d, 0x73, 0x67, 0x5d, 0x5d,
0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e,
0x73, 0x74, 0x20, 0x70, 0x61, 0x79, 0x6c, 0x6f, 0x61, 0x64, 0x20, 0x3d,
0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x28, 0x73, 0x65,
0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e,
0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x2c, 0x20, 0x7b, 0x0a,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x65, 0x73, 0x73,
0x61, 0x67, 0x65, 0x3a, 0x20, 0x6d, 0x73, 0x67, 0x2c, 0x0a, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x68, 0x69, 0x73, 0x74, 0x6f, 0x72,
0x79, 0x3a, 0x20, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76,
0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72,
0x69, 0x70, 0x74, 0x2e, 0x66, 0x6c, 0x61, 0x74, 0x4d, 0x61, 0x70, 0x28,
0x28, 0x5b, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x6d, 0x65, 0x73, 0x73,
0x61, 0x67, 0x65, 0x5d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x74, 0x65, 0x6d,
0x70, 0x6c, 0x61, 0x74, 0x65, 0x28, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f,
0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x68, 0x69, 0x73, 0x74,
0x6f, 0x72, 0x79, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x2c,
0x20, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x6d, 0x65, 0x73, 0x73,
0x61, 0x67, 0x65, 0x7d, 0x29, 0x29, 0x2e, 0x6a, 0x6f, 0x69, 0x6e, 0x28,
0x22, 0x5c, 0x6e, 0x22, 0x29, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x6c, 0x65, 0x74, 0x20, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x4d,
0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x20, 0x3d, 0x20, 0x27, 0x27, 0x3b,
0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
0x20, 0x68, 0x69, 0x73, 0x74, 0x6f, 0x72, 0x79, 0x20, 0x3d, 0x20, 0x73,
0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65,
0x2e, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x0a,
0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73,
0x20, 0x3d, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x2e, 0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76,
0x61, 0x6c, 0x75, 0x65, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x3a, 0x20, 0x70, 0x61,
0x79, 0x6c, 0x6f, 0x61, 0x64, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x73, 0x74, 0x6f, 0x70, 0x3a, 0x20, 0x5b, 0x22, 0x3c,
0x2f, 0x73, 0x3e, 0x22, 0x2c, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61,
0x74, 0x65, 0x28, 0x22, 0x7b, 0x7b, 0x63, 0x68, 0x61, 0x72, 0x7d, 0x7d,
0x3a, 0x22, 0x29, 0x2c, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74,
0x65, 0x28, 0x22, 0x7b, 0x7b, 0x75, 0x73, 0x65, 0x72, 0x7d, 0x7d, 0x3a,
0x22, 0x29, 0x5d, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d,
0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x77, 0x61, 0x69,
0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x43, 0x6f, 0x6d, 0x70, 0x6c,
0x65, 0x74, 0x65, 0x28, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x50, 0x61, 0x72,
0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c,
0x6c, 0x65, 0x72, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x20, 0x28,
0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x29, 0x20, 0x3d, 0x3e, 0x20,
0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f,
0x6e, 0x73, 0x74, 0x20, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x6d,
0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x3b,
0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x75, 0x72,
0x72, 0x65, 0x6e, 0x74, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x20,
0x2b, 0x3d, 0x20, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74,
0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x2f, 0x2f, 0x20, 0x72, 0x65, 0x6d, 0x6f, 0x76, 0x65, 0x20, 0x6c,
0x65, 0x61, 0x64, 0x69, 0x6e, 0x67, 0x20, 0x77, 0x68, 0x69, 0x74, 0x65,
0x73, 0x70, 0x61, 0x63, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x4d, 0x65, 0x73,
0x73, 0x61, 0x67, 0x65, 0x20, 0x3d, 0x20, 0x63, 0x75, 0x72, 0x72, 0x65,
0x6e, 0x74, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x2e, 0x72, 0x65,
0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5e, 0x5c, 0x73, 0x2b, 0x2f,
0x2c, 0x20, 0x22, 0x22, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70,
0x74, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x28, 0x5b, 0x2e, 0x2e, 0x2e,
0x68, 0x69, 0x73, 0x74, 0x6f, 0x72, 0x79, 0x2c, 0x20, 0x5b, 0x22, 0x7b,
0x7b, 0x63, 0x68, 0x61, 0x72, 0x7d, 0x7d, 0x22, 0x2c, 0x20, 0x63, 0x75,
0x72, 0x72, 0x65, 0x6e, 0x74, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65,
0x5d, 0x5d, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x69, 0x66, 0x20, 0x28, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x73, 0x74,
0x6f, 0x70, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e,
0x6c, 0x6f, 0x67, 0x28, 0x22, 0x2d, 0x2d, 0x3e, 0x22, 0x2c, 0x20, 0x64,
0x61, 0x74, 0x61, 0x2c, 0x20, 0x27, 0x20, 0x72, 0x65, 0x73, 0x70, 0x6f,
0x6e, 0x73, 0x65, 0x20, 0x77, 0x61, 0x73, 0x3a, 0x27, 0x2c, 0x20, 0x63,
0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67,
0x65, 0x2c, 0x20, 0x27, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69,
0x70, 0x74, 0x20, 0x73, 0x74, 0x61, 0x74, 0x65, 0x3a, 0x27, 0x2c, 0x20,
0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75,
0x65, 0x2e, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74,
0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d,
0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x0a, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c,
0x6c, 0x65, 0x72, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20,
0x6e, 0x75, 0x6c, 0x6c, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a,
0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f,
0x6e, 0x20, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x49, 0x6e, 0x70,
0x75, 0x74, 0x28, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6d, 0x65, 0x73, 0x73, 0x61,
0x67, 0x65, 0x20, 0x3d, 0x20, 0x75, 0x73, 0x65, 0x53, 0x69, 0x67, 0x6e,
0x61, 0x6c, 0x28, 0x22, 0x22, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x73, 0x74, 0x6f, 0x70,
0x20, 0x3d, 0x20, 0x28, 0x65, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x2e, 0x70, 0x72,
0x65, 0x76, 0x65, 0x6e, 0x74, 0x44, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74,
0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x69, 0x66, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c,
0x65, 0x72, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, 0x20, 0x7b, 0x0a,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f,
0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x76, 0x61, 0x6c,
0x75, 0x65, 0x2e, 0x61, 0x62, 0x6f, 0x72, 0x74, 0x28, 0x29, 0x3b, 0x0a,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f,
0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x76, 0x61, 0x6c,
0x75, 0x65, 0x20, 0x3d, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x3b, 0x0a, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x73, 0x65, 0x74, 0x20,
0x3d, 0x20, 0x28, 0x65, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x6f, 0x70, 0x28,
0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x55, 0x70,
0x64, 0x61, 0x74, 0x65, 0x28, 0x5b, 0x5d, 0x29, 0x3b, 0x0a, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x73, 0x75, 0x62, 0x6d, 0x69,
0x74, 0x20, 0x3d, 0x20, 0x28, 0x65, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b,
0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x6f,
0x70, 0x28, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x63, 0x68, 0x61, 0x74, 0x28, 0x6d, 0x65, 0x73, 0x73, 0x61,
0x67, 0x65, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, 0x3b, 0x0a, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x65, 0x73, 0x73, 0x61,
0x67, 0x65, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x22,
0x22, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
0x65, 0x6e, 0x74, 0x65, 0x72, 0x53, 0x75, 0x62, 0x6d, 0x69, 0x74, 0x73,
0x20, 0x3d, 0x20, 0x28, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x29, 0x20, 0x3d,
0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x69, 0x66, 0x20, 0x28, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x68,
0x69, 0x63, 0x68, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x31, 0x33, 0x20, 0x26,
0x26, 0x20, 0x21, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x2e, 0x73, 0x68, 0x69,
0x66, 0x74, 0x4b, 0x65, 0x79, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x75, 0x62, 0x6d, 0x69,
0x74, 0x28, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x29, 0x3b, 0x0a, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72,
0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x6f, 0x72,
0x6d, 0x20, 0x6f, 0x6e, 0x73, 0x75, 0x62, 0x6d, 0x69, 0x74, 0x3d, 0x24,
0x7b, 0x73, 0x75, 0x62, 0x6d, 0x69, 0x74, 0x7d, 0x3e, 0x0a, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76,
0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x3c, 0x74, 0x65, 0x78, 0x74, 0x61, 0x72, 0x65, 0x61, 0x20, 0x74, 0x79,
0x70, 0x65, 0x3d, 0x22, 0x74, 0x65, 0x78, 0x74, 0x22, 0x20, 0x72, 0x6f,
0x77, 0x73, 0x3d, 0x32, 0x20, 0x6f, 0x6e, 0x6b, 0x65, 0x79, 0x70, 0x72,
0x65, 0x73, 0x73, 0x3d, 0x24, 0x7b, 0x65, 0x6e, 0x74, 0x65, 0x72, 0x53,
0x75, 0x62, 0x6d, 0x69, 0x74, 0x73, 0x7d, 0x20, 0x76, 0x61, 0x6c, 0x75,
0x65, 0x3d, 0x22, 0x24, 0x7b, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65,
0x7d, 0x22, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24,
0x7b, 0x28, 0x65, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x6d, 0x65, 0x73, 0x73,
0x61, 0x67, 0x65, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20,
0x65, 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x76, 0x61, 0x6c,
0x75, 0x65, 0x7d, 0x20, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x68, 0x6f, 0x6c,
0x64, 0x65, 0x72, 0x3d, 0x22, 0x53, 0x61, 0x79, 0x20, 0x73, 0x6f, 0x6d,
0x65, 0x74, 0x68, 0x69, 0x6e, 0x67, 0x2e, 0x2e, 0x2e, 0x22, 0x2f, 0x3e,
0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x20, 0x63, 0x6c,
0x61, 0x73, 0x73, 0x3d, 0x22, 0x72, 0x69, 0x67, 0x68, 0x74, 0x22, 0x3e,
0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c,
0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d,
0x22, 0x73, 0x75, 0x62, 0x6d, 0x69, 0x74, 0x22, 0x20, 0x64, 0x69, 0x73,
0x61, 0x62, 0x6c, 0x65, 0x64, 0x3d, 0x24, 0x7b, 0x21, 0x67, 0x65, 0x6e,
0x65, 0x72, 0x61, 0x74, 0x69, 0x6e, 0x67, 0x2e, 0x76, 0x61, 0x6c, 0x75,
0x65, 0x7d, 0x20, 0x3e, 0x53, 0x65, 0x6e, 0x64, 0x3c, 0x2f, 0x62, 0x75,
0x74, 0x74, 0x6f, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x3c, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x20,
0x6f, 0x6e, 0x63, 0x6c, 0x69, 0x63, 0x6b, 0x3d, 0x24, 0x7b, 0x73, 0x74,
0x6f, 0x70, 0x7d, 0x20, 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64,
0x3d, 0x24, 0x7b, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6e,
0x67, 0x7d, 0x3e, 0x53, 0x74, 0x6f, 0x70, 0x3c, 0x2f, 0x62, 0x75, 0x74,
0x74, 0x6f, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x3c, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x20, 0x6f,
0x6e, 0x63, 0x6c, 0x69, 0x63, 0x6b, 0x3d, 0x24, 0x7b, 0x72, 0x65, 0x73,
0x65, 0x74, 0x7d, 0x3e, 0x52, 0x65, 0x73, 0x65, 0x74, 0x3c, 0x2f, 0x62,
0x75, 0x74, 0x74, 0x6f, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x6f,
0x72, 0x6d, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a,
0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63,
0x6f, 0x6e, 0x73, 0x74, 0x20, 0x43, 0x68, 0x61, 0x74, 0x4c, 0x6f, 0x67,
0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x29, 0x20, 0x3d,
0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f,
0x6e, 0x73, 0x74, 0x20, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x73,
0x20, 0x3d, 0x20, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76,
0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72,
0x69, 0x70, 0x74, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63,
0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e,
0x65, 0x72, 0x20, 0x3d, 0x20, 0x75, 0x73, 0x65, 0x52, 0x65, 0x66, 0x28,
0x6e, 0x75, 0x6c, 0x6c, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x75, 0x73, 0x65, 0x45, 0x66, 0x66, 0x65, 0x63, 0x74, 0x28, 0x28,
0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x73, 0x63, 0x72, 0x6f, 0x6c, 0x6c,
0x20, 0x74, 0x6f, 0x20, 0x62, 0x6f, 0x74, 0x74, 0x6f, 0x6d, 0x20, 0x28,
0x69, 0x66, 0x20, 0x6e, 0x65, 0x65, 0x64, 0x65, 0x64, 0x29, 0x0a, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63,
0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x2e, 0x63, 0x75, 0x72,
0x72, 0x65, 0x6e, 0x74, 0x20, 0x26, 0x26, 0x20, 0x63, 0x6f, 0x6e, 0x74,
0x61, 0x69, 0x6e, 0x65, 0x72, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e,
0x74, 0x2e, 0x73, 0x63, 0x72, 0x6f, 0x6c, 0x6c, 0x48, 0x65, 0x69, 0x67,
0x68, 0x74, 0x20, 0x3c, 0x3d, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69,
0x6e, 0x65, 0x72, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x2e,
0x73, 0x63, 0x72, 0x6f, 0x6c, 0x6c, 0x54, 0x6f, 0x70, 0x20, 0x2b, 0x20,
0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x2e, 0x63, 0x75,
0x72, 0x72, 0x65, 0x6e, 0x74, 0x2e, 0x6f, 0x66, 0x66, 0x73, 0x65, 0x74,
0x48, 0x65, 0x69, 0x67, 0x68, 0x74, 0x20, 0x2b, 0x20, 0x33, 0x30, 0x30,
0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x2e,
0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x2e, 0x73, 0x63, 0x72, 0x6f,
0x6c, 0x6c, 0x54, 0x6f, 0x28, 0x30, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x74,
0x61, 0x69, 0x6e, 0x65, 0x72, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e,
0x74, 0x2e, 0x73, 0x63, 0x72, 0x6f, 0x6c, 0x6c, 0x48, 0x65, 0x69, 0x67,
0x68, 0x74, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x2c, 0x20, 0x5b,
0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x73, 0x5d, 0x29, 0x0a, 0x0a,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
0x63, 0x68, 0x61, 0x74, 0x4c, 0x69, 0x6e, 0x65, 0x20, 0x3d, 0x20, 0x28,
0x5b, 0x75, 0x73, 0x65, 0x72, 0x2c, 0x20, 0x6d, 0x73, 0x67, 0x5d, 0x29,
0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d,
0x6c, 0x60, 0x3c, 0x70, 0x20, 0x6b, 0x65, 0x79, 0x3d, 0x24, 0x7b, 0x6d,
0x73, 0x67, 0x7d, 0x3e, 0x3c, 0x73, 0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e,
0x24, 0x7b, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x28, 0x75,
0x73, 0x65, 0x72, 0x29, 0x7d, 0x3a, 0x3c, 0x2f, 0x73, 0x74, 0x72, 0x6f,
0x6e, 0x67, 0x3e, 0x20, 0x3c, 0x24, 0x7b, 0x4d, 0x61, 0x72, 0x6b, 0x64,
0x6f, 0x77, 0x6e, 0x7d, 0x20, 0x74, 0x65, 0x78, 0x74, 0x3d, 0x24, 0x7b,
0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x28, 0x6d, 0x73, 0x67,
0x29, 0x7d, 0x20, 0x2f, 0x3e, 0x3c, 0x2f, 0x70, 0x3e, 0x60, 0x0a, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74,
0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x3c, 0x73, 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x69, 0x64, 0x3d,
0x22, 0x63, 0x68, 0x61, 0x74, 0x22, 0x20, 0x72, 0x65, 0x66, 0x3d, 0x24,
0x7b, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x7d, 0x3e,
0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24,
0x7b, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x73, 0x2e, 0x66, 0x6c,
0x61, 0x74, 0x4d, 0x61, 0x70, 0x28, 0x63, 0x68, 0x61, 0x74, 0x4c, 0x69,
0x6e, 0x65, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x3c, 0x2f, 0x73, 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x3e, 0x60,
0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20,
0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x43, 0x6f, 0x6e, 0x66,
0x69, 0x67, 0x46, 0x6f, 0x72, 0x6d, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72,
0x6f, 0x70, 0x73, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x75, 0x70,
0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x20,
0x3d, 0x20, 0x28, 0x65, 0x6c, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x73, 0x65,
0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20,
0x3d, 0x20, 0x7b, 0x20, 0x2e, 0x2e, 0x2e, 0x73, 0x65, 0x73, 0x73, 0x69,
0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x20, 0x5b, 0x65,
0x6c, 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x6e, 0x61, 0x6d,
0x65, 0x5d, 0x3a, 0x20, 0x65, 0x6c, 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65,
0x74, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x7d, 0x0a, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x75, 0x70,
0x64, 0x61, 0x74, 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d,
0x20, 0x28, 0x65, 0x6c, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x70, 0x61, 0x72,
0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20,
0x7b, 0x20, 0x2e, 0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e,
0x76, 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x20, 0x5b, 0x65, 0x6c, 0x2e, 0x74,
0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x5d, 0x3a,
0x20, 0x65, 0x6c, 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x76,
0x61, 0x6c, 0x75, 0x65, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x75, 0x70, 0x64, 0x61, 0x74,
0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x46, 0x6c, 0x6f, 0x61, 0x74,
0x20, 0x3d, 0x20, 0x28, 0x65, 0x6c, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x70,
0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20,
0x3d, 0x20, 0x7b, 0x20, 0x2e, 0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d,
0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x20, 0x5b, 0x65, 0x6c,
0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x6e, 0x61, 0x6d, 0x65,
0x5d, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x73, 0x65, 0x46, 0x6c, 0x6f, 0x61,
0x74, 0x28, 0x65, 0x6c, 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e,
0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68,
0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x3c, 0x66, 0x6f, 0x72, 0x6d, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x69, 0x65, 0x6c, 0x64,
0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d,
0x22, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x22, 0x3e, 0x50, 0x72, 0x6f,
0x6d, 0x70, 0x74, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x3c, 0x74, 0x65, 0x78, 0x74, 0x61, 0x72, 0x65, 0x61, 0x20,
0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x74, 0x65, 0x78, 0x74, 0x22, 0x20,
0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74,
0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x73,
0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65,
0x2e, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x7d, 0x22, 0x20, 0x72, 0x6f,
0x77, 0x73, 0x3d, 0x34, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74,
0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73,
0x73, 0x69, 0x6f, 0x6e, 0x7d, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69,
0x76, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22,
0x75, 0x73, 0x65, 0x72, 0x22, 0x3e, 0x55, 0x73, 0x65, 0x72, 0x20, 0x6e,
0x61, 0x6d, 0x65, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, 0x70,
0x65, 0x3d, 0x22, 0x74, 0x65, 0x78, 0x74, 0x22, 0x20, 0x6e, 0x61, 0x6d,
0x65, 0x3d, 0x22, 0x75, 0x73, 0x65, 0x72, 0x22, 0x20, 0x76, 0x61, 0x6c,
0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f,
0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x75, 0x73, 0x65, 0x72,
0x7d, 0x22, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24,
0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, 0x69,
0x6f, 0x6e, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76,
0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c,
0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x62,
0x6f, 0x74, 0x22, 0x3e, 0x42, 0x6f, 0x74, 0x20, 0x6e, 0x61, 0x6d, 0x65,
0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c,
0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22,
0x74, 0x65, 0x78, 0x74, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22,
0x63, 0x68, 0x61, 0x72, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d,
0x22, 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76,
0x61, 0x6c, 0x75, 0x65, 0x2e, 0x63, 0x68, 0x61, 0x72, 0x7d, 0x22, 0x20,
0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70,
0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x7d,
0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x0a,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62,
0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x74, 0x65, 0x6d, 0x70,
0x6c, 0x61, 0x74, 0x65, 0x22, 0x3e, 0x50, 0x72, 0x6f, 0x6d, 0x70, 0x74,
0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x3c, 0x2f, 0x6c,
0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x74, 0x65, 0x78,
0x74, 0x61, 0x72, 0x65, 0x61, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x74, 0x65,
0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65,
0x3d, 0x22, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x22, 0x20,
0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x73, 0x65, 0x73,
0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74,
0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x7d, 0x22, 0x20, 0x72, 0x6f,
0x77, 0x73, 0x3d, 0x34, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74,
0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73,
0x73, 0x69, 0x6f, 0x6e, 0x7d, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69,
0x76, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22,
0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x22, 0x3e, 0x43, 0x68,
0x61, 0x74, 0x20, 0x68, 0x69, 0x73, 0x74, 0x6f, 0x72, 0x79, 0x20, 0x74,
0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x3c, 0x2f, 0x6c, 0x61, 0x62,
0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x74, 0x65, 0x78, 0x74, 0x61,
0x72, 0x65, 0x61, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x74, 0x65, 0x6d, 0x70,
0x6c, 0x61, 0x74, 0x65, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22,
0x68, 0x69, 0x73, 0x74, 0x6f, 0x72, 0x79, 0x54, 0x65, 0x6d, 0x70, 0x6c,
0x61, 0x74, 0x65, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22,
0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61,
0x6c, 0x75, 0x65, 0x2e, 0x68, 0x69, 0x73, 0x74, 0x6f, 0x72, 0x79, 0x54,
0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x7d, 0x22, 0x20, 0x72, 0x6f,
0x77, 0x73, 0x3d, 0x31, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74,
0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73,
0x73, 0x69, 0x6f, 0x6e, 0x7d, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69,
0x76, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22,
0x74, 0x65, 0x6d, 0x70, 0x65, 0x72, 0x61, 0x74, 0x75, 0x72, 0x65, 0x22,
0x3e, 0x54, 0x65, 0x6d, 0x70, 0x65, 0x72, 0x61, 0x74, 0x75, 0x72, 0x65,
0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c,
0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22,
0x72, 0x61, 0x6e, 0x67, 0x65, 0x22, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x74,
0x65, 0x6d, 0x70, 0x65, 0x72, 0x61, 0x74, 0x75, 0x72, 0x65, 0x22, 0x20,
0x6d, 0x69, 0x6e, 0x3d, 0x22, 0x30, 0x2e, 0x30, 0x22, 0x20, 0x6d, 0x61,
0x78, 0x3d, 0x22, 0x31, 0x2e, 0x30, 0x22, 0x20, 0x73, 0x74, 0x65, 0x70,
0x3d, 0x22, 0x30, 0x2e, 0x30, 0x31, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65,
0x3d, 0x22, 0x74, 0x65, 0x6d, 0x70, 0x65, 0x72, 0x61, 0x74, 0x75, 0x72,
0x65, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b,
0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65,
0x2e, 0x74, 0x65, 0x6d, 0x70, 0x65, 0x72, 0x61, 0x74, 0x75, 0x72, 0x65,
0x7d, 0x22, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24,
0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d,
0x73, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x24, 0x7b, 0x70, 0x61, 0x72,
0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x65,
0x6d, 0x70, 0x65, 0x72, 0x61, 0x74, 0x75, 0x72, 0x65, 0x7d, 0x3c, 0x2f,
0x73, 0x70, 0x61, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e,
0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c,
0x61, 0x62, 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x6e, 0x50,
0x72, 0x65, 0x64, 0x69, 0x63, 0x74, 0x22, 0x3e, 0x50, 0x72, 0x65, 0x64,
0x69, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x3c, 0x2f, 0x6c, 0x61, 0x62,
0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74,
0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x72, 0x61, 0x6e, 0x67, 0x65,
0x22, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x6e, 0x50, 0x72, 0x65, 0x64, 0x69,
0x63, 0x74, 0x22, 0x20, 0x6d, 0x69, 0x6e, 0x3d, 0x22, 0x31, 0x22, 0x20,
0x6d, 0x61, 0x78, 0x3d, 0x22, 0x32, 0x30, 0x34, 0x38, 0x22, 0x20, 0x73,
0x74, 0x65, 0x70, 0x3d, 0x22, 0x31, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65,
0x3d, 0x22, 0x6e, 0x5f, 0x70, 0x72, 0x65, 0x64, 0x69, 0x63, 0x74, 0x22,
0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x70, 0x61,
0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6e,
0x5f, 0x70, 0x72, 0x65, 0x64, 0x69, 0x63, 0x74, 0x7d, 0x22, 0x20, 0x6f,
0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64,
0x61, 0x74, 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x46, 0x6c, 0x6f,
0x61, 0x74, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x73, 0x70,
0x61, 0x6e, 0x3e, 0x24, 0x7b, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e,
0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6e, 0x5f, 0x70, 0x72, 0x65, 0x64,
0x69, 0x63, 0x74, 0x7d, 0x3c, 0x2f, 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x0a,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76,
0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x66,
0x6f, 0x72, 0x3d, 0x22, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x5f, 0x70,
0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x22, 0x3e, 0x50, 0x65, 0x6e, 0x61,
0x6c, 0x69, 0x7a, 0x65, 0x20, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x20,
0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x63, 0x65, 0x3c, 0x2f, 0x6c, 0x61,
0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x69, 0x6e, 0x70, 0x75,
0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x72, 0x61, 0x6e, 0x67,
0x65, 0x22, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x72, 0x65, 0x70, 0x65, 0x61,
0x74, 0x5f, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x22, 0x20, 0x6d,
0x69, 0x6e, 0x3d, 0x22, 0x30, 0x2e, 0x30, 0x22, 0x20, 0x6d, 0x61, 0x78,
0x3d, 0x22, 0x32, 0x2e, 0x30, 0x22, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3d,
0x22, 0x30, 0x2e, 0x30, 0x31, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d,
0x22, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x5f, 0x70, 0x65, 0x6e, 0x61,
0x6c, 0x74, 0x79, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22,
0x24, 0x7b, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c,
0x75, 0x65, 0x2e, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x5f, 0x70, 0x65,
0x6e, 0x61, 0x6c, 0x74, 0x79, 0x7d, 0x22, 0x20, 0x6f, 0x6e, 0x69, 0x6e,
0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65,
0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x7d,
0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x3e,
0x24, 0x7b, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c,
0x75, 0x65, 0x2e, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x5f, 0x70, 0x65,
0x6e, 0x61, 0x6c, 0x74, 0x79, 0x7d, 0x3c, 0x2f, 0x73, 0x70, 0x61, 0x6e,
0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x0a, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64,
0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c,
0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74,
0x5f, 0x6c, 0x61, 0x73, 0x74, 0x5f, 0x6e, 0x22, 0x3e, 0x43, 0x6f, 0x6e,
0x73, 0x69, 0x64, 0x65, 0x72, 0x20, 0x4e, 0x20, 0x74, 0x6f, 0x6b, 0x65,
0x6e, 0x73, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x70, 0x65, 0x6e, 0x61, 0x6c,
0x69, 0x7a, 0x65, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, 0x70,
0x65, 0x3d, 0x22, 0x72, 0x61, 0x6e, 0x67, 0x65, 0x22, 0x20, 0x69, 0x64,
0x3d, 0x22, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x5f, 0x6c, 0x61, 0x73,
0x74, 0x5f, 0x6e, 0x22, 0x20, 0x6d, 0x69, 0x6e, 0x3d, 0x22, 0x30, 0x2e,
0x30, 0x22, 0x20, 0x6d, 0x61, 0x78, 0x3d, 0x22, 0x32, 0x30, 0x34, 0x38,
0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x72, 0x65, 0x70, 0x65,
0x61, 0x74, 0x5f, 0x6c, 0x61, 0x73, 0x74, 0x5f, 0x6e, 0x22, 0x20, 0x76,
0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x70, 0x61, 0x72, 0x61,
0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x72, 0x65, 0x70,
0x65, 0x61, 0x74, 0x5f, 0x6c, 0x61, 0x73, 0x74, 0x5f, 0x6e, 0x7d, 0x22,
0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75,
0x70, 0x64, 0x61, 0x74, 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x46,
0x6c, 0x6f, 0x61, 0x74, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c,
0x73, 0x70, 0x61, 0x6e, 0x3e, 0x24, 0x7b, 0x70, 0x61, 0x72, 0x61, 0x6d,
0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x72, 0x65, 0x70, 0x65,
0x61, 0x74, 0x5f, 0x6c, 0x61, 0x73, 0x74, 0x5f, 0x6e, 0x7d, 0x3c, 0x2f,
0x73, 0x70, 0x61, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e,
0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x3c, 0x2f, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x6f,
0x72, 0x6d, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a,
0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
0x4d, 0x61, 0x72, 0x6b, 0x64, 0x6f, 0x77, 0x6e, 0x20, 0x3d, 0x20, 0x28,
0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b,
0x0a, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6d, 0x64, 0x20,
0x3d, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x74, 0x65, 0x78,
0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61,
0x63, 0x65, 0x28, 0x2f, 0x5e, 0x23, 0x7b, 0x31, 0x2c, 0x36, 0x7d, 0x20,
0x28, 0x2e, 0x2a, 0x29, 0x24, 0x2f, 0x67, 0x69, 0x6d, 0x2c, 0x20, 0x27,
0x3c, 0x68, 0x33, 0x3e, 0x24, 0x31, 0x3c, 0x2f, 0x68, 0x33, 0x3e, 0x27,
0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61,
0x63, 0x65, 0x28, 0x2f, 0x5c, 0x2a, 0x5c, 0x2a, 0x28, 0x2e, 0x2a, 0x3f,
0x29, 0x5c, 0x2a, 0x5c, 0x2a, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x3c, 0x73,
0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x24, 0x31, 0x3c, 0x2f, 0x73, 0x74,
0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5f, 0x5f,
0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x5f, 0x5f, 0x2f, 0x67, 0x2c, 0x20, 0x27,
0x3c, 0x73, 0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x24, 0x31, 0x3c, 0x2f,
0x73, 0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20,
0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f,
0x5c, 0x2a, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x5c, 0x2a, 0x2f, 0x67, 0x2c,
0x20, 0x27, 0x3c, 0x65, 0x6d, 0x3e, 0x24, 0x31, 0x3c, 0x2f, 0x65, 0x6d,
0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70,
0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5f, 0x28, 0x2e, 0x2a, 0x3f, 0x29,
0x5f, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x3c, 0x65, 0x6d, 0x3e, 0x24, 0x31,
0x3c, 0x2f, 0x65, 0x6d, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x60, 0x60,
0x60, 0x2e, 0x2a, 0x3f, 0x5c, 0x6e, 0x28, 0x5b, 0x5c, 0x73, 0x5c, 0x53,
0x5d, 0x2a, 0x3f, 0x29, 0x60, 0x60, 0x60, 0x2f, 0x67, 0x2c, 0x20, 0x27,
0x3c, 0x70, 0x72, 0x65, 0x3e, 0x3c, 0x63, 0x6f, 0x64, 0x65, 0x3e, 0x24,
0x31, 0x3c, 0x2f, 0x63, 0x6f, 0x64, 0x65, 0x3e, 0x3c, 0x2f, 0x70, 0x72,
0x65, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65,
0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x60, 0x28, 0x2e, 0x2a, 0x3f,
0x29, 0x60, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x3c, 0x63, 0x6f, 0x64, 0x65,
0x3e, 0x24, 0x31, 0x3c, 0x2f, 0x63, 0x6f, 0x64, 0x65, 0x3e, 0x27, 0x29,
0x0a, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63,
0x65, 0x28, 0x2f, 0x5c, 0x6e, 0x2f, 0x67, 0x69, 0x6d, 0x2c, 0x20, 0x27,
0x3c, 0x62, 0x72, 0x20, 0x2f, 0x3e, 0x27, 0x29, 0x3b, 0x0a, 0x20, 0x20,
0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60,
0x3c, 0x73, 0x70, 0x61, 0x6e, 0x20, 0x64, 0x61, 0x6e, 0x67, 0x65, 0x72,
0x6f, 0x75, 0x73, 0x6c, 0x79, 0x53, 0x65, 0x74, 0x49, 0x6e, 0x6e, 0x65,
0x72, 0x48, 0x54, 0x4d, 0x4c, 0x3d, 0x24, 0x7b, 0x7b, 0x20, 0x5f, 0x5f,
0x68, 0x74, 0x6d, 0x6c, 0x3a, 0x20, 0x6d, 0x64, 0x20, 0x7d, 0x7d, 0x20,
0x2f, 0x3e, 0x60, 0x3b, 0x0a, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20,
0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x41, 0x70,
0x70, 0x28, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x0a,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e,
0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x3c, 0x64, 0x69, 0x76, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x63, 0x6f,
0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x22, 0x3e, 0x0a, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x68, 0x65, 0x61, 0x64, 0x65,
0x72, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x3c, 0x68, 0x31, 0x3e, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e, 0x63,
0x70, 0x70, 0x3c, 0x2f, 0x68, 0x31, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x68, 0x65, 0x61, 0x64, 0x65, 0x72,
0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c,
0x6d, 0x61, 0x69, 0x6e, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x63, 0x6f, 0x6e,
0x74, 0x65, 0x6e, 0x74, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x24, 0x7b, 0x63, 0x68, 0x61, 0x74,
0x53, 0x74, 0x61, 0x72, 0x74, 0x65, 0x64, 0x2e, 0x76, 0x61, 0x6c, 0x75,
0x65, 0x20, 0x3f, 0x20, 0x43, 0x68, 0x61, 0x74, 0x4c, 0x6f, 0x67, 0x20,
0x3a, 0x20, 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x46, 0x6f, 0x72, 0x6d,
0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x3c, 0x2f, 0x6d, 0x61, 0x69, 0x6e, 0x3e, 0x0a, 0x0a, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x6f, 0x6f, 0x74, 0x65,
0x72, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x77, 0x72, 0x69, 0x74, 0x65, 0x22,
0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x3c, 0x24, 0x7b, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x49, 0x6e,
0x70, 0x75, 0x74, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x6f, 0x6f, 0x74, 0x65, 0x72,
0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c,
0x66, 0x6f, 0x6f, 0x74, 0x65, 0x72, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x70, 0x3e, 0x50, 0x6f, 0x77,
0x65, 0x72, 0x65, 0x64, 0x20, 0x62, 0x79, 0x20, 0x3c, 0x61, 0x20, 0x68,
0x72, 0x65, 0x66, 0x3d, 0x22, 0x68, 0x74, 0x74, 0x70, 0x73, 0x3a, 0x2f,
0x2f, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f,
0x67, 0x67, 0x65, 0x72, 0x67, 0x61, 0x6e, 0x6f, 0x76, 0x2f, 0x6c, 0x6c,
0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70, 0x70, 0x22, 0x3e, 0x6c, 0x6c, 0x61,
0x6d, 0x61, 0x2e, 0x63, 0x70, 0x70, 0x3c, 0x2f, 0x61, 0x3e, 0x20, 0x61,
0x6e, 0x64, 0x20, 0x3c, 0x61, 0x20, 0x68, 0x72, 0x65, 0x66, 0x3d, 0x22,
0x68, 0x74, 0x74, 0x70, 0x73, 0x3a, 0x2f, 0x2f, 0x67, 0x67, 0x6d, 0x6c,
0x2e, 0x61, 0x69, 0x22, 0x3e, 0x67, 0x67, 0x6d, 0x6c, 0x2e, 0x61, 0x69,
0x3c, 0x2f, 0x61, 0x3e, 0x3c, 0x2f, 0x70, 0x3e, 0x0a, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x6f, 0x6f, 0x74, 0x65,
0x72, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64,
0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x3b,
0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x28, 0x68, 0x28, 0x41, 0x70, 0x70,
0x29, 0x2c, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e,
0x62, 0x6f, 0x64, 0x79, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x3c, 0x2f, 0x73,
0x63, 0x72, 0x69, 0x70, 0x74, 0x3e, 0x0a, 0x3c, 0x2f, 0x68, 0x65, 0x61,
0x64, 0x3e, 0x0a, 0x0a, 0x3c, 0x62, 0x6f, 0x64, 0x79, 0x3e, 0x0a, 0x3c,
0x2f, 0x62, 0x6f, 0x64, 0x79, 0x3e, 0x0a, 0x0a, 0x3c, 0x2f, 0x68, 0x74,
0x6d, 0x6c, 0x3e, 0x0a
};
unsigned int index_html_len = 10108;

1851
examples/server/index.js.hpp Normal file

File diff suppressed because it is too large Load Diff

24596
examples/server/json.hpp Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,81 @@
const paramDefaults = {
stream: true,
n_predict: 500,
temperature: 0.2,
stop: ["</s>"]
};
/**
* This function completes the input text using a llama dictionary.
* @param {object} params - The parameters for the completion request.
* @param {object} controller - an instance of AbortController if you need one, or null.
* @param {function} callback - The callback function to call when the completion is done.
* @returns {string} the completed text as a string. Ideally ignored, and you get at it via the callback.
*/
export const llamaComplete = async (params, controller, callback) => {
if (!controller) {
controller = new AbortController();
}
const completionParams = { ...paramDefaults, ...params };
// we use fetch directly here becasue the built in fetchEventSource does not support POST
const response = await fetch("/completion", {
method: 'POST',
body: JSON.stringify(completionParams),
headers: {
'Connection': 'keep-alive',
'Content-Type': 'application/json',
'Accept': 'text/event-stream'
},
signal: controller.signal,
});
const reader = response.body.getReader();
const decoder = new TextDecoder();
let content = "";
try {
let cont = true;
while (cont) {
const result = await reader.read();
if (result.done) {
break;
}
// sse answers in the form multiple lines of: value\n with data always present as a key. in our case we
// mainly care about the data: key here, which we expect as json
const text = decoder.decode(result.value);
// parse all sse events and add them to result
const regex = /^(\S+):\s(.*)$/gm;
for (const match of text.matchAll(regex)) {
result[match[1]] = match[2]
}
// since we know this is llama.cpp, let's just decode the json in data
result.data = JSON.parse(result.data);
content += result.data.content;
// callack
if (callback) {
cont = callback(result) != false;
}
// if we got a stop token from server, we will break here
if (result.data.stop) {
break;
}
}
} catch (e) {
console.error("llama error: ", e);
throw e;
}
finally {
controller.abort();
}
return content;
}

View File

@@ -0,0 +1,359 @@
<html>
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1" />
<title>llama.cpp - chat</title>
<style>
body {
background-color: #fff;
color: #000;
font-family: system-ui;
font-size: 90%;
}
#container {
margin: 0em auto;
display: flex;
flex-direction: column;
justify-content: space-between;
height: 100%;
}
header, footer {
text-align: center;
}
main {
margin: 3px;
display: flex;
flex-direction: column;
justify-content: space-between;
gap: 1em;
flex-grow: 1;
overflow-y: auto;
border: 1px solid #ccc;
border-radius: 5px;
padding: 0.5em;
}
body {
max-width: 600px;
min-width: 300px;
line-height: 1.2;
margin: 0 auto;
padding: 0 0.5em;
}
p {
overflow-wrap: break-word;
word-wrap: break-word;
hyphens: auto;
margin-top: 0.5em;
margin-bottom: 0.5em;
}
#write form {
margin: 1em 0 0 0;
display: flex;
flex-direction: column;
gap: 0.5em;
align-items: stretch;
}
.right {
display: flex;
flex-direction: row;
gap: 0.5em;
justify-content: flex-end;
}
fieldset {
border: none;
padding: 0;
margin: 0;
}
textarea {
padding: 5px;
flex-grow: 1;
width: 100%;
}
pre code {
display: block;
background-color: #222;
color: #ddd;
}
code {
font-family: monospace;
padding: 0.1em 0.3em;
border-radius: 3px;
}
fieldset label {
margin: 0.5em 0;
display: block;
}
</style>
<script type="module">
import {
html, h, signal, effect, computed, render, useSignal, useEffect, useRef
} from '/index.js';
import { llamaComplete } from '/completion.js';
const session = signal({
prompt: "This is a conversation between user and llama, a friendly chatbot. respond in markdown.",
template: "{{prompt}}\n\n{{history}}\n{{char}}:",
historyTemplate: "{{name}}: {{message}}",
transcript: [],
type: "chat",
char: "llama",
user: "User",
})
const transcriptUpdate = (transcript) => {
session.value = {
...session.value,
transcript
}
}
const chatStarted = computed(() => session.value.transcript.length > 0)
const params = signal({
n_predict: 400,
temperature: 0.7,
repeat_last_n: 256,
repeat_penalty: 1.18,
top_k: 40,
top_p: 0.5,
})
const controller = signal(null)
const generating = computed(() => controller.value == null )
// simple template replace
const template = (str, extraSettings) => {
let settings = session.value;
if (extraSettings) {
settings = { ...settings, ...extraSettings };
}
return String(str).replaceAll(/\{\{(.*?)\}\}/g, (_, key) => template(settings[key]));
}
// send message to server
const chat = async (msg) => {
if (controller.value) {
console.log('already running...');
return;
}
controller.value = new AbortController();
transcriptUpdate([...session.value.transcript, ["{{user}}", msg]])
const payload = template(session.value.template, {
message: msg,
history: session.value.transcript.flatMap(([name, message]) => template(session.value.historyTemplate, {name, message})).join("\n"),
});
let currentMessage = '';
const history = session.value.transcript
const llamaParams = {
...params.value,
prompt: payload,
stop: ["</s>", template("{{char}}:"), template("{{user}}:")],
}
await llamaComplete(llamaParams, controller.value, (message) => {
const data = message.data;
currentMessage += data.content;
// remove leading whitespace
currentMessage = currentMessage.replace(/^\s+/, "")
transcriptUpdate([...history, ["{{char}}", currentMessage]])
if (data.stop) {
console.log("-->", data, ' response was:', currentMessage, 'transcript state:', session.value.transcript);
}
})
controller.value = null;
}
function MessageInput() {
const message = useSignal("")
const stop = (e) => {
e.preventDefault();
if (controller.value) {
controller.value.abort();
controller.value = null;
}
}
const reset = (e) => {
stop(e);
transcriptUpdate([]);
}
const submit = (e) => {
stop(e);
chat(message.value);
message.value = "";
}
const enterSubmits = (event) => {
if (event.which === 13 && !event.shiftKey) {
submit(event);
}
}
return html`
<form onsubmit=${submit}>
<div>
<textarea type="text" rows=2 onkeypress=${enterSubmits} value="${message}" oninput=${(e) => message.value = e.target.value} placeholder="Say something..."/>
</div>
<div class="right">
<button type="submit" disabled=${!generating.value} >Send</button>
<button onclick=${stop} disabled=${generating}>Stop</button>
<button onclick=${reset}>Reset</button>
</div>
</form>
`
}
const ChatLog = (props) => {
const messages = session.value.transcript;
const container = useRef(null)
useEffect(() => {
// scroll to bottom (if needed)
if (container.current && container.current.scrollHeight <= container.current.scrollTop + container.current.offsetHeight + 300) {
container.current.scrollTo(0, container.current.scrollHeight)
}
}, [messages])
const chatLine = ([user, msg]) => {
return html`<p key=${msg}><strong>${template(user)}:</strong> <${Markdown} text=${template(msg)} /></p>`
};
return html`
<section id="chat" ref=${container}>
${messages.flatMap(chatLine)}
</section>`;
};
const ConfigForm = (props) => {
const updateSession = (el) => session.value = { ...session.value, [el.target.name]: el.target.value }
const updateParams = (el) => params.value = { ...params.value, [el.target.name]: el.target.value }
const updateParamsFloat = (el) => params.value = { ...params.value, [el.target.name]: parseFloat(el.target.value) }
return html`
<form>
<fieldset>
<div>
<label for="prompt">Prompt</label>
<textarea type="text" name="prompt" value="${session.value.prompt}" rows=4 oninput=${updateSession}/>
</div>
<div>
<label for="user">User name</label>
<input type="text" name="user" value="${session.value.user}" oninput=${updateSession} />
</div>
<div>
<label for="bot">Bot name</label>
<input type="text" name="char" value="${session.value.char}" oninput=${updateSession} />
</div>
<div>
<label for="template">Prompt template</label>
<textarea id="template" name="template" value="${session.value.template}" rows=4 oninput=${updateSession}/>
</div>
<div>
<label for="template">Chat history template</label>
<textarea id="template" name="historyTemplate" value="${session.value.historyTemplate}" rows=1 oninput=${updateSession}/>
</div>
<div>
<label for="temperature">Temperature</label>
<input type="range" id="temperature" min="0.0" max="1.0" step="0.01" name="temperature" value="${params.value.temperature}" oninput=${updateParamsFloat} />
<span>${params.value.temperature}</span>
</div>
<div>
<label for="nPredict">Predictions</label>
<input type="range" id="nPredict" min="1" max="2048" step="1" name="n_predict" value="${params.value.n_predict}" oninput=${updateParamsFloat} />
<span>${params.value.n_predict}</span>
</div>
<div>
<label for="repeat_penalty">Penalize repeat sequence</label>
<input type="range" id="repeat_penalty" min="0.0" max="2.0" step="0.01" name="repeat_penalty" value="${params.value.repeat_penalty}" oninput=${updateParamsFloat} />
<span>${params.value.repeat_penalty}</span>
</div>
<div>
<label for="repeat_last_n">Consider N tokens for penalize</label>
<input type="range" id="repeat_last_n" min="0.0" max="2048" name="repeat_last_n" value="${params.value.repeat_last_n}" oninput=${updateParamsFloat} />
<span>${params.value.repeat_last_n}</span>
</div>
</fieldset>
</form>
`
}
const Markdown = (params) => {
const md = params.text
.replace(/^#{1,6} (.*)$/gim, '<h3>$1</h3>')
.replace(/\*\*(.*?)\*\*/g, '<strong>$1</strong>')
.replace(/__(.*?)__/g, '<strong>$1</strong>')
.replace(/\*(.*?)\*/g, '<em>$1</em>')
.replace(/_(.*?)_/g, '<em>$1</em>')
.replace(/```.*?\n([\s\S]*?)```/g, '<pre><code>$1</code></pre>')
.replace(/`(.*?)`/g, '<code>$1</code>')
.replace(/\n/gim, '<br />');
return html`<span dangerouslySetInnerHTML=${{ __html: md }} />`;
};
function App(props) {
return html`
<div id="container">
<header>
<h1>llama.cpp</h1>
</header>
<main id="content">
<${chatStarted.value ? ChatLog : ConfigForm} />
</main>
<footer id="write">
<${MessageInput} />
</footer>
<footer>
<p>Powered by <a href="https://github.com/ggerganov/llama.cpp">llama.cpp</a> and <a href="https://ggml.ai">ggml.ai</a></p>
</footer>
</div>
`;
}
render(h(App), document.body);
</script>
</head>
<body>
</body>
</html>

File diff suppressed because one or more lines are too long

1124
examples/server/server.cpp Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,7 @@
set(TARGET simple)
add_executable(${TARGET} simple.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

179
examples/simple/simple.cpp Normal file
View File

@@ -0,0 +1,179 @@
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include "common.h"
#include "llama.h"
#include "build-info.h"
#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
#include <unistd.h>
#elif defined (_WIN32)
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>
#include <signal.h>
#endif
int main(int argc, char ** argv)
{
gpt_params params;
//---------------------------------
// Print help :
//---------------------------------
if ( argc == 1 || argv[1][0] == '-' )
{
printf( "usage: %s MODEL_PATH [PROMPT]\n" , argv[0] );
return 1 ;
}
//---------------------------------
// Load parameters :
//---------------------------------
if ( argc >= 2 )
{
params.model = argv[1];
}
if ( argc >= 3 )
{
params.prompt = argv[2];
}
if ( params.prompt.empty() )
{
params.prompt = "Hello my name is";
}
//---------------------------------
// Init LLM :
//---------------------------------
llama_init_backend(params.numa);
llama_model * model;
llama_context * ctx;
std::tie(model, ctx) = llama_init_from_gpt_params( params );
if ( model == NULL )
{
fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
return 1;
}
//---------------------------------
// Tokenize the prompt :
//---------------------------------
std::vector<llama_token> tokens_list;
tokens_list = ::llama_tokenize( ctx , params.prompt , true );
const int max_context_size = llama_n_ctx( ctx );
const int max_tokens_list_size = max_context_size - 4 ;
if ( (int)tokens_list.size() > max_tokens_list_size )
{
fprintf( stderr , "%s: error: prompt too long (%d tokens, max %d)\n" ,
__func__ , (int)tokens_list.size() , max_tokens_list_size );
return 1;
}
fprintf( stderr, "\n\n" );
// Print the tokens from the prompt :
for( auto id : tokens_list )
{
printf( "%s" , llama_token_to_str( ctx , id ) );
}
fflush(stdout);
//---------------------------------
// Main prediction loop :
//---------------------------------
// The LLM keeps a contextual cache memory of previous token evaluation.
// Usually, once this cache is full, it is required to recompute a compressed context based on previous
// tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
// example, we will just stop the loop once this cache is full or once an end of stream is detected.
while ( llama_get_kv_cache_token_count( ctx ) < max_context_size )
{
//---------------------------------
// Evaluate the tokens :
//---------------------------------
if ( llama_eval( ctx , tokens_list.data() , tokens_list.size() , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) )
{
fprintf( stderr, "%s : failed to eval\n" , __func__ );
return 1;
}
tokens_list.clear();
//---------------------------------
// Select the best prediction :
//---------------------------------
llama_token new_token_id = 0;
auto logits = llama_get_logits( ctx );
auto n_vocab = llama_n_vocab( ctx ); // the size of the LLM vocabulary (in tokens)
std::vector<llama_token_data> candidates;
candidates.reserve( n_vocab );
for( llama_token token_id = 0 ; token_id < n_vocab ; token_id++ )
{
candidates.emplace_back( llama_token_data{ token_id , logits[ token_id ] , 0.0f } );
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
// Select it using the "Greedy sampling" method :
new_token_id = llama_sample_token_greedy( ctx , &candidates_p );
// is it an end of stream ?
if ( new_token_id == llama_token_eos() )
{
fprintf(stderr, " [end of text]\n");
break;
}
// Print the new token :
printf( "%s" , llama_token_to_str( ctx , new_token_id ) );
fflush( stdout );
// Push this new token for next evaluation :
tokens_list.push_back( new_token_id );
} // wend of main loop
llama_free( ctx );
llama_free_model( model );
return 0;
}
// EOF

View File

@@ -0,0 +1,4 @@
set(TARGET train-text-from-scratch)
add_executable(${TARGET} train-text-from-scratch.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View File

@@ -0,0 +1,22 @@
# train-text-from-scratch
Basic usage instructions:
```bash
# get training data
wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt
# train
./bin/train-text-from-scratch \
--vocab-model ../models/ggml-vocab.bin \
--ctx 64 --embd 256 --head 8 --layer 16 \
--checkpoint-in chk-shakespeare-256x16.bin \
--checkpoint-out chk-shakespeare-256x16.bin \
--model-out ggml-shakespeare-256x16-f32.bin \
--train-data "shakespeare.txt" \
-t 6 -b 16 -n 32 --seed 1 --adam-iter 16 \
--print-details-interval 0 --predict 16 --use-flash
# predict
./bin/main -m ggml-shakespeare-256x16-f32.bin
```

File diff suppressed because it is too large Load Diff

30
flake.lock generated
View File

@@ -1,12 +1,15 @@
{
"nodes": {
"flake-utils": {
"inputs": {
"systems": "systems"
},
"locked": {
"lastModified": 1676283394,
"narHash": "sha256-XX2f9c3iySLCw54rJ/CZs+ZK6IQy7GXNY4nSOyu2QG4=",
"lastModified": 1685518550,
"narHash": "sha256-o2d0KcvaXzTrPRIo0kOLV0/QXHhDQ5DTi+OxcjO8xqY=",
"owner": "numtide",
"repo": "flake-utils",
"rev": "3db36a8b464d0c4532ba1c7dda728f4576d6d073",
"rev": "a1720a10a6cfe8234c0e93907ffe81be440f4cef",
"type": "github"
},
"original": {
@@ -17,11 +20,11 @@
},
"nixpkgs": {
"locked": {
"lastModified": 1678470307,
"narHash": "sha256-OEeMUr3ueLIXyW/OaFUX5jUdimyQwMg/7e+/Q0gC/QE=",
"lastModified": 1685931219,
"narHash": "sha256-8EWeOZ6LKQfgAjB/USffUSELPRjw88A+xTcXnOUvO5M=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "0c4800d579af4ed98ecc47d464a5e7b0870c4b1f",
"rev": "7409480d5c8584a1a83c422530419efe4afb0d19",
"type": "github"
},
"original": {
@@ -36,6 +39,21 @@
"flake-utils": "flake-utils",
"nixpkgs": "nixpkgs"
}
},
"systems": {
"locked": {
"lastModified": 1681028828,
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
"owner": "nix-systems",
"repo": "default",
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
"type": "github"
},
"original": {
"owner": "nix-systems",
"repo": "default",
"type": "github"
}
}
},
"root": "root",

View File

@@ -6,44 +6,69 @@
outputs = { self, nixpkgs, flake-utils }:
flake-utils.lib.eachDefaultSystem (system:
let
pkgs = import nixpkgs {
inherit system;
};
llama-python = pkgs.python310.withPackages (ps: with ps; [
torch
numpy
sentencepiece
]);
in
{
inherit (pkgs.stdenv) isAarch64 isDarwin;
inherit (pkgs.lib) optionals;
isM1 = isAarch64 && isDarwin;
osSpecific = if isM1 then
with pkgs.darwin.apple_sdk_11_0.frameworks; [
Accelerate
MetalKit
MetalPerformanceShaders
MetalPerformanceShadersGraph
]
else if isDarwin then
with pkgs.darwin.apple_sdk.frameworks; [
Accelerate
CoreGraphics
CoreVideo
]
else
[ ];
pkgs = import nixpkgs { inherit system; };
llama-python =
pkgs.python310.withPackages (ps: with ps; [ numpy sentencepiece ]);
in {
packages.default = pkgs.stdenv.mkDerivation {
name = "llama.cpp";
src = ./.;
postPatch = if isM1 then ''
substituteInPlace ./ggml-metal.m \
--replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
'' else
"";
nativeBuildInputs = with pkgs; [ cmake ];
buildInputs = with pkgs; lib.optionals stdenv.isDarwin [
darwin.apple_sdk.frameworks.Accelerate
];
cmakeFlags = with pkgs; lib.optionals (system == "aarch64-darwin") [
buildInputs = osSpecific;
cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" ] ++ (optionals isM1 [
"-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
];
"-DLLAMA_METAL=ON"
]);
installPhase = ''
mkdir -p $out/bin
mv bin/main $out/bin/llama
mv bin/quantize $out/bin/quantize
echo "#!${llama-python}/bin/python" > $out/bin/convert-pth-to-ggml
cat ${./convert-pth-to-ggml.py} >> $out/bin/convert-pth-to-ggml
chmod +x $out/bin/convert-pth-to-ggml
mv bin/* $out/bin/
mv $out/bin/main $out/bin/llama
mv $out/bin/server $out/bin/llama-server
echo "#!${llama-python}/bin/python" > $out/bin/convert.py
cat ${./convert.py} >> $out/bin/convert.py
chmod +x $out/bin/convert.py
'';
meta.mainProgram = "llama";
};
devShells.default = pkgs.mkShell {
packages = with pkgs; [
cmake
llama-python
] ++ lib.optionals stdenv.isDarwin [
darwin.apple_sdk.frameworks.Accelerate
];
apps.llama-server = {
type = "app";
program = "${self.packages.${system}.default}/bin/llama-server";
};
}
);
apps.llama-embedding = {
type = "app";
program = "${self.packages.${system}.default}/bin/embedding";
};
apps.llama = {
type = "app";
program = "${self.packages.${system}.default}/bin/llama";
};
apps.default = self.apps.${system}.llama;
devShells.default = pkgs.mkShell {
packages = with pkgs; [ cmake llama-python ] ++ osSpecific;
};
});
}

3050
ggml-cuda.cu Normal file

File diff suppressed because it is too large Load Diff

Some files were not shown because too many files have changed in this diff Show More