Mirror of https://github.com/ggerganov/llama.cpp.git, synced 2026-02-26 14:23:22 +02:00

Compare commits: 13 commits, master-777...master-cf6

| SHA1 |
|---|
| cf658adc83 |
| a192860cfe |
| 95385241a9 |
| 335acd2ffd |
| 5290c38e6e |
| cc34dbda96 |
| 7c2227a197 |
| f19dca04ea |
| 8207214b6a |
| 62959e740e |
| 7f7ddd5002 |
| b8ad1b66b2 |
| f5fe98d11b |
.devops/lamma-cpp-clblast.srpm.spec (new file, 58 lines)
@@ -0,0 +1,58 @@
# SRPM for building from source and packaging an RPM for RPM-based distros.
# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
# Built and maintained by John Boero - boeroboy@gmail.com
# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal

# Notes for llama.cpp:
# 1. Tags are currently based on hash - which will not sort asciibetically.
#    We need to declare standard versioning if people want to sort latest releases.
# 2. Builds for CUDA/OpenCL support are separate, with different dependencies.
# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
#    Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
#    It is up to the user to install the correct vendor-specific support.

Name: llama.cpp-clblast
Version: master
Release: 1%{?dist}
Summary: OpenCL Inference of LLaMA model in pure C/C++
License: MIT
Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
BuildRequires: coreutils make gcc-c++ git mesa-libOpenCL-devel
URL: https://github.com/ggerganov/llama.cpp

%define debug_package %{nil}
%define source_date_epoch_from_changelog 0

%description
CPU inference for Meta's Llama2 models using default options.

%prep
%setup -n llama.cpp-master

%build
make -j LLAMA_CLBLAST=1

%install
mkdir -p %{buildroot}%{_bindir}/
cp -p main %{buildroot}%{_bindir}/llamacppclblast
cp -p server %{buildroot}%{_bindir}/llamacppclblastserver
cp -p simple %{buildroot}%{_bindir}/llamacppclblastsimple

%clean
rm -rf %{buildroot}
rm -rf %{_builddir}/*

%files
%{_bindir}/llamacppclblast
%{_bindir}/llamacppclblastserver
%{_bindir}/llamacppclblastsimple

%pre

%post

%preun
%postun

%changelog
.devops/lamma-cpp-cublas.srpm.spec (new file, 59 lines)
@@ -0,0 +1,59 @@
# SRPM for building from source and packaging an RPM for RPM-based distros.
# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
# Built and maintained by John Boero - boeroboy@gmail.com
# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal

# Notes for llama.cpp:
# 1. Tags are currently based on hash - which will not sort asciibetically.
#    We need to declare standard versioning if people want to sort latest releases.
# 2. Builds for CUDA/OpenCL support are separate, with different dependencies.
# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
#    Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
#    It is up to the user to install the correct vendor-specific support.

Name: llama.cpp-cublas
Version: master
Release: 1%{?dist}
Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
License: MIT
Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
BuildRequires: coreutils make gcc-c++ git cuda-toolkit
Requires: cuda-toolkit
URL: https://github.com/ggerganov/llama.cpp

%define debug_package %{nil}
%define source_date_epoch_from_changelog 0

%description
CPU inference for Meta's Llama2 models using default options.

%prep
%setup -n llama.cpp-master

%build
make -j LLAMA_CUBLAS=1

%install
mkdir -p %{buildroot}%{_bindir}/
cp -p main %{buildroot}%{_bindir}/llamacppcublas
cp -p server %{buildroot}%{_bindir}/llamacppcublasserver
cp -p simple %{buildroot}%{_bindir}/llamacppcublassimple

%clean
rm -rf %{buildroot}
rm -rf %{_builddir}/*

%files
%{_bindir}/llamacppcublas
%{_bindir}/llamacppcublasserver
%{_bindir}/llamacppcublassimple

%pre

%post

%preun
%postun

%changelog
.devops/llama-cpp.srpm.spec (new file, 58 lines)
@@ -0,0 +1,58 @@
# SRPM for building from source and packaging an RPM for RPM-based distros.
# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
# Built and maintained by John Boero - boeroboy@gmail.com
# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal

# Notes for llama.cpp:
# 1. Tags are currently based on hash - which will not sort asciibetically.
#    We need to declare standard versioning if people want to sort latest releases.
# 2. Builds for CUDA/OpenCL support are separate, with different dependencies.
# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
#    Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
#    It is up to the user to install the correct vendor-specific support.

Name: llama.cpp
Version: master
Release: 1%{?dist}
Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
License: MIT
Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
BuildRequires: coreutils make gcc-c++ git
URL: https://github.com/ggerganov/llama.cpp

%define debug_package %{nil}
%define source_date_epoch_from_changelog 0

%description
CPU inference for Meta's Llama2 models using default options.

%prep
%autosetup

%build
make -j

%install
mkdir -p %{buildroot}%{_bindir}/
cp -p main %{buildroot}%{_bindir}/llamacpp
cp -p server %{buildroot}%{_bindir}/llamacppserver
cp -p simple %{buildroot}%{_bindir}/llamacppsimple

%clean
rm -rf %{buildroot}
rm -rf %{_builddir}/*

%files
%{_bindir}/llamacpp
%{_bindir}/llamacppserver
%{_bindir}/llamacppsimple

%pre

%post

%preun
%postun

%changelog
.gitignore (vendored, 3 changed lines)
@@ -3,6 +3,8 @@
*.so
*.gguf
*.bin
*.exe
*.dll
.DS_Store
.build/
.cache/
@@ -81,4 +83,3 @@ tests/test-quantize-fns
tests/test-quantize-perf
tests/test-sampling
tests/test-tokenizer-0
README.md (12 changed lines)
@@ -39,6 +39,7 @@ Last revision compatible with the old format: [dadbed9](https://github.com/ggerg
  <li><a href="#memorydisk-requirements">Memory/Disk Requirements</a></li>
  <li><a href="#quantization">Quantization</a></li>
  <li><a href="#interactive-mode">Interactive mode</a></li>
  <li><a href="#constrained-output-with-grammars">Constrained output with grammars</a></li>
  <li><a href="#instruction-mode-with-alpaca">Instruction mode with Alpaca</a></li>
  <li><a href="#using-openllama">Using OpenLLaMA</a></li>
  <li><a href="#using-gpt4all">Using GPT4All</a></li>
@@ -604,6 +605,16 @@ PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \
    CHAT_SAVE_DIR=./chat/bob ./examples/chat-persistent.sh
```

### Constrained output with grammars

`llama.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only:

```bash
./main -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
```

The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).

### Instruction mode with Alpaca

1. First, download the `ggml` Alpaca model into the `./models` folder
@@ -885,3 +896,4 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /
- [BLIS](./docs/BLIS.md)
- [Performance troubleshooting](./docs/token_generation_performance_tips.md)
- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
- [GBNF grammars](./grammars/README.md)
@@ -417,6 +417,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
            params.antiprompt.push_back(argv[i]);
        } else if (arg == "--perplexity") {
            params.perplexity = true;
        } else if (arg == "--ppl-stride") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.ppl_stride = std::stoi(argv[i]);
        } else if (arg == "--ppl-output-type") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.ppl_output_type = std::stoi(argv[i]);
        } else if (arg == "--hellaswag") {
            params.hellaswag = true;
        } else if (arg == "--hellaswag-tasks") {
@@ -732,35 +744,3 @@ std::string llama_token_to_str(const struct llama_context * ctx, llama_token tok

    return std::string(result.data(), result.size());
}

std::vector<llama_token> llama_tokenize_bpe(
    struct llama_context * ctx,
    const std::string & text,
    bool add_bos) {
    int n_tokens = text.length() + add_bos;
    std::vector<llama_token> result(n_tokens);
    n_tokens = llama_tokenize_bpe(ctx, text.c_str(), result.data(), result.size(), add_bos);
    if (n_tokens < 0) {
        result.resize(-n_tokens);
        int check = llama_tokenize_bpe(ctx, text.c_str(), result.data(), result.size(), add_bos);
        GGML_ASSERT(check == -n_tokens);
    } else {
        result.resize(n_tokens);
    }
    return result;
}

std::string llama_token_to_str_bpe(const struct llama_context * ctx, llama_token token) {
    std::vector<char> result(8, 0);
    const int n_tokens = llama_token_to_str_bpe(ctx, token, result.data(), result.size());
    if (n_tokens < 0) {
        result.resize(-n_tokens);
        const int check = llama_token_to_str_bpe(ctx, token, result.data(), result.size());
        GGML_ASSERT(check == -n_tokens);
    } else {
        result.resize(n_tokens);
    }

    return std::string(result.data(), result.size());
}
@@ -64,6 +64,10 @@ struct gpt_params {
    std::string lora_adapter = ""; // lora adapter path
    std::string lora_base = "";    // base model path for the lora adapter

    int ppl_stride = 0;      // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
    int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
                             //       (which is more convenient to use for plotting)
    //
    bool hellaswag = false;        // compute HellaSwag score over random tasks from datafile supplied in prompt
    size_t hellaswag_tasks = 400;  // number of tasks to use when computing the HellaSwag score
@@ -116,15 +120,6 @@ std::vector<llama_token> llama_tokenize(
    const std::string & text,
    bool add_bos);

std::vector<llama_token> llama_tokenize_bpe(
    struct llama_context * ctx,
    const std::string & text,
    bool add_bos);

std::string llama_token_to_str(
    const struct llama_context * ctx,
    llama_token token);

std::string llama_token_to_str_bpe(
    const struct llama_context * ctx,
    llama_token token);
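The two new `gpt_params` fields above are wired to the `--ppl-stride` and `--ppl-output-type` flags added in the `gpt_params_parse` hunk earlier in this compare. A minimal sketch of how they might be combined with the `perplexity` example (model and dataset paths are placeholders, not taken from this diff):

```bash
# Strided perplexity over wikitext-2-raw, emitting "num_tokens ppl" pairs
# (one per line, handy for plotting); file paths below are placeholders.
./perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw \
    --ppl-stride 32 --ppl-output-type 1 > ppl.dat
```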
56
convert-falcon-hf-to-gguf.py
Normal file → Executable file
56
convert-falcon-hf-to-gguf.py
Normal file → Executable file
@@ -1,3 +1,4 @@
|
||||
#!/usr/bin/env python3
|
||||
# HF falcon--> gguf conversion
|
||||
|
||||
import gguf
|
||||
@@ -94,14 +95,17 @@ print("gguf: get model metadata")
|
||||
|
||||
block_count = hparams["n_layer"]
|
||||
|
||||
gguf_writer.add_name(last_dir)
|
||||
gguf_writer.add_name("Falcon")
|
||||
gguf_writer.add_context_length(2048) # not in config.json
|
||||
gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
|
||||
gguf_writer.add_embedding_length(hparams["hidden_size"])
|
||||
gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"])
|
||||
gguf_writer.add_block_count(block_count)
|
||||
gguf_writer.add_head_count(hparams["n_head"])
|
||||
if "n_head_kv" in hparams: gguf_writer.add_head_count_kv(hparams["n_head_kv"])
|
||||
if "n_head_kv" in hparams:
|
||||
gguf_writer.add_head_count_kv(hparams["n_head_kv"])
|
||||
else:
|
||||
gguf_writer.add_head_count_kv(1)
|
||||
gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
|
||||
|
||||
# TOKENIZATION
|
||||
@@ -109,6 +113,8 @@ gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
|
||||
print("gguf: get tokenizer metadata")
|
||||
|
||||
tokens: List[str] = []
|
||||
scores: List[float] = []
|
||||
toktypes: List[int] = []
|
||||
merges: List[str] = []
|
||||
|
||||
|
||||
@@ -152,41 +158,30 @@ if Path(dir_model + "/tokenizer.json").is_file():
|
||||
text = bytearray(pad_token)
|
||||
|
||||
tokens.append(text)
|
||||
scores.append(0.0) # dummy
|
||||
toktypes.append(gguf.TokenType.NORMAL) # dummy
|
||||
|
||||
gguf_writer.add_token_list(tokens)
|
||||
gguf_writer.add_token_scores(scores)
|
||||
gguf_writer.add_token_types(toktypes)
|
||||
|
||||
if "added_tokens" in tokenizer_json and Path(dir_model + "/tokenizer_config.json").is_file():
|
||||
print("gguf: get special token ids")
|
||||
print("gguf: get special token ids")
|
||||
# Look for special tokens in config.json
|
||||
|
||||
with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
|
||||
tokenizer_config = json.load(f)
|
||||
if "bos_token_id" in hparams and hparams["bos_token_id"] != None:
|
||||
gguf_writer.add_bos_token_id(hparams["bos_token_id"])
|
||||
|
||||
# find special token ids
|
||||
if "eos_token_id" in hparams and hparams["eos_token_id"] != None:
|
||||
gguf_writer.add_eos_token_id(hparams["eos_token_id"])
|
||||
|
||||
if "bos_token" in tokenizer_config:
|
||||
for key in tokenizer_json["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["bos_token"]:
|
||||
gguf_writer.add_bos_token_id(key["id"])
|
||||
if "unk_token_id" in hparams and hparams["unk_token_id"] != None:
|
||||
gguf_writer.add_unk_token_id(hparams["unk_token_id"])
|
||||
|
||||
if "eos_token" in tokenizer_config:
|
||||
for key in tokenizer_json["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["eos_token"]:
|
||||
gguf_writer.add_eos_token_id(key["id"])
|
||||
if "sep_token_id" in hparams and hparams["sep_token_id"] != None:
|
||||
gguf_writer.add_sep_token_id(hparams["sep_token_id"])
|
||||
|
||||
if "unk_token" in tokenizer_config:
|
||||
for key in tokenizer_json["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["unk_token"]:
|
||||
gguf_writer.add_unk_token_id(key["id"])
|
||||
|
||||
if "sep_token" in tokenizer_config:
|
||||
for key in tokenizer_json["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["sep_token"]:
|
||||
gguf_writer.add_sep_token_id(key["id"])
|
||||
|
||||
if "pad_token" in tokenizer_config:
|
||||
for key in tokenizer_json["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["pad_token"]:
|
||||
gguf_writer.add_pad_token_id(key["id"])
|
||||
if "pad_token_id" in hparams and hparams["pad_token_id"] != None:
|
||||
gguf_writer.add_pad_token_id(hparams["pad_token_id"])
|
||||
|
||||
|
||||
# TENSORS
|
||||
@@ -194,8 +189,9 @@ if Path(dir_model + "/tokenizer.json").is_file():
|
||||
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
|
||||
|
||||
# params for qkv transform
|
||||
n_head = hparams["n_head"]
|
||||
n_head = hparams["n_head"]
|
||||
n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1
|
||||
|
||||
head_dim = hparams["hidden_size"] // n_head
|
||||
|
||||
# tensor info
|
||||
|
||||
convert-gptneox-hf-to-gguf.py (Normal file → Executable file, 1 changed line)
@@ -1,3 +1,4 @@
#!/usr/bin/env python3
# HF gptneox--> gguf conversion

import gguf
convert-llama-7b-pth-to-gguf.py (Normal file → Executable file, 1 changed line)
@@ -1,3 +1,4 @@
#!/usr/bin/env python3
# 7b pth llama --> gguf conversion
# Only models with a single datafile are supported, like 7B
# HF files required in the model dir: config.json tokenizer_config.json tokenizer.json tokenizer.model
13
convert-llama-ggmlv3-to-gguf.py
Normal file → Executable file
13
convert-llama-ggmlv3-to-gguf.py
Normal file → Executable file
@@ -1,12 +1,11 @@
|
||||
import sys, struct, math, argparse, warnings
|
||||
#!/usr/bin/env python3
|
||||
import sys, struct, math, argparse
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
|
||||
import gguf
|
||||
|
||||
warnings.filterwarnings('error')
|
||||
|
||||
# Note: Does not support GGML_QKK_64
|
||||
QK_K = 256
|
||||
# Items here are (block size, type size)
|
||||
@@ -95,7 +94,7 @@ class Tensor:
|
||||
pad = ((offset + 31) & ~31) - offset
|
||||
offset += pad
|
||||
n_elems = np.prod(self.dims)
|
||||
n_bytes = (n_elems * tysize) // blksize
|
||||
n_bytes = np.int64(np.int64(n_elems) * np.int64(tysize)) // np.int64(blksize)
|
||||
self.start_offset = offset
|
||||
self.len_bytes = n_bytes
|
||||
offset += n_bytes
|
||||
@@ -327,11 +326,7 @@ def main():
|
||||
data = np.memmap(cfg.input, mode = 'r')
|
||||
model = GGMLV3Model()
|
||||
print('* Scanning GGML input file')
|
||||
try:
|
||||
offset = model.load(data, 0)
|
||||
except OverflowError:
|
||||
print(f'!!! Caught overflow loading tensors. The most likely issue is running on Windows but not in WSL. Try running in WSL if possible.', file = sys.stderr)
|
||||
raise
|
||||
offset = model.load(data, 0)
|
||||
print(f'* GGML model hyperparameters: {model.hyperparameters}')
|
||||
vocab_override = None
|
||||
params_override = None
|
||||
|
||||
convert-llama-hf-to-gguf.py (Normal file → Executable file, 1 changed line)
@@ -1,3 +1,4 @@
#!/usr/bin/env python3
# HF llama --> gguf conversion

import gguf
@@ -1,4 +1,4 @@
|
||||
#!/usr/bin/env python
|
||||
#!/usr/bin/env python3
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
@@ -6,23 +6,22 @@ import struct
|
||||
import sys
|
||||
from typing import Any, Dict, Sequence, TextIO
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from convert import DATA_TYPE_TO_FTYPE, NUMPY_TYPE_TO_DATA_TYPE, DataType
|
||||
NUMPY_TYPE_TO_FTYPE: Dict[str, int] = {"float32": 0, "float16": 1}
|
||||
|
||||
|
||||
HF_SUBLAYER_TO_GGML = {
|
||||
"self_attn.q_proj": "attention.wq",
|
||||
"self_attn.k_proj": "attention.wk",
|
||||
"self_attn.v_proj": "attention.wv",
|
||||
"self_attn.o_proj": "attention.wo",
|
||||
"mlp.gate_proj": "feed_forward.w1",
|
||||
"mlp.down_proj": "feed_forward.w2",
|
||||
"mlp.up_proj": "feed_forward.w3",
|
||||
"input_layernorm": "attention_norm",
|
||||
"self_attn.q_proj": "attn_q",
|
||||
"self_attn.k_proj": "attn_k",
|
||||
"self_attn.v_proj": "attn_v",
|
||||
"self_attn.o_proj": "attn_output",
|
||||
"mlp.gate_proj": "ffn_gate",
|
||||
"mlp.down_proj": "ffn_down",
|
||||
"mlp.up_proj": "ffn_up",
|
||||
"input_layernorm": "attn_norm",
|
||||
"post_attention_layernorm": "ffn_norm",
|
||||
# "norm": "norm",
|
||||
# "embed_tokens": "tok_embeddings",
|
||||
# "lm_head": "output",
|
||||
}
|
||||
|
||||
|
||||
@@ -39,7 +38,7 @@ def translate_tensor_name(t: str) -> str:
|
||||
sys.exit(1)
|
||||
|
||||
output_string = (
|
||||
f"layers.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}"
|
||||
f"blk.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}"
|
||||
)
|
||||
return output_string
|
||||
else:
|
||||
@@ -54,12 +53,14 @@ def write_file_header(fout: TextIO, params: Dict[str, Any]) -> None:
|
||||
# https://opendelta.readthedocs.io/en/latest/modules/deltas.html says that `lora_alpha` is an int
|
||||
# but some models ship a float value instead
|
||||
# let's convert to int, but fail if lossless conversion is not possible
|
||||
assert int(params["lora_alpha"]) == params["lora_alpha"], "cannot convert float to int losslessly"
|
||||
assert (
|
||||
int(params["lora_alpha"]) == params["lora_alpha"]
|
||||
), "cannot convert float to int losslessly"
|
||||
fout.write(struct.pack("i", int(params["lora_alpha"])))
|
||||
|
||||
|
||||
def write_tensor_header(
|
||||
self, name: str, shape: Sequence[int], data_type: DataType
|
||||
self, name: str, shape: Sequence[int], data_type: np.dtype
|
||||
) -> None:
|
||||
sname = name.encode("utf-8")
|
||||
fout.write(
|
||||
@@ -67,7 +68,7 @@ def write_tensor_header(
|
||||
"iii",
|
||||
len(shape),
|
||||
len(sname),
|
||||
DATA_TYPE_TO_FTYPE[NUMPY_TYPE_TO_DATA_TYPE[data_type]],
|
||||
NUMPY_TYPE_TO_FTYPE[data_type.name],
|
||||
)
|
||||
)
|
||||
fout.write(struct.pack("i" * len(shape), *shape[::-1]))
|
||||
|
||||
convert.py (Normal file → Executable file, 8 changed lines)
@@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python3

import gguf
import argparse
@@ -733,7 +733,11 @@ class OutputFile:
        self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])

    def add_meta_arch(self, params: Params) -> None:
        self.gguf.add_name ("LLaMA")
        ver = None
        if (params.n_ctx == 4096):
            ver = "v2"

        self.gguf.add_name ("LLaMA" if ver == None else "LLaMA " + ver)
        self.gguf.add_context_length (params.n_ctx)
        self.gguf.add_embedding_length (params.n_embd)
        self.gguf.add_block_count (params.n_layer)
@@ -12,15 +12,19 @@ usage: ./convert-llama2c-to-ggml [options]

options:
  -h, --help                       show this help message and exit
  --copy-vocab-from-model FNAME    model path from which to copy vocab (default 'models/ggml-vocab.bin')
  --copy-vocab-from-model FNAME    model path from which to copy vocab (default 'tokenizer.bin')
  --llama2c-model FNAME            [REQUIRED] model path from which to load Karpathy's llama2.c model
  --llama2c-output-model FNAME     model path to save the converted llama2.c model (default ak_llama_model.bin')
```

An example command is as follows:
An example command using a model from [karpathy/tinyllamas](https://huggingface.co/karpathy/tinyllamas) is as follows:

`$ ./convert-llama2c-to-ggml --copy-vocab-from-model <ggml-vocab.bin> --llama2c-model <llama2.c model path> --llama2c-output-model <ggml output model path>`
`$ ./convert-llama2c-to-ggml --copy-vocab-from-model ../llama2.c/tokenizer.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.ggmlv3.bin`

Now you can use the model with command like:
For now the generated model is in the legacy GGJTv3 format, so you need to convert it to gguf manually:

`$ ./main -m <ggml output model path> -p "One day, Lily met a Shoggoth" -n 500 -c 256 -eps 1e-5`
`$ python ./convert-llama-ggmlv3-to-gguf.py --eps 1e-5 --input stories42M.ggmlv3.bin --output stories42M.gguf.bin`

Now you can use the model with a command like:

`$ ./main -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`
@@ -17,6 +17,9 @@
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
|
||||
#define LLAMA_FILE_VERSION_GGJT_V3 3
|
||||
|
||||
//////////////////////////////////////// llama2.c model structs and functions to load models, alloc memory etc.
|
||||
typedef struct {
|
||||
int dim; // transformer dimension
|
||||
@@ -49,10 +52,10 @@ typedef struct {
|
||||
// float* freq_cis_real; // (seq_len, dim/2)
|
||||
// float* freq_cis_imag; // (seq_len, dim/2)
|
||||
// (optional) classifier weights for the logits, on the last layer
|
||||
//float* wcls;
|
||||
float* wcls;
|
||||
} TransformerWeights;
|
||||
|
||||
void malloc_weights(TransformerWeights* w, Config* p) {
|
||||
void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
|
||||
// we calloc instead of malloc to keep valgrind happy
|
||||
w->token_embedding_table = new float[p->vocab_size * p->dim]();
|
||||
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
|
||||
@@ -86,9 +89,16 @@ void malloc_weights(TransformerWeights* w, Config* p) {
|
||||
|
||||
w->rms_final_weight = new float[p->dim]();
|
||||
printf("[%s:AK] Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
|
||||
|
||||
if (shared_weights) {
|
||||
w->wcls = NULL;
|
||||
} else {
|
||||
w->wcls = new float[p->vocab_size * p->dim]();
|
||||
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
|
||||
}
|
||||
}
|
||||
|
||||
int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f) {
|
||||
int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) {
|
||||
if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
|
||||
if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
|
||||
if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
|
||||
@@ -100,6 +110,22 @@ int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f) {
|
||||
if (fread(w->w2, sizeof(float), p->n_layers * p->hidden_dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->hidden_dim * p->dim)) return 1;
|
||||
if (fread(w->w3, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
|
||||
if (fread(w->rms_final_weight, sizeof(float), p->dim, f) != static_cast<size_t>(p->dim)) return 1;
|
||||
|
||||
// Skip freq_cis_real & freq_cis_imag
|
||||
int head_size = p->dim / p->n_heads;
|
||||
fseek(f, p->seq_len * head_size * sizeof(float), SEEK_CUR);
|
||||
|
||||
if (!shared_weights && fread(w->wcls, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
|
||||
|
||||
// Check we didn't forget to read anything
|
||||
auto curr = ftell(f);
|
||||
fseek(f, 0, SEEK_END);
|
||||
auto end = ftell(f);
|
||||
if (curr != end) {
|
||||
printf("Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", curr, end);
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -115,6 +141,7 @@ void free_weights(TransformerWeights* w) {
|
||||
delete w->w2;
|
||||
delete w->w3;
|
||||
delete w->rms_final_weight;
|
||||
if (w->wcls) delete w->wcls;
|
||||
}
|
||||
|
||||
void print_sample_weights(TransformerWeights *w){
|
||||
@@ -131,6 +158,7 @@ void print_sample_weights(TransformerWeights *w){
|
||||
printf("%f\n", w->w2[0]);
|
||||
printf("%f\n", w->w3[0]);
|
||||
printf("%f\n", w->rms_att_weight[0]);
|
||||
if (w->wcls) printf("%f\n", w->wcls[0]);
|
||||
}
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
@@ -509,26 +537,28 @@ bool is_ggml_file(const char *filename) {
|
||||
}
|
||||
|
||||
void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
|
||||
// heuristic to infer whether vocab is from ggml or from llama2.c vocabulary
|
||||
if (is_ggml_file(filename)) {
|
||||
|
||||
struct llama_context_params llama_params = llama_context_default_params();
|
||||
llama_params.vocab_only = true;
|
||||
|
||||
struct llama_model * lmodel = llama_load_model_from_file(filename, llama_params);
|
||||
struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);
|
||||
|
||||
const int n_vocab = llama_n_vocab(lctx);
|
||||
vocab->id_to_token.resize(n_vocab);
|
||||
for (int i=0; i<n_vocab; ++i) {
|
||||
vocab->id_to_token[i].text = llama_token_get_text(lctx, i);
|
||||
vocab->id_to_token[i].score = llama_token_get_score(lctx, i);
|
||||
vocab->id_to_token[i].type = llama_token_get_type(lctx, i);
|
||||
vocab->token_to_id.emplace(vocab->id_to_token[i].text, i);
|
||||
}
|
||||
llama_free(lctx);
|
||||
llama_free_model(lmodel);
|
||||
} else { // assume llama2.c vocabulary
|
||||
#pragma message("TODO: implement reading vocabulary using gguf")
|
||||
// // heuristic to infer whether vocab is from ggml or from llama2.c vocabulary
|
||||
// if (is_ggml_file(filename)) {
|
||||
//
|
||||
// struct llama_context_params llama_params = llama_context_default_params();
|
||||
// llama_params.vocab_only = true;
|
||||
//
|
||||
// struct llama_model * lmodel = llama_load_model_from_file(filename, llama_params);
|
||||
// struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);
|
||||
//
|
||||
// const int n_vocab = llama_n_vocab(lctx);
|
||||
// vocab->id_to_token.resize(n_vocab);
|
||||
// for (int i=0; i<n_vocab; ++i) {
|
||||
// vocab->id_to_token[i].text = llama_token_get_text(lctx, i);
|
||||
// vocab->id_to_token[i].score = llama_token_get_score(lctx, i);
|
||||
// vocab->id_to_token[i].type = llama_token_get_type(lctx, i);
|
||||
// vocab->token_to_id.emplace(vocab->id_to_token[i].text, i);
|
||||
// }
|
||||
// llama_free(lctx);
|
||||
// llama_free_model(lmodel);
|
||||
// } else
|
||||
{ // assume llama2.c vocabulary
|
||||
printf("Assuming llama2.c vocabulary since %s is not a ggml file\n", filename);
|
||||
llama_file file(filename, "rb");
|
||||
const int n_vocab = config->vocab_size;
|
||||
@@ -538,6 +568,12 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab)
|
||||
float_t score = file.read_f32();
|
||||
uint32_t len = file.read_u32();
|
||||
std::string text = file.read_string(len);
|
||||
// Special-case handling of <0xXX> single byte tokens.
|
||||
char byte_val;
|
||||
if (sscanf(text.c_str(), "<0x%02hhX>", &byte_val) == 1) {
|
||||
char cstr[2] = { byte_val, 0 };
|
||||
text = cstr;
|
||||
}
|
||||
vocab->id_to_token[i].text = text;
|
||||
vocab->id_to_token[i].score = score;
|
||||
vocab->id_to_token[i].type = LLAMA_TOKEN_TYPE_UNDEFINED;
|
||||
@@ -589,83 +625,80 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
|
||||
}
|
||||
|
||||
#pragma message("TODO: implement file saving using gguf")
|
||||
(void) vocab;
|
||||
(void) model;
|
||||
(void) w;
|
||||
// // write_magic
|
||||
// file.write_u32(LLAMA_FILE_MAGIC); // magic
|
||||
// file.write_u32(LLAMA_FILE_VERSION); // version
|
||||
// // write_hparams
|
||||
// file.write_u32(model->hparams.n_vocab);
|
||||
// file.write_u32(model->hparams.n_embd);
|
||||
// file.write_u32(model->hparams.n_mult);
|
||||
// file.write_u32(model->hparams.n_head);
|
||||
// file.write_u32(model->hparams.n_layer);
|
||||
// file.write_u32(model->hparams.n_rot);
|
||||
// file.write_u32(LLAMA_FTYPE_ALL_F32);
|
||||
//
|
||||
// // write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk.
|
||||
// uint32_t n_vocab = model->hparams.n_vocab;
|
||||
// for (uint32_t i = 0; i < n_vocab; i++) {
|
||||
// const auto & token_data = vocab->id_to_token.at(i);
|
||||
// file.write_u32((uint32_t) token_data.tok.size());
|
||||
// file.write_raw(token_data.tok.data(), token_data.tok.size());
|
||||
// file.write_raw(&token_data.score, sizeof(token_data.score));
|
||||
// }
|
||||
//
|
||||
// // stuff AK weights into GG weights one by one.
|
||||
// // w->token_embedding_table -> model->tok_embeddings
|
||||
// // float* -> struct ggml_tensor
|
||||
// stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table);
|
||||
// stuff_karpathy_weights_into_gg(model->output, w->token_embedding_table);
|
||||
//
|
||||
// stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight);
|
||||
// //print_row(model->norm, 0);
|
||||
//
|
||||
// // for rms-att-weight
|
||||
// int row_length = model->hparams.n_embd;
|
||||
// const auto & hparams = model->hparams;
|
||||
// //int n_ff = model->hparams.n_embd;
|
||||
// int n_ff = get_n_ff(&hparams);
|
||||
//
|
||||
// for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
|
||||
// auto & layer = model->layers[i];
|
||||
// // 1d
|
||||
// stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
|
||||
// stuff_karpathy_weights_into_gg(layer.ffn_norm , &w->rms_ffn_weight[i*row_length]);
|
||||
//
|
||||
// // from 3d matrix layer x dim x dim to 2d matrix dim x dim
|
||||
// stuff_karpathy_weights_into_gg(layer.wq , &w->wq[i*row_length*row_length]);
|
||||
// stuff_karpathy_weights_into_gg(layer.wk , &w->wk[i*row_length*row_length]);
|
||||
// stuff_karpathy_weights_into_gg(layer.wv , &w->wv[i*row_length*row_length]);
|
||||
// stuff_karpathy_weights_into_gg(layer.wo , &w->wo[i*row_length*row_length]);
|
||||
//
|
||||
// stuff_karpathy_weights_into_gg(layer.w1 , &w->w1[i*row_length*n_ff]);
|
||||
// stuff_karpathy_weights_into_gg(layer.w2 , &w->w2[i*n_ff*row_length]);
|
||||
// stuff_karpathy_weights_into_gg(layer.w3 , &w->w3[i*row_length*n_ff]);
|
||||
// }
|
||||
// // write tensors
|
||||
// write_tensor(&file, model->tok_embeddings);
|
||||
// write_tensor(&file, model->norm);
|
||||
// write_tensor(&file, model->output); // ?
|
||||
// for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
|
||||
// auto & layer = model->layers[i];
|
||||
//
|
||||
// write_tensor(&file, layer.attention_norm);
|
||||
// write_tensor(&file, layer.wq);
|
||||
// write_tensor(&file, layer.wk);
|
||||
// write_tensor(&file, layer.wv);
|
||||
// write_tensor(&file, layer.wo);
|
||||
// write_tensor(&file, layer.ffn_norm);
|
||||
// write_tensor(&file, layer.w1);
|
||||
// write_tensor(&file, layer.w2);
|
||||
// write_tensor(&file, layer.w3);
|
||||
// }
|
||||
// write_magic
|
||||
file.write_u32(LLAMA_FILE_MAGIC_GGJT); // magic
|
||||
file.write_u32(LLAMA_FILE_VERSION_GGJT_V3); // version
|
||||
// write_hparams
|
||||
file.write_u32(model->hparams.n_vocab);
|
||||
file.write_u32(model->hparams.n_embd);
|
||||
file.write_u32(model->hparams.n_mult);
|
||||
file.write_u32(model->hparams.n_head);
|
||||
file.write_u32(model->hparams.n_layer);
|
||||
file.write_u32(model->hparams.n_rot);
|
||||
file.write_u32(LLAMA_FTYPE_ALL_F32);
|
||||
|
||||
// write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk.
|
||||
uint32_t n_vocab = model->hparams.n_vocab;
|
||||
for (uint32_t i = 0; i < n_vocab; i++) {
|
||||
const auto & token_data = vocab->id_to_token.at(i);
|
||||
file.write_u32((uint32_t) token_data.text.size());
|
||||
file.write_raw(token_data.text.data(), token_data.text.size());
|
||||
file.write_raw(&token_data.score, sizeof(token_data.score));
|
||||
}
|
||||
|
||||
// stuff AK weights into GG weights one by one.
|
||||
// w->token_embedding_table -> model->tok_embeddings
|
||||
// float* -> struct ggml_tensor
|
||||
stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table);
|
||||
stuff_karpathy_weights_into_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table);
|
||||
|
||||
stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight);
|
||||
//print_row(model->norm, 0);
|
||||
|
||||
// for rms-att-weight
|
||||
int row_length = model->hparams.n_embd;
|
||||
const auto & hparams = model->hparams;
|
||||
//int n_ff = model->hparams.n_embd;
|
||||
int n_ff = get_n_ff(&hparams);
|
||||
|
||||
for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
|
||||
auto & layer = model->layers[i];
|
||||
// 1d
|
||||
stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
|
||||
stuff_karpathy_weights_into_gg(layer.ffn_norm , &w->rms_ffn_weight[i*row_length]);
|
||||
|
||||
// from 3d matrix layer x dim x dim to 2d matrix dim x dim
|
||||
stuff_karpathy_weights_into_gg(layer.wq , &w->wq[i*row_length*row_length]);
|
||||
stuff_karpathy_weights_into_gg(layer.wk , &w->wk[i*row_length*row_length]);
|
||||
stuff_karpathy_weights_into_gg(layer.wv , &w->wv[i*row_length*row_length]);
|
||||
stuff_karpathy_weights_into_gg(layer.wo , &w->wo[i*row_length*row_length]);
|
||||
|
||||
stuff_karpathy_weights_into_gg(layer.w1 , &w->w1[i*row_length*n_ff]);
|
||||
stuff_karpathy_weights_into_gg(layer.w2 , &w->w2[i*n_ff*row_length]);
|
||||
stuff_karpathy_weights_into_gg(layer.w3 , &w->w3[i*row_length*n_ff]);
|
||||
}
|
||||
// write tensors
|
||||
write_tensor(&file, model->tok_embeddings);
|
||||
write_tensor(&file, model->norm);
|
||||
write_tensor(&file, model->output); // ?
|
||||
for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
|
||||
auto & layer = model->layers[i];
|
||||
|
||||
write_tensor(&file, layer.attention_norm);
|
||||
write_tensor(&file, layer.wq);
|
||||
write_tensor(&file, layer.wk);
|
||||
write_tensor(&file, layer.wv);
|
||||
write_tensor(&file, layer.wo);
|
||||
write_tensor(&file, layer.ffn_norm);
|
||||
write_tensor(&file, layer.w1);
|
||||
write_tensor(&file, layer.w2);
|
||||
write_tensor(&file, layer.w3);
|
||||
}
|
||||
}
|
||||
|
||||
struct train_params get_default_train_params() {
|
||||
struct train_params params;
|
||||
params.fn_vocab_model = "models/ggml-vocab.bin";
|
||||
params.fn_vocab_model = "tokenizer.bin";
|
||||
params.fn_llama2c_output_model = "ak_llama_model.bin";
|
||||
params.fn_train_data = "shakespeare.txt";
|
||||
params.fn_checkpoint_in = "checkpoint.bin";
|
||||
@@ -718,7 +751,7 @@ void print_usage(int /*argc*/, char ** argv, const struct train_params * params)
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "options:\n");
|
||||
fprintf(stderr, " -h, --help show this help message and exit\n");
|
||||
fprintf(stderr, " --copy-vocab-from-model FNAME llama2.c vocabulary or ggml model path from which to copy vocab (default '%s')\n", params->fn_vocab_model);
|
||||
fprintf(stderr, " --copy-vocab-from-model FNAME llama2.c vocabulary or ggmlv3 model path from which to copy vocab (default '%s')\n", params->fn_vocab_model);
|
||||
fprintf(stderr, " --llama2c-model FNAME [REQUIRED] model path from which to load Karpathy's llama2.c model\n");
|
||||
fprintf(stderr, " --llama2c-output-model FNAME model path to save the converted llama2.c model (default %s')\n", params->fn_llama2c_output_model);
|
||||
fprintf(stderr, "\n");
|
||||
@@ -791,9 +824,12 @@ int main(int argc, char ** argv) {
|
||||
if (!file) { printf("Unable to open the checkpoint file %s!\n", params.fn_llama2c_model); return 1; }
|
||||
// read in the config header
|
||||
if(fread(&config, sizeof(Config), 1, file) != 1) { return 1; }
|
||||
auto shared_weights = config.vocab_size > 0;
|
||||
config.vocab_size = abs(config.vocab_size);
|
||||
|
||||
// read in the Transformer weights
|
||||
malloc_weights(&weights, &config);
|
||||
if(checkpoint_init_weights(&weights, &config, file)) { return 1; }
|
||||
malloc_weights(&weights, &config, shared_weights);
|
||||
if(checkpoint_init_weights(&weights, &config, file, shared_weights)) { return 1; }
|
||||
fclose(file);
|
||||
}
|
||||
|
||||
|
||||
examples/embd-input/embd_input.py (Normal file → Executable file, 1 changed line)
@@ -1,3 +1,4 @@
#!/usr/bin/env python3
import ctypes
from ctypes import cdll, c_char_p, c_void_p, POINTER, c_float, c_int
import numpy as np
examples/embd-input/llava.py (Normal file → Executable file, 1 changed line)
@@ -1,3 +1,4 @@
#!/usr/bin/env python3
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))
examples/embd-input/minigpt4.py (Normal file → Executable file, 1 changed line)
@@ -1,3 +1,4 @@
#!/usr/bin/env python3
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))
examples/embd-input/panda_gpt.py (Normal file → Executable file, 1 changed line)
@@ -1,3 +1,4 @@
#!/usr/bin/env python3
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))
examples/jeopardy/graph.py (Normal file → Executable file, 1 changed line)
@@ -1,3 +1,4 @@
#!/usr/bin/env python3
import matplotlib.pyplot as plt
import os
import csv
examples/jeopardy/jeopardy.sh (Normal file → Executable file, mode change only)
examples/json-schema-to-grammar.py (Normal file → Executable file, 1 changed line)
@@ -1,3 +1,4 @@
#!/usr/bin/env python3
import argparse
import json
import re
@@ -288,6 +288,10 @@ These options help improve the performance and memory usage of the LLaMA models.

- `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs. **Note**: Restoring a cached prompt does not imply restoring the exact state of the session at the point it was saved. So even when specifying a specific seed, you are not guaranteed to get the same sequence of tokens as the original generation.

### Grammars

- `--grammar GRAMMAR`, `--grammar-file FILE`: Specify a grammar (defined inline or in a file) to constrain model output to a specific format. For example, you could force the model to output JSON or to speak only in emojis. See the [GBNF guide](../../grammars/README.md) for details on the syntax.

### Quantization

For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-data--run).
@@ -43,7 +43,7 @@ static bool is_interacting = false;
|
||||
void sigint_handler(int signo) {
|
||||
if (signo == SIGINT) {
|
||||
if (!is_interacting) {
|
||||
is_interacting=true;
|
||||
is_interacting = true;
|
||||
} else {
|
||||
console::cleanup();
|
||||
printf("\n");
|
||||
@@ -189,23 +189,30 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
}
|
||||
|
||||
const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
|
||||
|
||||
// tokenize the prompt
|
||||
std::vector<llama_token> embd_inp;
|
||||
if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
|
||||
embd_inp = ::llama_tokenize(ctx, params.prompt, true);
|
||||
embd_inp = ::llama_tokenize(ctx, params.prompt, is_spm);
|
||||
} else {
|
||||
embd_inp = session_tokens;
|
||||
}
|
||||
|
||||
// Should not run without any tokens
|
||||
if (embd_inp.empty()) {
|
||||
embd_inp.push_back(llama_token_bos(ctx));
|
||||
}
|
||||
|
||||
// Tokenize negative prompt
|
||||
std::vector<llama_token> guidance_inp;
|
||||
int guidance_offset = 0;
|
||||
int original_prompt_len = 0;
|
||||
if (ctx_guidance) {
|
||||
params.cfg_negative_prompt.insert(0, 1, ' ');
|
||||
guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, true);
|
||||
guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, is_spm);
|
||||
|
||||
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true);
|
||||
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, is_spm);
|
||||
original_prompt_len = original_inp.size();
|
||||
guidance_offset = (int)guidance_inp.size() - original_prompt_len;
|
||||
}
|
||||
@@ -252,8 +259,8 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// prefix & suffix for instruct mode
|
||||
const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true);
|
||||
const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false);
|
||||
const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", is_spm);
|
||||
const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false);
|
||||
|
||||
// in instruct mode, we inject a prefix and a suffix to each input by the user
|
||||
if (params.instruct) {
|
||||
|
||||
examples/make-ggml.py (Normal file → Executable file, 1 changed line)
@@ -1,3 +1,4 @@
#!/usr/bin/env python3
"""
This script converts Hugging Face llama models to GGML and quantizes them.
@@ -27,12 +27,136 @@ std::vector<float> softmax(const std::vector<float>& logits) {
|
||||
return probs;
|
||||
}
|
||||
|
||||
void perplexity(llama_context * ctx, const gpt_params & params) {
|
||||
void perplexity_v2(llama_context * ctx, const gpt_params & params) {
|
||||
// Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
|
||||
// Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
|
||||
// Output: `perplexity: 13.5106 [114/114]`
|
||||
// BOS tokens will be added for each chunk before eval
|
||||
auto tokens = ::llama_tokenize(ctx, params.prompt, true);
|
||||
|
||||
if (params.ppl_stride <= 0) {
|
||||
fprintf(stderr, "%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride);
|
||||
return;
|
||||
}
|
||||
|
||||
const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
|
||||
const bool add_bos = is_spm;
|
||||
|
||||
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
|
||||
|
||||
auto tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
|
||||
|
||||
const int calc_chunk = params.n_ctx;
|
||||
|
||||
fprintf(stderr, "%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk);
|
||||
|
||||
if (int(tokens.size()) <= calc_chunk) {
|
||||
fprintf(stderr, "%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__,
|
||||
tokens.size(), params.n_ctx, params.ppl_stride);
|
||||
return;
|
||||
}
|
||||
|
||||
const int n_chunk_max = (tokens.size() - calc_chunk + params.ppl_stride - 1) / params.ppl_stride;
|
||||
|
||||
const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
|
||||
const int n_vocab = llama_n_vocab(ctx);
|
||||
const int n_batch = params.n_batch;
|
||||
|
||||
int count = 0;
|
||||
double nll = 0.0;
|
||||
|
||||
fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
|
||||
|
||||
for (int i = 0; i < n_chunk; ++i) {
|
||||
const int start = i * params.ppl_stride;
|
||||
const int end = start + calc_chunk;
|
||||
|
||||
const int num_batches = (calc_chunk + n_batch - 1) / n_batch;
|
||||
//fprintf(stderr, "%s: evaluating %d...%d using %d batches\n", __func__, start, end, num_batches);
|
||||
|
||||
std::vector<float> logits;
|
||||
|
||||
const auto t_start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
for (int j = 0; j < num_batches; ++j) {
|
||||
const int batch_start = start + j * n_batch;
|
||||
const int batch_size = std::min(end - batch_start, n_batch);
|
||||
|
||||
//fprintf(stderr, " Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
|
||||
if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) {
|
||||
//fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
// save original token and restore it after eval
|
||||
const auto token_org = tokens[batch_start];
|
||||
|
||||
// add BOS token for the first batch of each chunk
|
||||
if (add_bos && j == 0) {
|
||||
tokens[batch_start] = llama_token_bos(ctx);
|
||||
}
|
||||
|
||||
const auto batch_logits = llama_get_logits(ctx);
|
||||
logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
|
||||
|
||||
if (j == 0) {
|
||||
tokens[batch_start] = token_org;
|
||||
}
|
||||
}
|
||||
|
||||
const auto t_end = std::chrono::high_resolution_clock::now();
|
||||
|
||||
if (i == 0) {
|
||||
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
|
||||
fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
|
||||
int total_seconds = (int)(t_total * n_chunk);
|
||||
if (total_seconds >= 60*60) {
|
||||
fprintf(stderr, "%d hours ", total_seconds / (60*60));
|
||||
total_seconds = total_seconds % (60*60);
|
||||
}
|
||||
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
|
||||
}
|
||||
|
||||
//fprintf(stderr, "%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
|
||||
for (int j = params.n_ctx - params.ppl_stride - 1; j < params.n_ctx - 1; ++j) {
|
||||
|
||||
// Calculate probability of next token, given the previous ones.
|
||||
const std::vector<float> tok_logits(
|
||||
logits.begin() + (j + 0) * n_vocab,
|
||||
logits.begin() + (j + 1) * n_vocab);
|
||||
|
||||
const float prob = softmax(tok_logits)[tokens[start + j + 1]];
|
||||
|
||||
nll += -std::log(prob);
|
||||
++count;
|
||||
}
|
||||
// perplexity is e^(average negative log-likelihood)
|
||||
if (params.ppl_output_type == 0) {
|
||||
printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
|
||||
} else {
|
||||
printf("%8d %.4lf\n", i*params.ppl_stride, std::exp(nll / count));
|
||||
}
|
||||
fflush(stdout);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
void perplexity(llama_context * ctx, const gpt_params & params) {
|
||||
if (params.ppl_stride > 0) {
|
||||
perplexity_v2(ctx, params);
|
||||
return;
|
||||
}
|
||||
|
||||
// Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
|
||||
// Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
|
||||
// Output: `perplexity: 13.5106 [114/114]`
|
||||
// BOS tokens will be added for each chunk before eval
|
||||
|
||||
const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
|
||||
const bool add_bos = is_spm;
|
||||
|
||||
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
|
||||
|
||||
auto tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
|
||||
|
||||
const int n_chunk_max = tokens.size() / params.n_ctx;
|
||||
|
||||
@@ -63,7 +187,7 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
|
||||
const auto token_org = tokens[batch_start];
|
||||
|
||||
// add BOS token for the first batch of each chunk
|
||||
if (j == 0) {
|
||||
if (add_bos && j == 0) {
|
||||
tokens[batch_start] = llama_token_bos(ctx);
|
||||
}
|
||||
|
||||
@@ -116,7 +240,11 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
|
||||
++count;
|
||||
}
|
||||
// perplexity is e^(average negative log-likelihood)
|
||||
printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
|
||||
if (params.ppl_output_type == 0) {
|
||||
printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
|
||||
} else {
|
||||
printf("%8d %.4lf\n", i*params.n_ctx, std::exp(nll / count));
|
||||
}
|
||||
fflush(stdout);
|
||||
}
|
||||
printf("\n");
|
||||
@@ -177,8 +305,10 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||
size_t hs_task_count = prompt_lines.size()/6;
|
||||
fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
|
||||
|
||||
const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
|
||||
|
||||
// This is needed as usual for LLaMA models
|
||||
bool prepend_bos = true;
|
||||
const bool add_bos = is_spm;
|
||||
|
||||
// Number of tasks to use when computing the score
|
||||
if ( params.hellaswag_tasks < hs_task_count ) {
|
||||
@@ -234,14 +364,13 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||
std::vector<float> tok_logits(n_vocab);
|
||||
|
||||
for (size_t task_idx = 0; task_idx < hs_task_count; task_idx++) {
|
||||
|
||||
// Tokenize the context to count tokens
|
||||
std::vector<int> context_embd = ::llama_tokenize(ctx, hs_data[task_idx].context, prepend_bos);
|
||||
std::vector<int> context_embd = ::llama_tokenize(ctx, hs_data[task_idx].context, add_bos);
|
||||
size_t context_size = context_embd.size();
|
||||
|
||||
// Do the 1st ending
|
||||
// In this case we include the context when evaluating
|
||||
auto query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[0], prepend_bos);
|
||||
auto query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[0], add_bos);
|
||||
auto query_size = query_embd.size();
|
||||
//printf("First query: %d\n",(int)query_size);
|
||||
|
||||
@@ -369,6 +498,12 @@ int main(int argc, char ** argv) {
|
||||
params.perplexity = true;
|
||||
params.n_batch = std::min(params.n_batch, params.n_ctx);
|
||||
|
||||
if (params.ppl_stride > 0) {
|
||||
fprintf(stderr, "Will perform strided perplexity calculation -> adjusting context size from %d to %d\n",
|
||||
params.n_ctx, params.n_ctx + params.ppl_stride/2);
|
||||
params.n_ctx += params.ppl_stride/2;
|
||||
}
|
||||
|
||||
if (params.n_ctx > 2048) {
|
||||
fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
|
||||
"expect poor results\n", __func__, params.n_ctx);
|
||||
|
||||
@@ -14,25 +14,25 @@ struct quant_option {
};

static const std::vector<struct quant_option> QUANT_OPTIONS = {
    { "Q4_0",   LLAMA_FTYPE_MOSTLY_Q4_0,   " 3.50G, +0.2499 ppl @ 7B", },
    { "Q4_1",   LLAMA_FTYPE_MOSTLY_Q4_1,   " 3.90G, +0.1846 ppl @ 7B", },
    { "Q5_0",   LLAMA_FTYPE_MOSTLY_Q5_0,   " 4.30G, +0.0796 ppl @ 7B", },
    { "Q5_1",   LLAMA_FTYPE_MOSTLY_Q5_1,   " 4.70G, +0.0415 ppl @ 7B", },
    { "Q4_0",   LLAMA_FTYPE_MOSTLY_Q4_0,   " 3.56G, +0.2166 ppl @ LLaMA-v1-7B", },
    { "Q4_1",   LLAMA_FTYPE_MOSTLY_Q4_1,   " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", },
    { "Q5_0",   LLAMA_FTYPE_MOSTLY_Q5_0,   " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", },
    { "Q5_1",   LLAMA_FTYPE_MOSTLY_Q5_1,   " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
#ifdef GGML_USE_K_QUANTS
    { "Q2_K",   LLAMA_FTYPE_MOSTLY_Q2_K,   " 2.67G, +0.8698 ppl @ 7B", },
    { "Q2_K",   LLAMA_FTYPE_MOSTLY_Q2_K,   " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
    { "Q3_K",   LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
    { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5505 ppl @ 7B", },
    { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.06G, +0.2437 ppl @ 7B", },
    { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1803 ppl @ 7B", },
    { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
    { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", },
    { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", },
    { "Q4_K",   LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", },
    { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.56G, +0.1149 ppl @ 7B", },
    { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0535 ppl @ 7B", },
    { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.59G, +0.0992 ppl @ LLaMA-v1-7B", },
    { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0532 ppl @ LLaMA-v1-7B", },
    { "Q5_K",   LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", },
    { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0353 ppl @ 7B", },
    { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0142 ppl @ 7B", },
    { "Q6_K",   LLAMA_FTYPE_MOSTLY_Q6_K,   " 5.15G, +0.0044 ppl @ 7B", },
    { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0400 ppl @ LLaMA-v1-7B", },
    { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
    { "Q6_K",   LLAMA_FTYPE_MOSTLY_Q6_K,   " 5.15G, -0.0008 ppl @ LLaMA-v1-7B", },
#endif
    { "Q8_0",   LLAMA_FTYPE_MOSTLY_Q8_0,   " 6.70G, +0.0004 ppl @ 7B", },
    { "Q8_0",   LLAMA_FTYPE_MOSTLY_Q8_0,   " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
    { "F16",    LLAMA_FTYPE_MOSTLY_F16,    "13.00G              @ 7B", },
    { "F32",    LLAMA_FTYPE_ALL_F32,       "26.00G            @ 7B", },
};
@@ -1,4 +1,3 @@

#!/bin/bash

cd `dirname $0`
0  examples/server-llama2-13B.sh  (Normal file → Executable file)
@@ -126,7 +126,7 @@ node .

`stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.

`prompt`: Provide a prompt. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluated. A space is inserted in the front like main.cpp does.
`prompt`: Provide a prompt as a string, or as an array of strings and numbers representing tokens. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluated. If the prompt is a string, or an array with the first element given as a string, a space is inserted in the front like main.cpp does.

`stop`: Specify a JSON array of stopping strings.
These words will not be included in the completion, so make sure to add them to the prompt for the next iteration (default: []).
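Not part of the diff above: a minimal sketch of what a request using the extended `prompt` field might look like. The host, port, and `/completion` endpoint path are assumptions (they are not shown in this changeset), and the numeric token ids are placeholders for illustration only.

```python
# Sketch only: assumes a llama.cpp server at localhost:8080 exposing /completion.
import requests

body = {
    # A string first, then raw token ids (the ids below are made-up placeholders).
    "prompt": ["Building a website can be done in", 10, 2],
    "n_predict": 32,
    "stop": ["\n"],
    "stream": False,
}

resp = requests.post("http://localhost:8080/completion", json=body)
print(resp.json())
```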
@@ -1,3 +1,4 @@
#!/usr/bin/env python3
import argparse
from flask import Flask, jsonify, request, Response
import urllib.parse
0  examples/server/chat-llama2.sh  (Normal file → Executable file)
0  examples/server/chat.sh  (Normal file → Executable file)
@@ -190,6 +190,7 @@ struct llama_server_context
    size_t n_past = 0;
    size_t n_remain = 0;

    json prompt;
    std::vector<llama_token> embd;
    std::vector<llama_token> last_n_tokens;
@@ -267,6 +268,53 @@ struct llama_server_context
        return true;
    }

    std::vector<llama_token> tokenize(json json_prompt, bool add_bos)
    {
        // If `add_bos` is true, we only add BOS, when json_prompt is a string,
        // or the first element of the json_prompt array is a string.
        std::vector<llama_token> prompt_tokens;

        if (json_prompt.is_array())
        {
            bool first = true;
            for (const auto& p : json_prompt)
            {
                if (p.is_string())
                {
                    auto s = p.template get<std::string>();
                    std::vector<llama_token> p;
                    if (first)
                    {
                        s.insert(0, 1, ' '); // add a space if it's the first
                        p = ::llama_tokenize(ctx, s, add_bos);
                        first = false;
                    }
                    else
                    {
                        p = ::llama_tokenize(ctx, s, false);
                    }
                    prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
                }
                else
                {
                    if (first)
                    {
                        first = false;
                    }
                    prompt_tokens.push_back(p.template get<llama_token>());
                }
            }
        }
        else
        {
            auto s = json_prompt.template get<std::string>();
            s.insert(0, 1, ' '); // always add a first space
            prompt_tokens = ::llama_tokenize(ctx, s, add_bos);
        }

        return prompt_tokens;
    }

    bool loadGrammar()
    {
        if (!params.grammar.empty()) {
@@ -294,8 +342,8 @@ struct llama_server_context

    void loadPrompt()
    {
        params.prompt.insert(0, 1, ' '); // always add a first space
        std::vector<llama_token> prompt_tokens = ::llama_tokenize(ctx, params.prompt, true);
        auto prompt_tokens = tokenize(prompt, true);  // always add BOS

        num_prompt_tokens = prompt_tokens.size();

        if (params.n_keep < 0)
@@ -1016,7 +1064,7 @@ static json format_final_response(llama_server_context &llama, const std::string
    {"tokens_predicted", llama.num_tokens_predicted},
    {"tokens_evaluated", llama.num_prompt_tokens},
    {"generation_settings", format_generation_settings(llama)},
    {"prompt", llama.params.prompt},
    {"prompt", llama.prompt},
    {"truncated", llama.truncated},
    {"stopped_eos", llama.stopped_eos},
    {"stopped_word", llama.stopped_word},
@@ -1085,10 +1133,18 @@ static void parse_options_completion(const json &body, llama_server_context &lla
    llama.params.penalize_nl = json_value(body, "penalize_nl", default_params.penalize_nl);
    llama.params.n_keep = json_value(body, "n_keep", default_params.n_keep);
    llama.params.seed = json_value(body, "seed", default_params.seed);
    llama.params.prompt = json_value(body, "prompt", default_params.prompt);
    llama.params.grammar = json_value(body, "grammar", default_params.grammar);
    llama.params.n_probs = json_value(body, "n_probs", default_params.n_probs);

    if (body.count("prompt") != 0)
    {
        llama.prompt = body["prompt"];
    }
    else
    {
        llama.prompt = "";
    }

    llama.params.logit_bias.clear();
    if (json_value(body, "ignore_eos", false))
    {
@@ -1345,8 +1401,11 @@ int main(int argc, char **argv)
    auto lock = llama.lock();

    const json body = json::parse(req.body);
    const std::string content = json_value<std::string>(body, "content", "");
    const std::vector<llama_token> tokens = llama_tokenize(llama.ctx, content, false);
    std::vector<llama_token> tokens;
    if (body.count("content") != 0)
    {
        tokens = llama.tokenize(body["content"], false);
    }
    const json data = format_tokenizer_response(tokens);
    return res.set_content(data.dump(), "application/json"); });
@@ -1358,7 +1417,14 @@ int main(int argc, char **argv)

    llama.rewind();
    llama_reset_timings(llama.ctx);
    llama.params.prompt = json_value<std::string>(body, "content", "");
    if (body.count("content") != 0)
    {
        llama.prompt = body["content"];
    }
    else
    {
        llama.prompt = "";
    }
    llama.params.n_predict = 0;
    llama.loadPrompt();
    llama.beginCompletion();
@@ -238,7 +238,7 @@ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_t
    alloc->n_free_blocks++;
}

void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n) {
void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n) {
    int pos = 0;
    for (int i = 0; i < n; i++) {
        if (list[i] != -1) {
@@ -547,7 +547,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
    struct ggml_tensor * view_src = get_view_source(parent);
    struct hash_node * view_src_hn = hash_get(ht, view_src);
    view_src_hn->n_views -= 1;
    AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src->n_children, view_src->n_views);
    AT_PRINTF("view_src %s\n", view_src->name);
    if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
        ggml_allocator_free_tensor(alloc, view_src);
    }
@@ -12,7 +12,7 @@ GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);

// tell the allocator to parse nodes following the order described in the list
// you should call this if your graph are optimized to execute out-of-order
GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n);
GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n);

GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
29  ggml-cuda.cu
@@ -3907,6 +3907,29 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
    dst[i + 1] = x0*sin_theta + x1*cos_theta;
}

// TODO: this implementation is wrong!
//static __global__ void rope_neox_f32(const float * x, float * dst, const int ncols, const float p0,
//    const float p_delta, const int p_delta_rows, const float theta_scale) {
//    const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
//
//    if (col >= ncols) {
//        return;
//    }
//
//    const int row = blockDim.x*blockIdx.x + threadIdx.x;
//    const int i = row*ncols + col/2;
//
//    const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
//    const float sin_theta = sinf(theta);
//    const float cos_theta = cosf(theta);
//
//    const float x0 = x[i + 0];
//    const float x1 = x[i + ncols/2];
//
//    dst[i + 0] = x0*cos_theta - x1*sin_theta;
//    dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
//}

static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
    const int col = blockDim.x*blockIdx.x + threadIdx.x;
    const int half_n_dims = ncols/4;
@@ -5515,7 +5538,8 @@ inline void ggml_cuda_op_rope(

    const float theta_scale = powf(freq_base, -2.0f/n_dims);

    const bool is_glm = mode & 4;
    const bool is_neox = mode & 2;
    const bool is_glm = mode & 4;

    // compute
    if (is_glm) {
@@ -5523,6 +5547,9 @@ inline void ggml_cuda_op_rope(
        const float id_p = min(p, n_ctx - 2.f);
        const float block_p = max(p - (n_ctx - 2.f), 0.f);
        rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
    } else if (is_neox) {
        GGML_ASSERT(false && "RoPE NeoX not implemented yet");
#pragma message("TODO: implement RoPE NeoX for CUDA")
    } else {
        const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
        rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
132  ggml-metal.m
@@ -167,7 +167,9 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
#define GGML_METAL_ADD_KERNEL(name) \
    ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
    ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
    fprintf(stderr, "%s: loaded %-32s %16p\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name); \
    fprintf(stderr, "%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \
            (int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \
            (int) ctx->pipeline_##name.threadExecutionWidth); \
    if (error) { \
        fprintf(stderr, "%s: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
        return NULL; \
@@ -218,12 +220,12 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
#undef GGML_METAL_ADD_KERNEL
    }

    fprintf(stderr, "%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
    fprintf(stderr, "%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
    fprintf(stderr, "%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
    fprintf(stderr, "%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
    if (ctx->device.maxTransferRate != 0) {
        fprintf(stderr, "%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
        fprintf(stderr, "%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
    } else {
        fprintf(stderr, "%s: maxTransferRate = built-in GPU\n", __func__);
        fprintf(stderr, "%s: maxTransferRate = built-in GPU\n", __func__);
    }

    return ctx;
@@ -537,8 +539,8 @@ void ggml_metal_graph_compute(

    id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];

    const int node_start = (cb_idx + 0) * n_nodes_per_cb;
    const int node_end = (cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb;
    const int node_start = (cb_idx + 0) * n_nodes_per_cb;
    const int node_end = MIN((cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb, n_nodes);

    for (int ind = node_start; ind < node_end; ++ind) {
        const int i = has_concur ? ctx->concur_list[ind] : ind;
@@ -744,32 +746,31 @@ void ggml_metal_graph_compute(
    [ctx->device supportsFamily:MTLGPUFamilyApple7] &&
    ne00%32 == 0 &&
    ne11 > 1) {
    switch (src0->type) {
        case GGML_TYPE_F16:  [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32]; break;
        case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_0_f32]; break;
        case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_1_f32]; break;
        case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q2_K_f32]; break;
        case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q3_K_f32]; break;
        case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_K_f32]; break;
        case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q5_K_f32]; break;
        case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q6_K_f32]; break;
        default: GGML_ASSERT(false && "MUL MAT-MAT not implemented");
    }
    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
    [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
    [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
    [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
    [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
    [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:5];
    [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:6];
    [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:7];
    [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:8];
    [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:9];
    [encoder setBytes:&gqa length:sizeof(gqa) atIndex:10];
    [encoder setThreadgroupMemoryLength:8192 atIndex:0];
    [encoder dispatchThreadgroups:MTLSizeMake( (ne11+31)/32, (ne01+63) / 64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
    switch (src0->type) {
        case GGML_TYPE_F16:  [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32]; break;
        case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_0_f32]; break;
        case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_1_f32]; break;
        case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q2_K_f32]; break;
        case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q3_K_f32]; break;
        case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_K_f32]; break;
        case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q5_K_f32]; break;
        case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q6_K_f32]; break;
        default: GGML_ASSERT(false && "MUL MAT-MAT not implemented");
    }
    else {
    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
    [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
    [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
    [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
    [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
    [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:5];
    [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:6];
    [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:7];
    [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:8];
    [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:9];
    [encoder setBytes:&gqa length:sizeof(gqa) atIndex:10];
    [encoder setThreadgroupMemoryLength:8192 atIndex:0];
    [encoder dispatchThreadgroups:MTLSizeMake( (ne11+31)/32, (ne01+63) / 64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
} else {
    int nth0 = 32;
    int nth1 = 1;
@@ -868,24 +869,24 @@ void ggml_metal_graph_compute(
    [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14];
    [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:15];
    [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:16];
    [encoder setBytes:&gqa length:sizeof(gqa) atIndex:17];
    [encoder setBytes:&gqa length:sizeof(gqa) atIndex:17];

    if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
        src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
    }
    else if (src0t == GGML_TYPE_Q3_K) {
#ifdef GGML_QKK_64
        [encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
#else
        [encoder dispatchThreadgroups:MTLSizeMake((ne01+3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
#endif
    }
    else if (src0t == GGML_TYPE_Q5_K) {
        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3) / 4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
    }
    else if (src0t == GGML_TYPE_Q6_K) {
        [encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
    } else {
        [encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0];
        [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
@@ -938,16 +939,17 @@ void ggml_metal_graph_compute(
    } break;
case GGML_OP_NORM:
    {
        const float eps = 1e-5f;
        float eps;
        memcpy(&eps, dst->op_params, sizeof(float));

        const int nth = 256;

        [encoder setComputePipelineState:ctx->pipeline_norm];
        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
        [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
        [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
        [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
        [encoder setBytes:&eps length:sizeof( float) atIndex:4];
        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
        [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
        [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
        [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
        [encoder setBytes:&eps length:sizeof( float) atIndex:4];
        [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];

        const int64_t nrows = ggml_nrows(src0);
@@ -990,7 +992,9 @@ void ggml_metal_graph_compute(
    [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16];
    [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17];
    [encoder setBytes:&m0 length:sizeof( float) atIndex:18];

    const int nth = 32;

    [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break;
case GGML_OP_ROPE:
@@ -1005,8 +1009,8 @@ void ggml_metal_graph_compute(
    memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));

    [encoder setComputePipelineState:ctx->pipeline_rope];
    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
    [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
    [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
    [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
    [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
    [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
@@ -1057,24 +1061,24 @@ void ggml_metal_graph_compute(
    default: GGML_ASSERT(false && "not implemented");
}

[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
[encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
[encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10];
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11];
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12];
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13];
[encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14];
[encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15];
[encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16];
[encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
[encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
[encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10];
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11];
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12];
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13];
[encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14];
[encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15];
[encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16];
[encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17];

[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break;
@@ -87,7 +87,12 @@ kernel void kernel_gelu(
    device float * dst,
    uint tpig[[thread_position_in_grid]]) {
    float x = src0[tpig];
    dst[tpig] = 0.5f*x*(1.0f + tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));

    // BEWARE !!!
    // Simply using "tanh" instead of "precise::tanh" will sometimes results in NaNs!
    // This was observed with Falcon 7B and 40B models
    //
    dst[tpig] = 0.5f*x*(1.0f + precise::tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
}

kernel void kernel_soft_max(
@@ -571,7 +576,25 @@ kernel void kernel_rope(
        dst_data[1] = x0*sin_theta + x1*cos_theta;
    }
} else {
    // TODO: implement
    for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
        for (int64_t ic = 0; ic < n_dims; ic += 2) {
            const float cos_theta = cos(theta);
            const float sin_theta = sin(theta);

            theta *= theta_scale;

            const int64_t i0 = ib*n_dims + ic/2;

            device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
            device float * dst_data = (device float *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);

            const float x0 = src[0];
            const float x1 = src[n_dims/2];

            dst_data[0] = x0*cos_theta - x1*sin_theta;
            dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
        }
    }
}
}
30  ggml.c
@@ -3554,9 +3554,9 @@ inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) {
inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }

static const float GELU_COEF_A = 0.044715f;
static const float GELU_QUICK_COEF = -1.702f;
static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
static const float GELU_COEF_A = 0.044715f;
static const float GELU_QUICK_COEF = -1.702f;
static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;

inline static float ggml_gelu_f32(float x) {
    return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
@@ -5555,10 +5555,6 @@ struct ggml_tensor * ggml_repeat(
        is_node = true;
    }

    if (ggml_are_same_shape(a, b) && !is_node) {
        return a;
    }

    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne);

    result->op = GGML_OP_REPEAT;
@@ -5789,6 +5785,7 @@ struct ggml_tensor * ggml_silu_back(
static struct ggml_tensor * ggml_norm_impl(
    struct ggml_context * ctx,
    struct ggml_tensor * a,
    float eps,
    bool inplace) {
    bool is_node = false;
@@ -5799,7 +5796,7 @@ static struct ggml_tensor * ggml_norm_impl(

    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

    // TODO: maybe store epsilon here?
    ggml_set_op_params(result, &eps, sizeof(eps));

    result->op = GGML_OP_NORM;
    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5810,14 +5807,16 @@ static struct ggml_tensor * ggml_norm_impl(

struct ggml_tensor * ggml_norm(
    struct ggml_context * ctx,
    struct ggml_tensor * a) {
    return ggml_norm_impl(ctx, a, false);
    struct ggml_tensor * a,
    float eps) {
    return ggml_norm_impl(ctx, a, eps, false);
}

struct ggml_tensor * ggml_norm_inplace(
    struct ggml_context * ctx,
    struct ggml_tensor * a) {
    return ggml_norm_impl(ctx, a, true);
    struct ggml_tensor * a,
    float eps) {
    return ggml_norm_impl(ctx, a, eps, true);
}

// ggml_rms_norm
@@ -10619,7 +10618,8 @@ static void ggml_compute_forward_norm_f32(

    GGML_TENSOR_UNARY_OP_LOCALS;

    const float eps = 1e-5f; // TODO: make this a parameter
    float eps;
    memcpy(&eps, dst->op_params, sizeof(float));

    // TODO: optimize
    for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -12537,7 +12537,7 @@ static void ggml_compute_forward_rope_f32(
        dst_data[1] = x0*sin_theta*zeta + x1*cos_theta*zeta;
    }
} else {
    // TODO: this is probably wrong, but I can't figure it out ..
    // TODO: this might be wrong for ne0 != n_dims - need double check
    // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
    for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
        for (int64_t ic = 0; ic < n_dims; ic += 2) {
@@ -12666,7 +12666,7 @@ static void ggml_compute_forward_rope_f16(
        dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
    }
} else {
    // TODO: this is probably wrong, but I can't figure it out ..
    // TODO: this might be wrong for ne0 != n_dims - need double check
    // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
    for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
        for (int64_t ic = 0; ic < n_dims; ic += 2) {
7  ggml.h
@@ -909,14 +909,15 @@ extern "C" {
    struct ggml_tensor * b);

// normalize along rows
// TODO: eps is hardcoded to 1e-5 for now
GGML_API struct ggml_tensor * ggml_norm(
    struct ggml_context * ctx,
    struct ggml_tensor * a);
    struct ggml_tensor * a,
    float eps);

GGML_API struct ggml_tensor * ggml_norm_inplace(
    struct ggml_context * ctx,
    struct ggml_tensor * a);
    struct ggml_tensor * a,
    float eps);

GGML_API struct ggml_tensor * ggml_rms_norm(
    struct ggml_context * ctx,
27  gguf.py  (Normal file → Executable file)
@@ -1,3 +1,4 @@
#!/usr/bin/env python3
import shutil
import sys
import struct
@@ -29,12 +30,12 @@ KEY_GENERAL_SOURCE_HF_REPO = "general.source.hugginface.repository"
KEY_GENERAL_FILE_TYPE = "general.file_type"

# LLM
KEY_LLM_CONTEXT_LENGTH = "{arch}.context_length"
KEY_LLM_EMBEDDING_LENGTH = "{arch}.embedding_length"
KEY_LLM_BLOCK_COUNT = "{arch}.block_count"
KEY_LLM_FEED_FORWARD_LENGTH = "{arch}.feed_forward_length"
KEY_LLM_USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
KEY_LLM_TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"
KEY_CONTEXT_LENGTH = "{arch}.context_length"
KEY_EMBEDDING_LENGTH = "{arch}.embedding_length"
KEY_BLOCK_COUNT = "{arch}.block_count"
KEY_FEED_FORWARD_LENGTH = "{arch}.feed_forward_length"
KEY_USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
KEY_TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"

# attention
KEY_ATTENTION_HEAD_COUNT = "{arch}.attention.head_count"
@@ -582,7 +583,7 @@ class GGUFWriter:
        self.add_string(KEY_GENERAL_AUTHOR, author)

    def add_tensor_data_layout(self, layout: str):
        self.add_string(KEY_LLM_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
        self.add_string(KEY_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)

    def add_url(self, url: str):
        self.add_string(KEY_GENERAL_URL, url)
@@ -612,27 +613,27 @@ class GGUFWriter:

    def add_context_length(self, length: int):
        self.add_uint32(
            KEY_LLM_CONTEXT_LENGTH.format(arch=self.arch), length)
            KEY_CONTEXT_LENGTH.format(arch=self.arch), length)

    def add_embedding_length(self, length: int):
        self.add_uint32(
            KEY_LLM_EMBEDDING_LENGTH.format(arch=self.arch), length)
            KEY_EMBEDDING_LENGTH.format(arch=self.arch), length)

    def add_block_count(self, length: int):
        self.add_uint32(
            KEY_LLM_BLOCK_COUNT.format(arch=self.arch), length)
            KEY_BLOCK_COUNT.format(arch=self.arch), length)

    def add_feed_forward_length(self, length: int):
        self.add_uint32(
            KEY_LLM_FEED_FORWARD_LENGTH.format(arch=self.arch), length)
            KEY_FEED_FORWARD_LENGTH.format(arch=self.arch), length)

    def add_parallel_residual(self, use: bool):
        self.add_bool(
            KEY_LLM_USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)
            KEY_USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)

    def add_tensor_data_layout(self, layout: str):
        self.add_string(
            KEY_LLM_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
            KEY_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)

    def add_head_count(self, count: int):
        self.add_uint32(
91  grammars/README.md  (Normal file)
@@ -0,0 +1,91 @@
# GBNF Guide

GBNF (GGML BNF) is a format for defining [formal grammars](https://en.wikipedia.org/wiki/Formal_grammar) to constrain model outputs in `llama.cpp`. For example, you can use it to force the model to generate valid JSON, or speak only in emojis. GBNF grammars are supported in various ways in `examples/main` and `examples/server`.

## Background

[Backus-Naur Form (BNF)](https://en.wikipedia.org/wiki/Backus%E2%80%93Naur_form) is a notation for describing the syntax of formal languages like programming languages, file formats, and protocols. GBNF is an extension of BNF that primarily adds a few modern regex-like features.

## Basics

In GBNF, we define *production rules* that specify how a *non-terminal* (rule name) can be replaced with sequences of *terminals* (characters, specifically Unicode [code points](https://en.wikipedia.org/wiki/Code_point)) and other non-terminals. The basic format of a production rule is `nonterminal ::= sequence...`.

## Example

Before going deeper, let's look at some of the features demonstrated in `grammars/chess.gbnf`, a small chess notation grammar:
```
# `root` specifies the pattern for the overall output
root ::= (
    # it must start with the characters "1. " followed by a sequence
    # of characters that match the `move` rule, followed by a space, followed
    # by another move, and then a newline
    "1. " move " " move "\n"

    # it's followed by one or more subsequent moves, numbered with one or two digits
    ([1-9] [0-9]? ". " move " " move "\n")+
)

# `move` is an abstract representation, which can be a pawn, nonpawn, or castle.
# The `[+#]?` denotes the possibility of checking or mate signs after moves
move ::= (pawn | nonpawn | castle) [+#]?

pawn ::= ...
nonpawn ::= ...
castle ::= ...
```

## Non-Terminals and Terminals

Non-terminal symbols (rule names) stand for a pattern of terminals and other non-terminals. They are required to be a dashed lowercase word, like `move`, `castle`, or `check-mate`.

Terminals are actual characters ([code points](https://en.wikipedia.org/wiki/Code_point)). They can be specified as a sequence like `"1"` or `"O-O"` or as ranges like `[1-9]` or `[NBKQR]`.

## Characters and character ranges

Terminals support the full range of Unicode. Unicode characters can be specified directly in the grammar, for example `hiragana ::= [ぁ-ゟ]`, or with escapes: 8-bit (`\xXX`), 16-bit (`\uXXXX`) or 32-bit (`\UXXXXXXXX`).

Character ranges can be negated with `^`:
```
single-line ::= [^\n]+ "\n"
```

## Sequences and Alternatives

The order of symbols in a sequence matters. For example, in `"1. " move " " move "\n"`, the `"1. "` must come before the first `move`, etc.

Alternatives, denoted by `|`, give different sequences that are acceptable. For example, in `move ::= pawn | nonpawn | castle`, `move` can be a `pawn` move, a `nonpawn` move, or a `castle`.

Parentheses `()` can be used to group sequences, which allows for embedding alternatives in a larger rule or applying repetition and optional symbols (below) to a sequence.

## Repetition and Optional Symbols

- `*` after a symbol or sequence means that it can be repeated zero or more times.
- `+` denotes that the symbol or sequence should appear one or more times.
- `?` makes the preceding symbol or sequence optional.

## Comments and newlines

Comments can be specified with `#`:
```
# defines optional whitespace
ws ::= [ \t\n]+
```

Newlines are allowed between rules and between symbols or sequences nested inside parentheses. Additionally, a newline after an alternate marker `|` will continue the current rule, even outside of parentheses.

## The root rule

In a full grammar, the `root` rule always defines the starting point of the grammar. In other words, it specifies what the entire output must match.

```
# a grammar for lists
root ::= ("- " item)+
item ::= [^\n]+ "\n"
```

## Next steps

This guide provides a brief overview. Check out the GBNF files in this directory (`grammars/`) for examples of full grammars. You can try them out with:
```
./main -m <model> --grammar-file grammars/some-grammar.gbnf -p 'Some prompt'
```
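Not part of the commit above: a small sketch of exercising a grammar through the llama.cpp server, whose completion handler in this changeset reads a `grammar` field from the request body. The host, port, and `/completion` endpoint path are assumptions made for illustration only.

```python
# Sketch only: assumes a running llama.cpp server at localhost:8080 with a /completion endpoint.
import requests

# A tiny GBNF grammar constraining the reply to "yes" or "no".
grammar = 'root ::= "yes" | "no"'

body = {
    "prompt": "Is the sky blue on a clear day? Answer:",
    "grammar": grammar,
    "n_predict": 4,
}

print(requests.post("http://localhost:8080/completion", json=body).json())
```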
15  llama.h
@@ -247,6 +247,8 @@ extern "C" {
    LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
    LLAMA_API int llama_n_embd (const struct llama_context * ctx);

    LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx);

    LLAMA_API int llama_model_n_vocab(const struct llama_model * model);
    LLAMA_API int llama_model_n_ctx (const struct llama_model * model);
    LLAMA_API int llama_model_n_embd (const struct llama_model * model);
@@ -368,13 +370,6 @@ extern "C" {
        int n_max_tokens,
        bool add_bos);

    LLAMA_API int llama_tokenize_bpe(
        struct llama_context * ctx,
        const char * text,
        llama_token * tokens,
        int n_max_tokens,
        bool add_bos);

    LLAMA_API int llama_tokenize_with_model(
        const struct llama_model * model,
        const char * text,
@@ -390,12 +385,6 @@ extern "C" {
        char * buf,
        int length);

    LLAMA_API int llama_token_to_str_bpe(
        const struct llama_context * ctx,
        llama_token token,
        char * buf,
        int length);

    LLAMA_API int llama_token_to_str_with_model(
        const struct llama_model * model,
        llama_token token,
0  scripts/get-wikitext-2.sh  (Normal file → Executable file)
@@ -28,7 +28,8 @@ llama_build_and_test_executable(test-sampling.cpp)
llama_build_executable(test-tokenizer-0.cpp)
llama_test_executable (test-tokenizer-0.llama test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
llama_build_executable(test-tokenizer-1.cpp)
llama_test_executable (test-tokenizer-1.llama test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
# test-tokenizer-1 requires a BPE vocab. re-enable when we have one.
#llama_test_executable (test-tokenizer-1.llama test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
#llama_test_executable(test-tokenizer-1.aquila test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
llama_build_and_test_executable(test-grammar-parser.cpp)
llama_build_and_test_executable(test-llama-grammar.cpp)
@@ -67,11 +67,13 @@ int main(int argc, char **argv) {
        }
    }

    GGML_ASSERT(llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_BPE);

    const int n_vocab = llama_n_vocab(ctx);

    for (int i = 0; i < n_vocab; ++i) {
        std::string forward = llama_token_to_str_bpe(ctx, i);
        std::vector<llama_token> tokens = llama_tokenize_bpe(ctx, forward, false);
        std::string forward = llama_token_to_str(ctx, i);
        std::vector<llama_token> tokens = llama_tokenize(ctx, forward, false);
        if (tokens.size() == 1) {
            if (i != tokens[0]) {
                std::string backward = llama_token_to_str(ctx, tokens[0]);
@@ -79,16 +81,6 @@ int main(int argc, char **argv) {
                    __func__, i, llama_token_to_str(ctx, i).c_str(), tokens[0], backward.c_str());
                return 2;
            }
        } else {
            llama_token_type type = llama_token_get_type(ctx, i);
            if (type == LLAMA_TOKEN_TYPE_UNKNOWN || type == LLAMA_TOKEN_TYPE_CONTROL || type == LLAMA_TOKEN_TYPE_BYTE) {
                fprintf(stderr, "%s : info: token %d is string %s and bpe returns tokens %s\n",
                    __func__, i, llama_token_to_str(ctx, i).c_str(), unescape_whitespace(ctx, tokens).c_str());
            } else {
                fprintf(stderr, "%s : error: token %d is string %s but bpe returns tokens %s\n",
                    __func__, i, llama_token_to_str(ctx, i).c_str(), unescape_whitespace(ctx, tokens).c_str());
                return 2;
            }
        }
    }