mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-02-26 14:23:22 +02:00
Compare commits
10 Commits
master-50f
...
master-074
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
074bea2eb1 | ||
|
|
5cb63e2493 | ||
|
|
da5303c1ea | ||
|
|
4545539d71 | ||
|
|
edeba28366 | ||
|
|
5c19c70ba6 | ||
|
|
24568371ae | ||
|
|
7392f1cd2c | ||
|
|
ad5fd5b60c | ||
|
|
368d0c8a9e |
@@ -6,7 +6,7 @@ RUN apt-get update && \
|
||||
apt-get install -y build-essential python3 python3-pip
|
||||
|
||||
RUN pip install --upgrade pip setuptools wheel \
|
||||
&& pip install torch torchvision torchaudio sentencepiece numpy
|
||||
&& pip install numpy requests sentencepiece torch tqdm
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
@@ -14,4 +14,4 @@ COPY . .
|
||||
|
||||
RUN make
|
||||
|
||||
ENTRYPOINT ["/app/.devops/tools.sh"]
|
||||
ENTRYPOINT ["/app/.devops/tools.sh"]
|
||||
|
||||
2
Makefile
2
Makefile
@@ -31,7 +31,7 @@ endif
|
||||
#
|
||||
|
||||
CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC
|
||||
CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
|
||||
CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++17 -fPIC
|
||||
LDFLAGS =
|
||||
|
||||
# OS specific
|
||||
|
||||
@@ -11,6 +11,9 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
|
||||
- Cache input prompts for faster initialization: https://github.com/ggerganov/llama.cpp/issues/64
|
||||
- Create a `llama.cpp` logo: https://github.com/ggerganov/llama.cpp/issues/105
|
||||
|
||||
**TEMPORARY NOTICE:**
|
||||
If you're updating to the latest master, you will need to regenerate your model files as the format has changed.
|
||||
|
||||
## Description
|
||||
|
||||
The main goal is to run the model using 4-bit quantization on a MacBook
|
||||
@@ -147,7 +150,7 @@ python3 -m pip install torch numpy sentencepiece
|
||||
python3 convert-pth-to-ggml.py models/7B/ 1
|
||||
|
||||
# quantize the model to 4-bits
|
||||
./quantize.sh 7B
|
||||
python3 quantize.py 7B
|
||||
|
||||
# run the inference
|
||||
./main -m ./models/7B/ggml-model-q4_0.bin -n 128
|
||||
|
||||
6
alpaca.sh
Executable file
6
alpaca.sh
Executable file
@@ -0,0 +1,6 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Temporary script - will be removed in the future
|
||||
#
|
||||
|
||||
./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt -ins --top_k 10000 --temp 0.96 --repeat_penalty 1 -t 7
|
||||
@@ -60,7 +60,8 @@ def write_header(fout, hparams, ftype):
|
||||
|
||||
keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
|
||||
values = [
|
||||
0x67676d6c, # magic: ggml in hex
|
||||
0x67676d66, # magic: ggml in hex
|
||||
1, # file version
|
||||
*[hparams[key] for key in keys],
|
||||
hparams["dim"] // hparams["n_heads"], # rot (obsolete)
|
||||
ftype
|
||||
@@ -85,6 +86,7 @@ def write_tokens(fout, tokenizer):
|
||||
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
|
||||
fout.write(struct.pack("i", len(text)))
|
||||
fout.write(text)
|
||||
fout.write(struct.pack("f", tokenizer.get_score(i)))
|
||||
|
||||
def process_and_write_variables(fout, model, ftype):
|
||||
|
||||
|
||||
122
main.cpp
122
main.cpp
@@ -3,10 +3,12 @@
|
||||
#include "utils.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cinttypes>
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
@@ -104,10 +106,24 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
|
||||
{
|
||||
uint32_t magic;
|
||||
fin.read((char *) &magic, sizeof(magic));
|
||||
if (magic != 0x67676d6c) {
|
||||
if (magic == 0x67676d6c) {
|
||||
fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files!)\n",
|
||||
__func__, fname.c_str());
|
||||
return false;
|
||||
}
|
||||
if (magic != 0x67676d66) {
|
||||
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
uint32_t format_version;
|
||||
fin.read((char *) &format_version, sizeof(format_version));
|
||||
|
||||
if (format_version != 1) {
|
||||
fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ")\n",
|
||||
__func__, fname.c_str(), format_version);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
int n_ff = 0;
|
||||
@@ -153,8 +169,12 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
|
||||
word.resize(len);
|
||||
fin.read((char *) word.data(), len);
|
||||
|
||||
float score;
|
||||
fin.read((char *) &score, sizeof(score));
|
||||
|
||||
vocab.token_to_id[word] = i;
|
||||
vocab.id_to_token[i] = word;
|
||||
vocab.score[i] = score;
|
||||
|
||||
//if (i < 30000) {
|
||||
// fprintf(stderr, "%s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
|
||||
@@ -803,7 +823,7 @@ int main(int argc, char ** argv) {
|
||||
fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
|
||||
|
||||
std::mt19937 rng(params.seed);
|
||||
if (params.prompt.empty()) {
|
||||
if (params.random_prompt) {
|
||||
params.prompt = gpt_random_prompt(rng);
|
||||
}
|
||||
|
||||
@@ -855,14 +875,18 @@ int main(int argc, char ** argv) {
|
||||
// in instruct mode, we inject a prefix and a suffix to each input by the user
|
||||
if (params.instruct) {
|
||||
params.interactive = true;
|
||||
params.antiprompt = "### Instruction:\n\n";
|
||||
params.antiprompt.push_back("### Instruction:\n\n");
|
||||
}
|
||||
|
||||
// tokenize the reverse prompt
|
||||
std::vector<gpt_vocab::id> antiprompt_inp = ::llama_tokenize(vocab, params.antiprompt, false);
|
||||
std::vector<std::vector<gpt_vocab::id>> antipromptv_inp;
|
||||
|
||||
for (auto antiprompt : params.antiprompt) {
|
||||
antipromptv_inp.push_back(::llama_tokenize(vocab, antiprompt, false));
|
||||
}
|
||||
|
||||
// enable interactive mode if reverse prompt is specified
|
||||
if (!antiprompt_inp.empty()) {
|
||||
if (antipromptv_inp.size() != 0) {
|
||||
params.interactive = true;
|
||||
}
|
||||
|
||||
@@ -886,13 +910,16 @@ int main(int argc, char ** argv) {
|
||||
|
||||
fprintf(stderr, "%s: interactive mode on.\n", __func__);
|
||||
|
||||
if (antiprompt_inp.size()) {
|
||||
fprintf(stderr, "%s: reverse prompt: '%s'\n", __func__, params.antiprompt.c_str());
|
||||
fprintf(stderr, "%s: number of tokens in reverse prompt = %zu\n", __func__, antiprompt_inp.size());
|
||||
for (int i = 0; i < (int) antiprompt_inp.size(); i++) {
|
||||
fprintf(stderr, "%6d -> '%s'\n", antiprompt_inp[i], vocab.id_to_token.at(antiprompt_inp[i]).c_str());
|
||||
if(antipromptv_inp.size()) {
|
||||
for (size_t apindex = 0; apindex < antipromptv_inp.size(); ++apindex) {
|
||||
auto antiprompt_inp = antipromptv_inp.at(apindex);
|
||||
fprintf(stderr, "%s: reverse prompt: '%s'\n", __func__, params.antiprompt.at(apindex).c_str());
|
||||
fprintf(stderr, "%s: number of tokens in reverse prompt = %zu\n", __func__, antiprompt_inp.size());
|
||||
for (int i = 0; i < (int) antiprompt_inp.size(); i++) {
|
||||
fprintf(stderr, "%6d -> '%s'\n", antiprompt_inp[i], vocab.id_to_token.at(antiprompt_inp[i]).c_str());
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
}
|
||||
fprintf(stderr, "sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
|
||||
@@ -990,11 +1017,6 @@ int main(int argc, char ** argv) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// reset color to default if we there is no pending user input
|
||||
if (!input_noecho && params.use_color && (int) embd_inp.size() == input_consumed) {
|
||||
printf(ANSI_COLOR_RESET);
|
||||
}
|
||||
}
|
||||
|
||||
// display text
|
||||
@@ -1004,14 +1026,21 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
fflush(stdout);
|
||||
}
|
||||
// reset color to default if we there is no pending user input
|
||||
if (!input_noecho && params.use_color && (int)embd_inp.size() == input_consumed) {
|
||||
printf(ANSI_COLOR_RESET);
|
||||
}
|
||||
|
||||
// in interactive mode, and not currently processing queued inputs;
|
||||
// check if we should prompt the user for more
|
||||
if (params.interactive && embd_inp.size() <= input_consumed) {
|
||||
// check for reverse prompt
|
||||
if (antiprompt_inp.size() && std::equal(antiprompt_inp.rbegin(), antiprompt_inp.rend(), last_n_tokens.rbegin())) {
|
||||
// reverse prompt found
|
||||
is_interacting = true;
|
||||
for (auto antiprompt_inp : antipromptv_inp) {
|
||||
if (antiprompt_inp.size() && std::equal(antiprompt_inp.rbegin(), antiprompt_inp.rend(), last_n_tokens.rbegin())) {
|
||||
// reverse prompt found
|
||||
is_interacting = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (is_interacting) {
|
||||
if (params.instruct) {
|
||||
@@ -1022,47 +1051,36 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// currently being interactive
|
||||
if (params.use_color) printf(ANSI_BOLD ANSI_COLOR_GREEN);
|
||||
std::string buffer;
|
||||
std::string line;
|
||||
bool another_line = true;
|
||||
while (another_line) {
|
||||
fflush(stdout);
|
||||
char buf[256] = {0};
|
||||
int n_read;
|
||||
if (params.use_color) printf(ANSI_BOLD ANSI_COLOR_GREEN);
|
||||
if (scanf("%255[^\n]%n%*c", buf, &n_read) <= 0) {
|
||||
// presumable empty line, consume the newline
|
||||
std::ignore = scanf("%*c");
|
||||
n_read=0;
|
||||
}
|
||||
if (params.use_color) printf(ANSI_COLOR_RESET);
|
||||
|
||||
if (n_read > 0 && buf[n_read-1]=='\\') {
|
||||
another_line = true;
|
||||
buf[n_read-1] = '\n';
|
||||
buf[n_read] = 0;
|
||||
} else {
|
||||
do {
|
||||
std::getline(std::cin, line);
|
||||
if (line.empty() || line.back() != '\\') {
|
||||
another_line = false;
|
||||
buf[n_read] = '\n';
|
||||
buf[n_read+1] = 0;
|
||||
} else {
|
||||
line.pop_back(); // Remove the continue character
|
||||
}
|
||||
buffer += line + '\n'; // Append the line to the result
|
||||
} while (another_line);
|
||||
if (params.use_color) printf(ANSI_COLOR_RESET);
|
||||
|
||||
std::vector<gpt_vocab::id> line_inp = ::llama_tokenize(vocab, buf, false);
|
||||
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
|
||||
std::vector<gpt_vocab::id> line_inp = ::llama_tokenize(vocab, buffer, false);
|
||||
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
|
||||
|
||||
if (params.instruct) {
|
||||
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
|
||||
}
|
||||
|
||||
remaining_tokens -= line_inp.size();
|
||||
|
||||
input_noecho = true; // do not echo this again
|
||||
if (params.instruct) {
|
||||
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
|
||||
}
|
||||
|
||||
is_interacting = false;
|
||||
remaining_tokens -= line_inp.size();
|
||||
|
||||
input_noecho = true; // do not echo this again
|
||||
}
|
||||
is_interacting = false;
|
||||
}
|
||||
|
||||
// end of text token
|
||||
|
||||
if (embd.back() == EOS_TOKEN_ID) {
|
||||
if (params.interactive) {
|
||||
is_interacting = true;
|
||||
@@ -1071,6 +1089,12 @@ int main(int argc, char ** argv) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
|
||||
if (params.interactive && remaining_tokens <= 0) {
|
||||
remaining_tokens = params.n_predict;
|
||||
is_interacting = true;
|
||||
}
|
||||
}
|
||||
|
||||
#if defined (_WIN32)
|
||||
|
||||
24
quantize.cpp
24
quantize.cpp
@@ -3,6 +3,7 @@
|
||||
#include "utils.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cinttypes>
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
@@ -63,12 +64,28 @@ bool llama_model_quantize(const std::string & fname_inp, const std::string & fna
|
||||
{
|
||||
uint32_t magic;
|
||||
finp.read((char *) &magic, sizeof(magic));
|
||||
if (magic != 0x67676d6c) {
|
||||
if (magic == 0x67676d6c) {
|
||||
fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files!)\n",
|
||||
__func__, fname_inp.c_str());
|
||||
return false;
|
||||
}
|
||||
if (magic != 0x67676d66) {
|
||||
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
fout.write((char *) &magic, sizeof(magic));
|
||||
|
||||
uint32_t format_version;
|
||||
finp.read((char *) &format_version, sizeof(format_version));
|
||||
|
||||
if (format_version != 1) {
|
||||
fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ")\n",
|
||||
__func__, fname_inp.c_str(), format_version);
|
||||
return false;
|
||||
}
|
||||
|
||||
fout.write((char *) &format_version, sizeof(format_version));
|
||||
}
|
||||
|
||||
llama_hparams hparams;
|
||||
@@ -122,8 +139,13 @@ bool llama_model_quantize(const std::string & fname_inp, const std::string & fna
|
||||
finp.read ((char *) word.data(), len);
|
||||
fout.write((char *) word.data(), len);
|
||||
|
||||
float score;
|
||||
finp.read ((char *) &score, sizeof(score));
|
||||
fout.write((char *) &score, sizeof(score));
|
||||
|
||||
vocab.token_to_id[word] = i;
|
||||
vocab.id_to_token[i] = word;
|
||||
vocab.score[i] = score;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
126
quantize.py
Normal file
126
quantize.py
Normal file
@@ -0,0 +1,126 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
"""Script to execute the "quantize" script on a given set of models."""
|
||||
|
||||
import subprocess
|
||||
import argparse
|
||||
import glob
|
||||
import sys
|
||||
import os
|
||||
|
||||
|
||||
def main():
|
||||
"""Update the quantize binary name depending on the platform and parse
|
||||
the command line arguments and execute the script.
|
||||
"""
|
||||
|
||||
if "linux" in sys.platform or "darwin" in sys.platform:
|
||||
quantize_script_binary = "quantize"
|
||||
|
||||
elif "win32" in sys.platform or "cygwin" in sys.platform:
|
||||
quantize_script_binary = "quantize.exe"
|
||||
|
||||
else:
|
||||
print("WARNING: Unknown platform. Assuming a UNIX-like OS.\n")
|
||||
quantize_script_binary = "quantize"
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
prog='python3 quantize.py',
|
||||
description='This script quantizes the given models by applying the '
|
||||
f'"{quantize_script_binary}" script on them.'
|
||||
)
|
||||
parser.add_argument(
|
||||
'models', nargs='+', choices=('7B', '13B', '30B', '65B'),
|
||||
help='The models to quantize.'
|
||||
)
|
||||
parser.add_argument(
|
||||
'-r', '--remove-16', action='store_true', dest='remove_f16',
|
||||
help='Remove the f16 model after quantizing it.'
|
||||
)
|
||||
parser.add_argument(
|
||||
'-m', '--models-path', dest='models_path',
|
||||
default=os.path.join(os.getcwd(), "models"),
|
||||
help='Specify the directory where the models are located.'
|
||||
)
|
||||
parser.add_argument(
|
||||
'-q', '--quantize-script-path', dest='quantize_script_path',
|
||||
default=os.path.join(os.getcwd(), quantize_script_binary),
|
||||
help='Specify the path to the "quantize" script.'
|
||||
)
|
||||
|
||||
# TODO: Revise this code
|
||||
# parser.add_argument(
|
||||
# '-t', '--threads', dest='threads', type='int',
|
||||
# default=os.cpu_count(),
|
||||
# help='Specify the number of threads to use to quantize many models at '
|
||||
# 'once. Defaults to os.cpu_count().'
|
||||
# )
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
if not os.path.isfile(args.quantize_script_path):
|
||||
print(
|
||||
f'The "{quantize_script_binary}" script was not found in the '
|
||||
"current location.\nIf you want to use it from another location, "
|
||||
"set the --quantize-script-path argument from the command line."
|
||||
)
|
||||
sys.exit(1)
|
||||
|
||||
for model in args.models:
|
||||
# The model is separated in various parts
|
||||
# (ggml-model-f16.bin, ggml-model-f16.bin.0, ggml-model-f16.bin.1...)
|
||||
f16_model_path_base = os.path.join(
|
||||
args.models_path, model, "ggml-model-f16.bin"
|
||||
)
|
||||
|
||||
f16_model_parts_paths = map(
|
||||
lambda filename: os.path.join(f16_model_path_base, filename),
|
||||
glob.glob(f"{f16_model_path_base}*")
|
||||
)
|
||||
|
||||
for f16_model_part_path in f16_model_parts_paths:
|
||||
if not os.path.isfile(f16_model_part_path):
|
||||
print(
|
||||
f"The f16 model {os.path.basename(f16_model_part_path)} "
|
||||
f"was not found in {args.models_path}{os.path.sep}{model}"
|
||||
". If you want to use it from another location, set the "
|
||||
"--models-path argument from the command line."
|
||||
)
|
||||
sys.exit(1)
|
||||
|
||||
__run_quantize_script(
|
||||
args.quantize_script_path, f16_model_part_path
|
||||
)
|
||||
|
||||
if args.remove_f16:
|
||||
os.remove(f16_model_part_path)
|
||||
|
||||
|
||||
# This was extracted to a top-level function for parallelization, if
|
||||
# implemented. See https://github.com/ggerganov/llama.cpp/pull/222/commits/f8db3d6cd91bf1a1342db9d29e3092bc12dd783c#r1140496406
|
||||
|
||||
def __run_quantize_script(script_path, f16_model_part_path):
|
||||
"""Run the quantize script specifying the path to it and the path to the
|
||||
f16 model to quantize.
|
||||
"""
|
||||
|
||||
new_quantized_model_path = f16_model_part_path.replace("f16", "q4_0")
|
||||
subprocess.run(
|
||||
[script_path, f16_model_part_path, new_quantized_model_path, "2"],
|
||||
check=True
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
try:
|
||||
main()
|
||||
|
||||
except subprocess.CalledProcessError:
|
||||
print("\nAn error ocurred while trying to quantize the models.")
|
||||
sys.exit(1)
|
||||
|
||||
except KeyboardInterrupt:
|
||||
sys.exit(0)
|
||||
|
||||
else:
|
||||
print("\nSuccesfully quantized all models.")
|
||||
15
quantize.sh
15
quantize.sh
@@ -1,15 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
if ! [[ "$1" =~ ^[0-9]{1,2}B$ ]]; then
|
||||
echo
|
||||
echo "Usage: quantize.sh 7B|13B|30B|65B [--remove-f16]"
|
||||
echo
|
||||
exit 1
|
||||
fi
|
||||
|
||||
for i in `ls models/$1/ggml-model-f16.bin*`; do
|
||||
./quantize "$i" "${i/f16/q4_0}" 2
|
||||
if [[ "$2" == "--remove-f16" ]]; then
|
||||
rm "$i"
|
||||
fi
|
||||
done
|
||||
177
utils.cpp
177
utils.cpp
@@ -6,6 +6,7 @@
|
||||
#include <regex>
|
||||
#include <iostream>
|
||||
#include <iterator>
|
||||
#include <queue>
|
||||
#include <string>
|
||||
#include <math.h>
|
||||
|
||||
@@ -70,12 +71,14 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
} else if (arg == "--color") {
|
||||
params.use_color = true;
|
||||
} else if (arg == "-r" || arg == "--reverse-prompt") {
|
||||
params.antiprompt = argv[++i];
|
||||
params.antiprompt.push_back(argv[++i]);
|
||||
} else if (arg == "--ignore-eos") {
|
||||
params.ignore_eos = true;
|
||||
} else if (arg == "-h" || arg == "--help") {
|
||||
gpt_print_usage(argc, argv, params);
|
||||
exit(0);
|
||||
} else if (arg == "--random-prompt") {
|
||||
params.random_prompt = true;
|
||||
} else {
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
gpt_print_usage(argc, argv, params);
|
||||
@@ -94,12 +97,14 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
fprintf(stderr, " -i, --interactive run in interactive mode\n");
|
||||
fprintf(stderr, " -ins, --instruct run in instruction mode (use with Alpaca models)\n");
|
||||
fprintf(stderr, " -r PROMPT, --reverse-prompt PROMPT\n");
|
||||
fprintf(stderr, " in interactive mode, poll user input upon seeing PROMPT\n");
|
||||
fprintf(stderr, " in interactive mode, poll user input upon seeing PROMPT (can be\n");
|
||||
fprintf(stderr, " specified more than once for multiple prompts).\n");
|
||||
fprintf(stderr, " --color colorise output to distinguish prompt and user input from generations\n");
|
||||
fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n");
|
||||
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
||||
fprintf(stderr, " -p PROMPT, --prompt PROMPT\n");
|
||||
fprintf(stderr, " prompt to start generation with (default: random)\n");
|
||||
fprintf(stderr, " prompt to start generation with (default: empty)\n");
|
||||
fprintf(stderr, " --random-prompt start with a randomized prompt.\n");
|
||||
fprintf(stderr, " -f FNAME, --file FNAME\n");
|
||||
fprintf(stderr, " prompt file to start generation.\n");
|
||||
fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict);
|
||||
@@ -290,58 +295,146 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
|
||||
return tokens;
|
||||
}
|
||||
|
||||
// TODO: Calculate this constant from the vocabulary
|
||||
#define MAX_TOKEN_LEN 18
|
||||
// SentencePiece implementation after https://guillaume-be.github.io/2020-05-30/sentence_piece
|
||||
std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) {
|
||||
std::vector<gpt_vocab::id> res;
|
||||
std::vector<int> score;
|
||||
std::vector<gpt_vocab::id> prev;
|
||||
int len = text.length();
|
||||
static size_t utf8_len(char src) {
|
||||
const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
|
||||
uint8_t highbits = static_cast<uint8_t>(src) >> 4;
|
||||
return lookup[highbits];
|
||||
}
|
||||
|
||||
score.resize(len + 1);
|
||||
prev.resize(len + 1);
|
||||
struct llama_sp_symbol {
|
||||
using index = int;
|
||||
index prev;
|
||||
index next;
|
||||
std::string_view text;
|
||||
};
|
||||
|
||||
// Forward pass
|
||||
for (int i = 0; i < len; i++) {
|
||||
int max_len = std::min(len - i, MAX_TOKEN_LEN);
|
||||
for (int sub_len = 1; sub_len <= max_len; sub_len++) {
|
||||
auto sub = text.substr(i, sub_len);
|
||||
auto token = vocab.token_to_id.find(sub);
|
||||
if (token != vocab.token_to_id.end()) {
|
||||
int token_score = sub.length() * sub.length();
|
||||
int local_score = score[i] + token_score;
|
||||
int next = i + sub_len;
|
||||
if (score[next] < local_score) {
|
||||
score[next] = local_score;
|
||||
prev[next] = (*token).second;
|
||||
struct llama_sp_bigram {
|
||||
struct comparator {
|
||||
bool operator()(llama_sp_bigram & l, llama_sp_bigram & r) {
|
||||
return (l.score < r.score) || (l.score == r.score && l.left > r.left);
|
||||
}
|
||||
};
|
||||
using queue_storage = std::vector<llama_sp_bigram>;
|
||||
using queue = std::priority_queue<llama_sp_bigram, queue_storage, comparator>;
|
||||
llama_sp_symbol::index left;
|
||||
llama_sp_symbol::index right;
|
||||
float score;
|
||||
size_t size;
|
||||
};
|
||||
|
||||
struct llama_tokenizer {
|
||||
llama_tokenizer(const gpt_vocab & vocab): vocab_(vocab) {}
|
||||
|
||||
void tokenize(std::string_view text, std::vector<gpt_vocab::id> & output) {
|
||||
// split string into utf8 chars
|
||||
int index = 0;
|
||||
while (!text.empty()) {
|
||||
llama_sp_symbol sym;
|
||||
size_t char_len = std::min(text.size(), utf8_len(text.data()[0]));
|
||||
sym.text = std::string_view(text.data(), char_len);
|
||||
sym.prev = index - 1;
|
||||
text.remove_prefix(char_len);
|
||||
sym.next = text.empty() ? -1 : index + 1;
|
||||
index++;
|
||||
symbols_.emplace_back(std::move(sym));
|
||||
}
|
||||
|
||||
// seed the work queue with all possible 2-character tokens.
|
||||
for (size_t i = 1; i < symbols_.size(); ++i) {
|
||||
try_add_bigram(i - 1, i);
|
||||
}
|
||||
|
||||
// keep substituting the highest frequency pairs for as long as we can.
|
||||
while (!work_queue_.empty()) {
|
||||
auto bigram = work_queue_.top();
|
||||
work_queue_.pop();
|
||||
|
||||
auto & left_sym = symbols_[bigram.left];
|
||||
auto & right_sym = symbols_[bigram.right];
|
||||
|
||||
// if one of the symbols already got merged, skip it.
|
||||
if (left_sym.text.empty() || right_sym.text.empty() ||
|
||||
left_sym.text.size() + right_sym.text.size() != bigram.size) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// merge the right sym into the left one
|
||||
left_sym.text = std::string_view(left_sym.text.data(), left_sym.text.size() + right_sym.text.size());
|
||||
right_sym.text = std::string_view("");
|
||||
|
||||
// remove the right sym from the chain
|
||||
left_sym.next = right_sym.next;
|
||||
if (right_sym.next >= 0) {
|
||||
symbols_[right_sym.next].prev = bigram.left;
|
||||
}
|
||||
|
||||
// find more substitutions
|
||||
try_add_bigram(left_sym.prev, bigram.left);
|
||||
try_add_bigram(bigram.left, left_sym.next);
|
||||
}
|
||||
|
||||
for (int i = 0; i != -1; i = symbols_[i].next) {
|
||||
auto& symbol = symbols_[i];
|
||||
auto token = vocab_.token_to_id.find(std::string(symbol.text));
|
||||
|
||||
if (token == vocab_.token_to_id.end()) {
|
||||
// output any symbols that did not form tokens as bytes.
|
||||
for (int j = 0; j < symbol.text.size(); ++j) {
|
||||
gpt_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
|
||||
output.push_back(token_id);
|
||||
}
|
||||
} else {
|
||||
output.push_back((*token).second);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Backward pass
|
||||
int i = len;
|
||||
while (i > 0) {
|
||||
gpt_vocab::id token_id = prev[i];
|
||||
if (token_id == 0) {
|
||||
// TODO: Return error or something more meaningful
|
||||
printf("failed to tokenize string!\n");
|
||||
break;
|
||||
private:
|
||||
void try_add_bigram(int left, int right) {
|
||||
if (left == -1 || right == -1) {
|
||||
return;
|
||||
}
|
||||
res.push_back(token_id);
|
||||
auto token = (*vocab.id_to_token.find(token_id)).second;
|
||||
i -= token.length();
|
||||
|
||||
std::string_view text(symbols_[left].text.data(), symbols_[left].text.size() + symbols_[right].text.size());
|
||||
auto token = vocab_.token_to_id.find(std::string(text));
|
||||
|
||||
if (token == vocab_.token_to_id.end()) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto score = vocab_.score.find((*token).second);
|
||||
|
||||
if (score == vocab_.score.end()) {
|
||||
return;
|
||||
}
|
||||
|
||||
llama_sp_bigram bigram;
|
||||
bigram.left = left;
|
||||
bigram.right = right;
|
||||
bigram.score = (*score).second;
|
||||
bigram.size = text.size();
|
||||
work_queue_.push(bigram);
|
||||
}
|
||||
|
||||
const gpt_vocab & vocab_;
|
||||
std::vector<llama_sp_symbol> symbols_;
|
||||
llama_sp_bigram::queue work_queue_;
|
||||
};
|
||||
|
||||
std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, std::string_view text, bool bos) {
|
||||
llama_tokenizer tokenizer(vocab);
|
||||
std::vector<gpt_vocab::id> output;
|
||||
|
||||
if (text.size() == 0) {
|
||||
return output;
|
||||
}
|
||||
|
||||
if (bos) {
|
||||
res.push_back(1); // TODO: replace with vocab.bos
|
||||
output.push_back(1);
|
||||
}
|
||||
|
||||
// Pieces are in reverse order so correct that
|
||||
std::reverse(res.begin(), res.end());
|
||||
|
||||
return res;
|
||||
tokenizer.tokenize(text, output);
|
||||
return output;
|
||||
}
|
||||
|
||||
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
|
||||
|
||||
9
utils.h
9
utils.h
@@ -30,13 +30,15 @@ struct gpt_params {
|
||||
|
||||
std::string model = "models/lamma-7B/ggml-model.bin"; // model path
|
||||
std::string prompt = "";
|
||||
std::string antiprompt = ""; // string upon seeing which more user input is prompted
|
||||
|
||||
bool random_prompt = false;
|
||||
|
||||
bool use_color = false; // use color to distinguish generations and inputs
|
||||
|
||||
bool interactive = false; // interactive mode
|
||||
bool interactive_start = false; // reverse prompt immediately
|
||||
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
|
||||
bool instruct = false; // instruction mode (used for Alpaca models)
|
||||
|
||||
bool ignore_eos = false; // do not stop generating after eos
|
||||
};
|
||||
|
||||
@@ -56,6 +58,7 @@ struct gpt_vocab {
|
||||
|
||||
std::map<token, id> token_to_id;
|
||||
std::map<id, token> id_to_token;
|
||||
std::map<id, float> score;
|
||||
};
|
||||
|
||||
void replace(std::string & str, const std::string & needle, const std::string & replacement);
|
||||
@@ -77,7 +80,7 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
|
||||
|
||||
// TODO: this is probably wrong, but I cannot figure out how this tokenizer works ..
|
||||
// ref: https://github.com/google/sentencepiece
|
||||
std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos);
|
||||
std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, std::string_view text, bool bos);
|
||||
|
||||
// load the tokens from encoder.json
|
||||
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
|
||||
|
||||
Reference in New Issue
Block a user