Compare commits

...

12 Commits

Author SHA1 Message Date
cocktailpeanut
da5303c1ea bugfix: default should not be interactive (#304) 2023-03-19 23:44:20 +02:00
Georgi Gerganov
4545539d71 Rename script 2023-03-19 21:58:51 +02:00
Georgi Gerganov
edeba28366 Add temporary helper script for Alpaca chat 2023-03-19 21:57:48 +02:00
Rickey Bowers Jr
5c19c70ba6 fix coloring of last n_batch of prompt, and refactor line input (#221)
* fix coloring of last `n_batch` of prompt, and refactor line input
* forgot the newline that needs to be sent to the model
* (per #283) try to force flush of color reset in SIGINT handler
2023-03-19 19:44:30 +00:00
tjohnman
24568371ae Support for multiple reverse prompts. (#299)
Co-authored-by: Johnman <>
Co-authored-by: Johnman <tjohnman@github>
2023-03-19 21:33:06 +02:00
Suaj Carrot
7392f1cd2c Improved quantize script (#222)
* Improved quantize script

I improved the quantize script by adding error handling and allowing to select many models for quantization at once in the command line. I also converted it to Python for generalization as well as extensibility.

* Fixes and improvements based on Matt's observations

Fixed and improved many things in the script based on the reviews made by @mattsta. The parallelization suggestion is still to be revised, but code for it was still added (commented).

* Small fixes to the previous commit

* Corrected to use the original glob pattern

The original Bash script uses a glob pattern to match files that have endings such as ...bin.0, ...bin.1, etc. That has been translated correctly to Python now.

* Added support for Windows and updated README to use this script

New code to set the name of the quantize script binary depending on the platform has been added (quantize.exe if working on Windows) and the README.md file has been updated to use this script instead of the Bash one.

* Fixed a typo and removed shell=True in the subprocess.run call

Fixed a typo regarding the new filenames of the quantized models and removed the shell=True parameter in the subprocess.run call as it was conflicting with the list of parameters.

* Corrected previous commit

* Small tweak: changed the name of the program in argparse

This was making the automatic help message to be suggesting the program's usage as being literally "$ Quantization Script [arguments]". It should now be something like "$ python3 quantize.py [arguments]".
2023-03-19 20:38:44 +02:00
tjohnman
ad5fd5b60c Make prompt randomization optional. (#300)
Co-authored-by: Johnman <>
2023-03-19 20:36:19 +02:00
tjohnman
368d0c8a9e Respect the maximum number of tokens in interactive. (#298)
Co-authored-by: Johnman <johnman@github>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-03-19 20:31:17 +02:00
slaren
50fae10d03 Add --ignore-eos parameter (#181)
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-03-19 20:22:48 +02:00
Qingyou Meng
084e2f0ec0 interactive mode: print '\n' in sigint_handler, this flush stdout thus ensure color reset. (#283) 2023-03-19 20:10:00 +02:00
Erik Scholz
0b366e7357 Command line switch to use F16 for memory_k and memory_v (refactor of #154) (#294)
* Use F16 for memory_k and memory_v

* add command line switch to use f16 instead of f32 for memory k+v

---------

Co-authored-by: Ty Everett <ty@tyweb.us>
2023-03-19 19:57:00 +02:00
Georgi Gerganov
160bfb217d Update hot topics to mention Alpaca support 2023-03-19 19:51:55 +02:00
7 changed files with 222 additions and 75 deletions

View File

@@ -7,7 +7,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
**Hot topics:**
- RMSNorm implementation / fixes: https://github.com/ggerganov/llama.cpp/issues/173
- [Added Alpaca support](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca)
- Cache input prompts for faster initialization: https://github.com/ggerganov/llama.cpp/issues/64
- Create a `llama.cpp` logo: https://github.com/ggerganov/llama.cpp/issues/105
@@ -147,7 +147,7 @@ python3 -m pip install torch numpy sentencepiece
python3 convert-pth-to-ggml.py models/7B/ 1
# quantize the model to 4-bits
./quantize.sh 7B
python3 quantize.py 7B
# run the inference
./main -m ./models/7B/ggml-model-q4_0.bin -n 128

6
alpaca.sh Executable file
View File

@@ -0,0 +1,6 @@
#!/bin/bash
#
# Temporary script - will be removed in the future
#
./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt -ins --top_k 10000 --temp 0.96 --repeat_penalty 1 -t 7

123
main.cpp
View File

@@ -7,6 +7,7 @@
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <map>
#include <string>
#include <vector>
@@ -27,6 +28,8 @@
#define ANSI_COLOR_RESET "\x1b[0m"
#define ANSI_BOLD "\x1b[1m"
static const int EOS_TOKEN_ID = 2;
// determine number of model parts based on the dimension
static const std::map<int, int> LLAMA_N_PARTS = {
{ 4096, 1 },
@@ -86,7 +89,7 @@ struct llama_model {
};
// load the model's weights from a file
bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) {
bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx, ggml_type memory_type = GGML_TYPE_F32) {
fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
std::vector<char> f_buf(1024*1024);
@@ -207,8 +210,8 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w2
ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w3
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_k
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_v
ctx_size += (5 + 10*n_layer)*256; // object overhead
@@ -293,8 +296,8 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
const int n_mem = n_layer*n_ctx;
const int n_elements = n_embd*n_mem;
model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
model.memory_k = ggml_new_tensor_1d(ctx, memory_type, n_elements);
model.memory_v = ggml_new_tensor_1d(ctx, memory_type, n_elements);
const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
@@ -747,6 +750,7 @@ static bool is_interacting = false;
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
void sigint_handler(int signo) {
printf(ANSI_COLOR_RESET);
printf("\n"); // this also force flush stdout.
if (signo == SIGINT) {
if (!is_interacting) {
is_interacting=true;
@@ -800,7 +804,7 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
std::mt19937 rng(params.seed);
if (params.prompt.empty()) {
if (params.random_prompt) {
params.prompt = gpt_random_prompt(rng);
}
@@ -814,8 +818,9 @@ int main(int argc, char ** argv) {
// load the model
{
const ggml_type memory_type = params.memory_f16 ? GGML_TYPE_F16 : GGML_TYPE_F32;
const int64_t t_start_us = ggml_time_us();
if (!llama_model_load(params.model, model, vocab, params.n_ctx)) {
if (!llama_model_load(params.model, model, vocab, params.n_ctx, memory_type)) {
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
return 1;
}
@@ -851,14 +856,18 @@ int main(int argc, char ** argv) {
// in instruct mode, we inject a prefix and a suffix to each input by the user
if (params.instruct) {
params.interactive = true;
params.antiprompt = "### Instruction:\n\n";
params.antiprompt.push_back("### Instruction:\n\n");
}
// tokenize the reverse prompt
std::vector<gpt_vocab::id> antiprompt_inp = ::llama_tokenize(vocab, params.antiprompt, false);
std::vector<std::vector<gpt_vocab::id>> antipromptv_inp;
for (auto antiprompt : params.antiprompt) {
antipromptv_inp.push_back(::llama_tokenize(vocab, antiprompt, false));
}
// enable interactive mode if reverse prompt is specified
if (!antiprompt_inp.empty()) {
if (antipromptv_inp.size() != 0) {
params.interactive = true;
}
@@ -882,13 +891,16 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s: interactive mode on.\n", __func__);
if (antiprompt_inp.size()) {
fprintf(stderr, "%s: reverse prompt: '%s'\n", __func__, params.antiprompt.c_str());
fprintf(stderr, "%s: number of tokens in reverse prompt = %zu\n", __func__, antiprompt_inp.size());
for (int i = 0; i < (int) antiprompt_inp.size(); i++) {
fprintf(stderr, "%6d -> '%s'\n", antiprompt_inp[i], vocab.id_to_token.at(antiprompt_inp[i]).c_str());
if(antipromptv_inp.size()) {
for (size_t apindex = 0; apindex < antipromptv_inp.size(); ++apindex) {
auto antiprompt_inp = antipromptv_inp.at(apindex);
fprintf(stderr, "%s: reverse prompt: '%s'\n", __func__, params.antiprompt.at(apindex).c_str());
fprintf(stderr, "%s: number of tokens in reverse prompt = %zu\n", __func__, antiprompt_inp.size());
for (int i = 0; i < (int) antiprompt_inp.size(); i++) {
fprintf(stderr, "%6d -> '%s'\n", antiprompt_inp[i], vocab.id_to_token.at(antiprompt_inp[i]).c_str());
}
fprintf(stderr, "\n");
}
fprintf(stderr, "\n");
}
}
fprintf(stderr, "sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
@@ -954,6 +966,11 @@ int main(int argc, char ** argv) {
{
const int64_t t_start_sample_us = ggml_time_us();
if (params.ignore_eos) {
// set the logit of the eos token to zero to avoid sampling it
logits[logits.size() - n_vocab + EOS_TOKEN_ID] = 0;
}
id = llama_sample_top_p_top_k(vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens, repeat_penalty, top_k, top_p, temp, rng);
last_n_tokens.erase(last_n_tokens.begin());
@@ -981,11 +998,6 @@ int main(int argc, char ** argv) {
break;
}
}
// reset color to default if we there is no pending user input
if (!input_noecho && params.use_color && (int) embd_inp.size() == input_consumed) {
printf(ANSI_COLOR_RESET);
}
}
// display text
@@ -995,14 +1007,21 @@ int main(int argc, char ** argv) {
}
fflush(stdout);
}
// reset color to default if we there is no pending user input
if (!input_noecho && params.use_color && (int)embd_inp.size() == input_consumed) {
printf(ANSI_COLOR_RESET);
}
// in interactive mode, and not currently processing queued inputs;
// check if we should prompt the user for more
if (params.interactive && embd_inp.size() <= input_consumed) {
// check for reverse prompt
if (antiprompt_inp.size() && std::equal(antiprompt_inp.rbegin(), antiprompt_inp.rend(), last_n_tokens.rbegin())) {
// reverse prompt found
is_interacting = true;
for (auto antiprompt_inp : antipromptv_inp) {
if (antiprompt_inp.size() && std::equal(antiprompt_inp.rbegin(), antiprompt_inp.rend(), last_n_tokens.rbegin())) {
// reverse prompt found
is_interacting = true;
break;
}
}
if (is_interacting) {
if (params.instruct) {
@@ -1013,47 +1032,37 @@ int main(int argc, char ** argv) {
}
// currently being interactive
if (params.use_color) printf(ANSI_BOLD ANSI_COLOR_GREEN);
std::string buffer;
std::string line;
bool another_line = true;
while (another_line) {
fflush(stdout);
char buf[256] = {0};
int n_read;
if (params.use_color) printf(ANSI_BOLD ANSI_COLOR_GREEN);
if (scanf("%255[^\n]%n%*c", buf, &n_read) <= 0) {
// presumable empty line, consume the newline
std::ignore = scanf("%*c");
n_read=0;
}
if (params.use_color) printf(ANSI_COLOR_RESET);
if (n_read > 0 && buf[n_read-1]=='\\') {
another_line = true;
buf[n_read-1] = '\n';
buf[n_read] = 0;
} else {
do {
std::getline(std::cin, line);
if (line.empty() || line.back() != '\\') {
another_line = false;
buf[n_read] = '\n';
buf[n_read+1] = 0;
} else {
line.pop_back(); // Remove the continue character
}
buffer += line + '\n'; // Append the line to the result
} while (another_line);
if (params.use_color) printf(ANSI_COLOR_RESET);
std::vector<gpt_vocab::id> line_inp = ::llama_tokenize(vocab, buf, false);
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
std::vector<gpt_vocab::id> line_inp = ::llama_tokenize(vocab, buffer, false);
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
if (params.instruct) {
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
}
remaining_tokens -= line_inp.size();
input_noecho = true; // do not echo this again
if (params.instruct) {
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
}
is_interacting = false;
remaining_tokens -= line_inp.size();
input_noecho = true; // do not echo this again
}
is_interacting = false;
}
// end of text token
if (embd.back() == 2) {
if (embd.back() == EOS_TOKEN_ID) {
if (params.interactive) {
is_interacting = true;
} else {
@@ -1061,6 +1070,12 @@ int main(int argc, char ** argv) {
break;
}
}
// In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
if (params.interactive && remaining_tokens <= 0) {
remaining_tokens = params.n_predict;
is_interacting = true;
}
}
#if defined (_WIN32)

126
quantize.py Normal file
View File

@@ -0,0 +1,126 @@
#!/usr/bin/env python3
"""Script to execute the "quantize" script on a given set of models."""
import subprocess
import argparse
import glob
import sys
import os
def main():
"""Update the quantize binary name depending on the platform and parse
the command line arguments and execute the script.
"""
if "linux" in sys.platform or "darwin" in sys.platform:
quantize_script_binary = "quantize"
elif "win32" in sys.platform or "cygwin" in sys.platform:
quantize_script_binary = "quantize.exe"
else:
print("WARNING: Unknown platform. Assuming a UNIX-like OS.\n")
quantize_script_binary = "quantize"
parser = argparse.ArgumentParser(
prog='python3 quantize.py',
description='This script quantizes the given models by applying the '
f'"{quantize_script_binary}" script on them.'
)
parser.add_argument(
'models', nargs='+', choices=('7B', '13B', '30B', '65B'),
help='The models to quantize.'
)
parser.add_argument(
'-r', '--remove-16', action='store_true', dest='remove_f16',
help='Remove the f16 model after quantizing it.'
)
parser.add_argument(
'-m', '--models-path', dest='models_path',
default=os.path.join(os.getcwd(), "models"),
help='Specify the directory where the models are located.'
)
parser.add_argument(
'-q', '--quantize-script-path', dest='quantize_script_path',
default=os.path.join(os.getcwd(), quantize_script_binary),
help='Specify the path to the "quantize" script.'
)
# TODO: Revise this code
# parser.add_argument(
# '-t', '--threads', dest='threads', type='int',
# default=os.cpu_count(),
# help='Specify the number of threads to use to quantize many models at '
# 'once. Defaults to os.cpu_count().'
# )
args = parser.parse_args()
if not os.path.isfile(args.quantize_script_path):
print(
f'The "{quantize_script_binary}" script was not found in the '
"current location.\nIf you want to use it from another location, "
"set the --quantize-script-path argument from the command line."
)
sys.exit(1)
for model in args.models:
# The model is separated in various parts
# (ggml-model-f16.bin, ggml-model-f16.bin.0, ggml-model-f16.bin.1...)
f16_model_path_base = os.path.join(
args.models_path, model, "ggml-model-f16.bin"
)
f16_model_parts_paths = map(
lambda filename: os.path.join(f16_model_path_base, filename),
glob.glob(f"{f16_model_path_base}*")
)
for f16_model_part_path in f16_model_parts_paths:
if not os.path.isfile(f16_model_part_path):
print(
f"The f16 model {os.path.basename(f16_model_part_path)} "
f"was not found in {args.models_path}{os.path.sep}{model}"
". If you want to use it from another location, set the "
"--models-path argument from the command line."
)
sys.exit(1)
__run_quantize_script(
args.quantize_script_path, f16_model_part_path
)
if args.remove_f16:
os.remove(f16_model_part_path)
# This was extracted to a top-level function for parallelization, if
# implemented. See https://github.com/ggerganov/llama.cpp/pull/222/commits/f8db3d6cd91bf1a1342db9d29e3092bc12dd783c#r1140496406
def __run_quantize_script(script_path, f16_model_part_path):
"""Run the quantize script specifying the path to it and the path to the
f16 model to quantize.
"""
new_quantized_model_path = f16_model_part_path.replace("f16", "q4_0")
subprocess.run(
[script_path, f16_model_part_path, new_quantized_model_path, "2"],
check=True
)
if __name__ == "__main__":
try:
main()
except subprocess.CalledProcessError:
print("\nAn error ocurred while trying to quantize the models.")
sys.exit(1)
except KeyboardInterrupt:
sys.exit(0)
else:
print("\nSuccesfully quantized all models.")

View File

@@ -1,15 +0,0 @@
#!/usr/bin/env bash
if ! [[ "$1" =~ ^[0-9]{1,2}B$ ]]; then
echo
echo "Usage: quantize.sh 7B|13B|30B|65B [--remove-f16]"
echo
exit 1
fi
for i in `ls models/$1/ggml-model-f16.bin*`; do
./quantize "$i" "${i/f16/q4_0}" 2
if [[ "$2" == "--remove-f16" ]]; then
rm "$i"
fi
done

View File

@@ -49,6 +49,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
params.top_k = std::stoi(argv[++i]);
} else if (arg == "-c" || arg == "--ctx_size") {
params.n_ctx = std::stoi(argv[++i]);
} else if (arg == "--memory_f16") {
params.memory_f16 = true;
} else if (arg == "--top_p") {
params.top_p = std::stof(argv[++i]);
} else if (arg == "--temp") {
@@ -68,10 +70,14 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
} else if (arg == "--color") {
params.use_color = true;
} else if (arg == "-r" || arg == "--reverse-prompt") {
params.antiprompt = argv[++i];
params.antiprompt.push_back(argv[++i]);
} else if (arg == "--ignore-eos") {
params.ignore_eos = true;
} else if (arg == "-h" || arg == "--help") {
gpt_print_usage(argc, argv, params);
exit(0);
} else if (arg == "--random-prompt") {
params.random_prompt = true;
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
gpt_print_usage(argc, argv, params);
@@ -90,12 +96,14 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stderr, " -i, --interactive run in interactive mode\n");
fprintf(stderr, " -ins, --instruct run in instruction mode (use with Alpaca models)\n");
fprintf(stderr, " -r PROMPT, --reverse-prompt PROMPT\n");
fprintf(stderr, " in interactive mode, poll user input upon seeing PROMPT\n");
fprintf(stderr, " in interactive mode, poll user input upon seeing PROMPT (can be\n");
fprintf(stderr, " specified more than once for multiple prompts).\n");
fprintf(stderr, " --color colorise output to distinguish prompt and user input from generations\n");
fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n");
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
fprintf(stderr, " -p PROMPT, --prompt PROMPT\n");
fprintf(stderr, " prompt to start generation with (default: random)\n");
fprintf(stderr, " prompt to start generation with (default: empty)\n");
fprintf(stderr, " --random-prompt start with a randomized prompt.\n");
fprintf(stderr, " -f FNAME, --file FNAME\n");
fprintf(stderr, " prompt file to start generation.\n");
fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict);
@@ -104,6 +112,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stderr, " --repeat_last_n N last n tokens to consider for penalize (default: %d)\n", params.repeat_last_n);
fprintf(stderr, " --repeat_penalty N penalize repeat sequence of tokens (default: %.1f)\n", params.repeat_penalty);
fprintf(stderr, " -c N, --ctx_size N size of the prompt context (default: %d)\n", params.n_ctx);
fprintf(stderr, " --ignore-eos ignore end of stream token and continue generating\n");
fprintf(stderr, " --memory_f16 use f16 instead of f32 for memory key+value\n");
fprintf(stderr, " --temp N temperature (default: %.1f)\n", params.temp);
fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch);
fprintf(stderr, " -m FNAME, --model FNAME\n");

View File

@@ -18,6 +18,7 @@ struct gpt_params {
int32_t n_predict = 128; // new tokens to predict
int32_t repeat_last_n = 64; // last n tokens to penalize
int32_t n_ctx = 512; //context size
bool memory_f16 = false; // use f16 instead of f32 for memory kv
// sampling parameters
int32_t top_k = 40;
@@ -29,12 +30,16 @@ struct gpt_params {
std::string model = "models/lamma-7B/ggml-model.bin"; // model path
std::string prompt = "";
std::string antiprompt = ""; // string upon seeing which more user input is prompted
bool random_prompt = false;
bool use_color = false; // use color to distinguish generations and inputs
bool interactive = false; // interactive mode
bool interactive_start = false; // reverse prompt immediately
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
bool instruct = false; // instruction mode (used for Alpaca models)
bool ignore_eos = false; // do not stop generating after eos
};
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);