Compare commits

..

11 Commits

Author SHA1 Message Date
Luciano
8d4a855c24 Add embedding mode with arg flag. Currently working (#282)
* working but ugly

* add arg flag, not working on embedding mode

* typo

* Working! Thanks to @nullhook

* make params argument instead of hardcoded boolean. remove useless time check

* start doing the instructions but not finished. This probably doesnt compile

* Embeddings extraction support

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-03-24 17:05:13 +02:00
Georgi Gerganov
b6b268d441 Add link to Roadmap discussion 2023-03-24 09:13:35 +02:00
Georgi Gerganov
3cd8dde0d1 Revert "Fix memory allocation issues and seg faults"
This reverts commit 4870e455b3.

Will provide the correct fix later
2023-03-24 06:22:28 +02:00
Georgi Gerganov
4870e455b3 Fix memory allocation issues and seg faults 2023-03-24 00:11:53 +02:00
Georgi Gerganov
483bab2e3d Avoid the transposed X branch in the Z = X * Y matrix multiplication (#439)
Should make results reproducible for different number of threads and batch sizes
2023-03-23 23:22:01 +02:00
Jed Fox
404e1da38e Fix quantize script not finding models in parent directory (#428) 2023-03-23 22:42:52 +02:00
Georgi Gerganov
4cc053b6d5 Remove oboslete command from Docker script 2023-03-23 22:39:44 +02:00
Georgi Gerganov
0ba5a3a9a5 Obsolete 2023-03-23 22:32:21 +02:00
rabidcopy
2e17dfd80a Replace EOS with newline to prevent context/memory being flushed by EOS in interactive mode (#333)
* Improve interactive mode's coherence after EOS

Aims to improve coherence and ability to resume the interactive session when the user is given input back after an end of text token is reached.
Not sure what token 13 is or why it seems to help. See conversation for examples.

* Make newline token a constant

* dynamically determine newline token

* relocate previous newline token const

* cleanup whitespace

* print a new line on end of text in interactive

this may need to be looked into further when not using a reverse prompt

* only print manual newline with reverse prompt

fix formatting of reverse prompts so they don't end up at the end of the current line while not introducing unnecessary new lines otherwise

* alternate approach to replace end of text tokens

* Inject the reverse prompt again after eos in interactive mode

* tokenize reverse prompt when needed

makes this PR compatible with https://github.com/ggerganov/llama.cpp/pull/330

* tokenize and inject only first reverse prompt

thanks to tjohnman

* tokenize first reverse prompt once

* add newline token

* add newline token

* tokenize/inject reverse prompt for refactor

this doesn't seem right though

* tokenize nothing for antiprompt if no reverse

* Update main.cpp

* Update main.cpp

* tokenize and inject reverse prompt as needed

this doesn't seem to work if the reverse prompt is tokenized outside earlier on

* not needed

* remove newline token

* remove newline token

* tokenize newline token

* add space to comment

* Update main.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Slaren <2141330+slaren@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-03-23 22:22:47 +02:00
Timmy Knight
20a1a4e09c Fix GPTQ converter (#423)
* Fix GPTQ converter

* Fix comment

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-03-23 22:18:13 +02:00
nusu-github
ad072fc5ad Generate library with CMake (#430)
* Generate library with CMake

BUILD_SHARED_LIBS to allow llama library to be generated.

* Turn ON PIC when BUILD_SHARED_LIBS is ON
2023-03-23 21:16:48 +01:00
11 changed files with 125 additions and 108 deletions

View File

@@ -16,11 +16,7 @@ elif [[ $arg1 == '--quantize' || $arg1 == '-q' ]]; then
./quantize $arg2
elif [[ $arg1 == '--run' || $arg1 == '-r' ]]; then
./main $arg2
elif [[ $arg1 == '--download' || $arg1 == '-d' ]]; then
python3 ./download-pth.py $arg2
elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then
echo "Downloading model..."
python3 ./download-pth.py "$1" "$2"
echo "Converting PTH to GGML..."
for i in `ls $1/$2/ggml-model-f16.bin*`; do
if [ -f "${i/f16/q4_0}" ]; then
@@ -39,8 +35,6 @@ else
echo " ex: \"/models/7B/\" 1"
echo " --quantize (-q): Optimize with quantization process ggml"
echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
echo " --download (-d): Download original llama model from CDN: https://agi.gpt4.org/llama/"
echo " ex: \"/models/\" 7B"
echo " --all-in-one (-a): Execute --download, --convert & --quantize"
echo " --all-in-one (-a): Execute --convert & --quantize"
echo " ex: \"/models/\" 7B"
fi

View File

@@ -218,6 +218,9 @@ add_library(utils OBJECT
target_include_directories(utils PUBLIC .)
target_compile_features(utils PUBLIC cxx_std_11) # don't bump
target_link_libraries(utils PRIVATE ${LLAMA_EXTRA_LIBS})
if (BUILD_SHARED_LIBS)
set_target_properties(utils PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
add_library(ggml OBJECT
ggml.c
@@ -226,6 +229,9 @@ add_library(ggml OBJECT
target_include_directories(ggml PUBLIC .)
target_compile_features(ggml PUBLIC c_std_11) # don't bump
target_link_libraries(ggml PRIVATE Threads::Threads ${LLAMA_EXTRA_LIBS})
if (BUILD_SHARED_LIBS)
set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
add_library(llama
llama.cpp
@@ -234,6 +240,10 @@ add_library(llama
target_include_directories(llama PUBLIC .)
target_compile_features(llama PUBLIC cxx_std_11) # don't bump
target_link_libraries(llama PRIVATE utils ggml ${LLAMA_EXTRA_LIBS})
if (BUILD_SHARED_LIBS)
set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD)
endif()
#
# Executables

View File

@@ -7,8 +7,8 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
**Hot topics:**
- [Roadmap (short-term)](https://github.com/ggerganov/llama.cpp/discussions/457)
- New C-style API is now available: https://github.com/ggerganov/llama.cpp/pull/370
- [Added Alpaca support](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca)
- Cache input prompts for faster initialization: https://github.com/ggerganov/llama.cpp/issues/64
- Create a `llama.cpp` logo: https://github.com/ggerganov/llama.cpp/issues/105

View File

@@ -36,7 +36,8 @@ fname_out = sys.argv[3]
fout = open(fname_out, "wb")
fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
fout.write(struct.pack("i", 0x67676d66)) # magic: ggmf in hex
fout.write(struct.pack("i", 1)) # file version
fout.write(struct.pack("i", n_vocab))
fout.write(struct.pack("i", n_embd))
fout.write(struct.pack("i", n_mult))
@@ -49,27 +50,21 @@ fout.write(struct.pack("i", 4))
# This loop unchanged from convert-pth-to-ggml.py:
for i in range(tokenizer.vocab_size()):
if tokenizer.is_unknown(i):
# "<unk>" token (translated as ??)
text = " \u2047 ".encode("utf-8")
fout.write(struct.pack("i", len(text)))
fout.write(text)
elif tokenizer.is_control(i):
# "<s>"/"</s>" tokens
fout.write(struct.pack("i", 0))
text = b""
elif tokenizer.is_byte(i):
# "<U+XX>" tokens (which may be invalid UTF-8)
piece = tokenizer.id_to_piece(i)
if len(piece) != 6:
print("Invalid token: " + piece)
print(f"Invalid token: {piece}")
sys.exit(1)
byte_value = int(piece[3:-1], 16)
fout.write(struct.pack("i", 1))
fout.write(struct.pack("B", byte_value))
text = struct.pack("B", byte_value)
else:
# normal token. Uses U+2581 (LOWER ONE EIGHTH BLOCK) to represent spaces.
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
fout.write(struct.pack("i", len(text)))
fout.write(text)
fout.write(struct.pack("i", len(text)))
fout.write(text)
fout.write(struct.pack("f", tokenizer.get_score(i)))
def write_header(shape, dst_name, ftype_cur):
sname = dst_name.encode('utf-8')

View File

@@ -1,66 +0,0 @@
import os
import sys
from tqdm import tqdm
import requests
if len(sys.argv) < 3:
print("Usage: download-pth.py dir-model model-type\n")
print(" model-type: Available models 7B, 13B, 30B or 65B")
sys.exit(1)
modelsDir = sys.argv[1]
model = sys.argv[2]
num = {
"7B": 1,
"13B": 2,
"30B": 4,
"65B": 8,
}
if model not in num:
print(f"Error: model {model} is not valid, provide 7B, 13B, 30B or 65B")
sys.exit(1)
print(f"Downloading model {model}")
files = ["checklist.chk", "params.json"]
for i in range(num[model]):
files.append(f"consolidated.0{i}.pth")
resolved_path = os.path.abspath(os.path.join(modelsDir, model))
os.makedirs(resolved_path, exist_ok=True)
for file in files:
dest_path = os.path.join(resolved_path, file)
if os.path.exists(dest_path):
print(f"Skip file download, it already exists: {file}")
continue
url = f"https://agi.gpt4.org/llama/LLaMA/{model}/{file}"
response = requests.get(url, stream=True)
with open(dest_path, 'wb') as f:
with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t:
for chunk in response.iter_content(chunk_size=1024):
if chunk:
f.write(chunk)
t.update(len(chunk))
files2 = ["tokenizer_checklist.chk", "tokenizer.model"]
for file in files2:
dest_path = os.path.join(modelsDir, file)
if os.path.exists(dest_path):
print(f"Skip file download, it already exists: {file}")
continue
url = f"https://agi.gpt4.org/llama/LLaMA/{file}"
response = requests.get(url, stream=True)
with open(dest_path, 'wb') as f:
with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t:
for chunk in response.iter_content(chunk_size=1024):
if chunk:
f.write(chunk)
t.update(len(chunk))

View File

@@ -102,6 +102,9 @@ struct llama_context {
// decode output (2-dimensional array: [n_tokens][n_vocab])
std::vector<float> logits;
bool logits_all = false;
// input embedding (1-dimensional array: [n_embd])
std::vector<float> embedding;
};
struct llama_context_params llama_context_default_params() {
@@ -112,6 +115,7 @@ struct llama_context_params llama_context_default_params() {
/*.f16_kv =*/ false,
/*.logits_all =*/ false,
/*.vocab_only =*/ false,
/*.embedding =*/ false,
};
return result;
@@ -592,8 +596,6 @@ static bool llama_model_load(
fin.close();
}
lctx.logits.reserve(lctx.model.hparams.n_ctx);
lctx.t_load_us = ggml_time_us() - t_start_us;
return true;
@@ -727,11 +729,13 @@ static bool llama_eval_internal(
// V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
struct ggml_tensor * V_trans =
ggml_permute(ctx0,
ggml_reshape_3d(ctx0,
ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
n_embd/n_head, n_head, n_past + N),
1, 2, 0, 3);
ggml_cpy(ctx0,
ggml_permute(ctx0,
ggml_reshape_3d(ctx0,
ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
n_embd/n_head, n_head, n_past + N),
1, 2, 0, 3),
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head));
// KQV = transpose(V) * KQ_soft_max
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
@@ -789,6 +793,9 @@ static bool llama_eval_internal(
inpL = cur;
}
// used at the end to optionally extract the embeddings
struct ggml_tensor * embeddings = NULL;
// norm
{
inpL = ggml_rms_norm(ctx0, inpL);
@@ -797,6 +804,8 @@ static bool llama_eval_internal(
inpL = ggml_mul(ctx0,
ggml_repeat(ctx0, model.norm, inpL),
inpL);
embeddings = inpL;
}
// lm_head
@@ -819,15 +828,26 @@ static bool llama_eval_internal(
//embd_w.resize(n_vocab*N);
//memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
auto & logits_out = lctx.logits;
// extract logits
{
auto & logits_out = lctx.logits;
if (lctx.logits_all) {
logits_out.resize(n_vocab * N);
memcpy(logits_out.data(), (float *) ggml_get_data(inpL), sizeof(float)*n_vocab*N);
} else {
// return result for just the last token
logits_out.resize(n_vocab);
memcpy(logits_out.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
if (lctx.logits_all) {
logits_out.resize(n_vocab * N);
memcpy(logits_out.data(), (float *) ggml_get_data(inpL), sizeof(float)*n_vocab*N);
} else {
// return result for just the last token
logits_out.resize(n_vocab);
memcpy(logits_out.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
}
}
// extract embeddings
if (lctx.embedding.size()) {
auto & embedding_out = lctx.embedding;
embedding_out.resize(n_embd);
memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
}
if (mem_per_token == 0) {
@@ -1414,6 +1434,20 @@ struct llama_context * llama_init_from_file(
return nullptr;
}
// reserve memory for context buffers
{
const auto & hparams = ctx->model.hparams;
if (params.logits_all) {
ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
} else {
ctx->logits.reserve(hparams.n_ctx);
}
if (params.embedding){
ctx->embedding.reserve(hparams.n_embd);
}
}
return ctx;
}
@@ -1482,6 +1516,10 @@ float * llama_get_logits(struct llama_context * ctx) {
return ctx->logits.data();
}
float * llama_get_embeddings(struct llama_context * ctx) {
return ctx->embedding.data();
}
const char * llama_token_to_str(struct llama_context * ctx, llama_token token) {
if (token >= llama_n_vocab(ctx)) {
return nullptr;

View File

@@ -53,6 +53,7 @@ extern "C" {
bool f16_kv; // use fp16 for KV cache
bool logits_all; // the llama_eval() call computes all logits, not just the last one
bool vocab_only; // only load the vocabulary, no weights
bool embedding; // embedding mode only
};
LLAMA_API struct llama_context_params llama_context_default_params();
@@ -108,6 +109,10 @@ extern "C" {
// Cols: n_vocab
LLAMA_API float * llama_get_logits(struct llama_context * ctx);
// Get the embeddings for the input
// shape: [n_embd] (1-dimensional)
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
// Token Id -> String. Uses the vocabulary in the provided context
LLAMA_API const char * llama_token_to_str(struct llama_context * ctx, llama_token token);

View File

@@ -199,6 +199,7 @@ int main(int argc, char ** argv) {
lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16;
lparams.logits_all = params.perplexity;
lparams.embedding = params.embedding;
ctx = llama_init_from_file(params.model.c_str(), lparams);
@@ -258,6 +259,9 @@ int main(int argc, char ** argv) {
params.interactive = true;
}
// determine newline token
auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
fprintf(stderr, "\n");
fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
@@ -289,6 +293,7 @@ int main(int argc, char ** argv) {
std::vector<llama_token> embd;
int last_n_size = params.repeat_last_n;
std::vector<llama_token> last_n_tokens(last_n_size);
std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
@@ -321,6 +326,27 @@ int main(int argc, char ** argv) {
// the first thing we will do is to output the prompt, so set color accordingly
set_console_state(CONSOLE_STATE_PROMPT);
if (params.embedding){
embd = embd_inp;
if (embd.size() > 0) {
if (llama_eval(ctx, embd.data(), embd.size(), n_past, params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return 1;
}
}
const auto embeddings = llama_get_embeddings(ctx);
// TODO: print / use the embeddings
if (params.use_color) {
printf(ANSI_COLOR_RESET);
}
return 0;
}
while (remaining_tokens > 0 || params.interactive) {
// predict
if (embd.size() > 0) {
@@ -359,6 +385,16 @@ int main(int argc, char ** argv) {
last_n_tokens.push_back(id);
}
// replace end of text token with newline token when in interactive mode
if (id == llama_token_eos() && params.interactive) {
id = llama_token_newline.front();
if (params.antiprompt.size() != 0) {
// tokenize and inject first reverse prompt
const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
}
}
// add it to the context
embd.push_back(id);
@@ -451,12 +487,8 @@ int main(int argc, char ** argv) {
// end of text token
if (embd.back() == llama_token_eos()) {
if (params.interactive) {
is_interacting = true;
} else {
fprintf(stderr, " [end of text]\n");
break;
}
fprintf(stderr, " [end of text]\n");
break;
}
// In interactive mode, respect the maximum number of tokens and drop back to user input when reached.

View File

@@ -57,6 +57,7 @@ def main():
# )
args = parser.parse_args()
args.models_path = os.path.abspath(args.models_path)
if not os.path.isfile(args.quantize_script_path):
print(

View File

@@ -117,6 +117,10 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
params.model = argv[i];
} else if (arg == "-i" || arg == "--interactive") {
params.interactive = true;
} else if (arg == "--embedding") {
params.embedding = true;
} else if (arg == "--interactive-start") {
params.interactive = true;
} else if (arg == "--interactive-first") {
params.interactive_start = true;
} else if (arg == "-ins" || arg == "--instruct") {

View File

@@ -32,13 +32,17 @@ struct gpt_params {
std::string model = "models/lamma-7B/ggml-model.bin"; // model path
std::string prompt = "";
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
bool memory_f16 = false; // use f16 instead of f32 for memory kv
bool random_prompt = false; // do not randomize prompt if none provided
bool use_color = false; // use color to distinguish generations and inputs
bool interactive = false; // interactive mode
bool embedding = false; // get only sentence embedding
bool interactive_start = false; // wait for user input immediately
bool instruct = false; // instruction mode (used for Alpaca models)
bool ignore_eos = false; // do not stop generating after eos
bool perplexity = false; // compute perplexity over the prompt