mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-03-05 14:33:24 +02:00
Compare commits
14 Commits
master-aac
...
master-65b
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
65bdd52a86 | ||
|
|
fdd1860911 | ||
|
|
c943d823c1 | ||
|
|
f2c754e1c3 | ||
|
|
11da1a85cd | ||
|
|
235b610d65 | ||
|
|
b061ba9e2a | ||
|
|
527b6fba1d | ||
|
|
d7b7484f74 | ||
|
|
7487137227 | ||
|
|
bbca06e269 | ||
|
|
fb98254f99 | ||
|
|
049aa16b8c | ||
|
|
2322ec223a |
@@ -250,6 +250,15 @@ if (LLAMA_CUBLAS)
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
|
||||
endif()
|
||||
|
||||
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
|
||||
if (LLAMA_CUDA_DMMV_F16)
|
||||
set(CMAKE_CUDA_ARCHITECTURES "61") # needed for f16 CUDA intrinsics
|
||||
else()
|
||||
set(CMAKE_CUDA_ARCHITECTURES "52") # lowest CUDA 12 standard
|
||||
endif()
|
||||
endif()
|
||||
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
|
||||
|
||||
else()
|
||||
message(WARNING "cuBLAS not found")
|
||||
endif()
|
||||
@@ -493,22 +502,6 @@ if (BUILD_SHARED_LIBS)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (GGML_SOURCES_CUDA)
|
||||
message(STATUS "GGML CUDA sources found, configuring CUDA architecture")
|
||||
set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES "native")
|
||||
set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
|
||||
|
||||
set_property(TARGET ggml_static PROPERTY CUDA_ARCHITECTURES "native")
|
||||
set_property(TARGET ggml_static PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
|
||||
|
||||
if (BUILD_SHARED_LIBS)
|
||||
set_property(TARGET ggml_shared PROPERTY CUDA_ARCHITECTURES "native")
|
||||
set_property(TARGET ggml_shared PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
|
||||
endif()
|
||||
|
||||
set_property(TARGET llama PROPERTY CUDA_ARCHITECTURES "native")
|
||||
endif()
|
||||
|
||||
|
||||
#
|
||||
# programs, examples and tests
|
||||
|
||||
23
README.md
23
README.md
@@ -9,12 +9,8 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
|
||||
|
||||
**Hot topics:**
|
||||
|
||||
- p1 : LLM-based code completion engine at the edge : https://github.com/ggml-org/p1/discussions/1
|
||||
- Roadmap June 2023: https://github.com/ggerganov/llama.cpp/discussions/1729
|
||||
- GPU support with Metal (Apple Silicon): https://github.com/ggerganov/llama.cpp/pull/1642
|
||||
- High-quality 2,3,4,5,6-bit quantization: https://github.com/ggerganov/llama.cpp/pull/1684
|
||||
- Multi-GPU support: https://github.com/ggerganov/llama.cpp/pull/1607
|
||||
- Training LLaMA models from scratch: https://github.com/ggerganov/llama.cpp/pull/1652
|
||||
- CPU threading improvements: https://github.com/ggerganov/llama.cpp/pull/1632
|
||||
|
||||
<details>
|
||||
<summary>Table of Contents</summary>
|
||||
@@ -33,6 +29,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
|
||||
<li><a href="#quantization">Quantization</a></li>
|
||||
<li><a href="#interactive-mode">Interactive mode</a></li>
|
||||
<li><a href="#instruction-mode-with-alpaca">Instruction mode with Alpaca</a></li>
|
||||
<li><a href="#using-openllama">Using OpenLLaMA</a></li>
|
||||
<li><a href="#using-gpt4all">Using GPT4All</a></li>
|
||||
<li><a href="#using-pygmalion-7b--metharme-7b">Using Pygmalion 7B & Metharme 7B</a></li>
|
||||
<li><a href="#obtaining-the-facebook-llama-original-model-and-stanford-alpaca-model-data">Obtaining the Facebook LLaMA original model and Stanford Alpaca model data</a></li>
|
||||
@@ -344,7 +341,7 @@ Building the program with BLAS support may lead to some performance improvements
|
||||
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
|
||||
| LLAMA_CUDA_DMMV_Y | Positive integer | 1 | Block size in y direction for the CUDA dequantization + mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
|
||||
| LLAMA_CUDA_DMMV_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels. Can improve performance on relatively recent GPUs. |
|
||||
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value 2 1 can improve performance for slow GPUs. |
|
||||
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
|
||||
|
||||
- #### CLBlast
|
||||
|
||||
@@ -378,7 +375,7 @@ Building the program with BLAS support may lead to some performance improvements
|
||||
```sh
|
||||
git clone https://github.com/CNugteren/CLBlast.git
|
||||
mkdir CLBlast/build
|
||||
cd CLBLast/build
|
||||
cd CLBlast/build
|
||||
cmake .. -DBUILD_SHARED_LIBS=OFF -DTUNERS=OFF
|
||||
cmake --build . --config Release
|
||||
cmake --install . --prefix /some/path
|
||||
@@ -547,6 +544,13 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
|
||||
>
|
||||
```
|
||||
|
||||
### Using [OpenLLaMA](https://github.com/openlm-research/open_llama)
|
||||
|
||||
OpenLLaMA is an openly licensed reproduction of Meta's original LLaMA model. It uses the same architecture and is a drop-in replacement for the original LLaMA weights.
|
||||
|
||||
- Download the [3B](https://huggingface.co/openlm-research/open_llama_3b), [7B](https://huggingface.co/openlm-research/open_llama_7b), or [13B](https://huggingface.co/openlm-research/open_llama_13b) model from Hugging Face.
|
||||
- Convert the model to ggml FP16 format using `python convert.py <path to OpenLLaMA directory>`
|
||||
|
||||
### Using [GPT4All](https://github.com/nomic-ai/gpt4all)
|
||||
|
||||
- Obtain the `tokenizer.model` file from LLaMA model and put it to `models`
|
||||
@@ -676,12 +680,13 @@ Upon completion of the aforementioned steps, you will have successfully compiled
|
||||
```
|
||||
GGML_OPENCL_PLATFORM=0
|
||||
GGML_OPENCL_DEVICE=0
|
||||
export LD_LIBRARY_PATH=/system/vendor/lib64:$LD_LIBRARY_PATH
|
||||
./main (...)
|
||||
export LD_LIBRARY_PATH=/vendor/lib64:$LD_LIBRARY_PATH
|
||||
```
|
||||
|
||||
For easy and swift re-execution, consider documenting this final part in a .sh script file. This will enable you to rerun the process with minimal hassle.
|
||||
|
||||
Place your desired model into the `/llama.cpp/models/` directory and execute the `./main (...)` script.
|
||||
|
||||
### Docker
|
||||
|
||||
#### Prerequisites
|
||||
|
||||
95
convert.py
95
convert.py
@@ -130,6 +130,14 @@ TENSORS_LIST = make_tensors_list()
|
||||
TENSORS_SET = set(TENSORS_LIST)
|
||||
|
||||
|
||||
def find_n_mult(n_ff: int, n_embd: int) -> int:
|
||||
# hardcoded magic range
|
||||
for n_mult in range(256, 1, -1):
|
||||
calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult
|
||||
if calc_ff == n_ff:
|
||||
return n_mult
|
||||
return 1
|
||||
|
||||
@dataclass
|
||||
class Params:
|
||||
n_vocab: int
|
||||
@@ -137,21 +145,61 @@ class Params:
|
||||
n_mult: int
|
||||
n_head: int
|
||||
n_layer: int
|
||||
file_type: GGMLFileType
|
||||
|
||||
@staticmethod
|
||||
def guessed(model: 'LazyModel', file_type: GGMLFileType) -> 'Params':
|
||||
n_vocab, n_embd = model["tok_embeddings.weight"].shape
|
||||
def guessed(model: 'LazyModel') -> 'Params':
|
||||
# try transformer naming first
|
||||
n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape
|
||||
|
||||
# try transformer naming first
|
||||
if "model.layers.0.self_attn.q_proj.weight" in model:
|
||||
n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
|
||||
else:
|
||||
n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
|
||||
|
||||
n_head=n_embd // 128 # guessed
|
||||
|
||||
return Params(
|
||||
n_vocab=n_vocab,
|
||||
n_embd=n_embd,
|
||||
n_mult=256,
|
||||
n_head=n_embd // 128,
|
||||
n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model),
|
||||
file_type=file_type,
|
||||
n_head=n_head,
|
||||
n_layer=n_layer,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
|
||||
config = json.load(open(config_path))
|
||||
|
||||
n_vocab = config["vocab_size"];
|
||||
n_embd = config["hidden_size"];
|
||||
n_head = config["num_attention_heads"];
|
||||
n_layer = config["num_hidden_layers"];
|
||||
n_ff = config["intermediate_size"];
|
||||
|
||||
n_mult = find_n_mult(n_ff, n_embd);
|
||||
|
||||
return Params(
|
||||
n_vocab=n_vocab,
|
||||
n_embd=n_embd,
|
||||
n_mult=n_mult,
|
||||
n_head=n_head,
|
||||
n_layer=n_layer,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def load(model_plus: 'ModelPlus') -> 'Params':
|
||||
orig_config_path = model_plus.paths[0].parent / "params.json"
|
||||
hf_transformer_config_path = model_plus.paths[0].parent / "config.json"
|
||||
|
||||
if hf_transformer_config_path.exists():
|
||||
params = Params.loadHFTransformerJson(model_plus.model, hf_transformer_config_path)
|
||||
else:
|
||||
params = Params.guessed(model_plus.model)
|
||||
|
||||
print(f'params: n_vocab:{params.n_vocab} n_embd:{params.n_embd} n_mult:{params.n_mult} n_head:{params.n_head} n_layer:{params.n_layer}')
|
||||
return params
|
||||
|
||||
|
||||
class SentencePieceVocab:
|
||||
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
|
||||
@@ -595,18 +643,17 @@ def permute_lazy(lazy_tensor: LazyTensor, n_head: int) -> LazyTensor:
|
||||
return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description)
|
||||
|
||||
|
||||
def convert_transformers_to_orig(model: LazyModel) -> LazyModel:
|
||||
def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel:
|
||||
out: LazyModel = {}
|
||||
out["tok_embeddings.weight"] = model["model.embed_tokens.weight"]
|
||||
out["norm.weight"] = model["model.norm.weight"]
|
||||
out["output.weight"] = model["lm_head.weight"]
|
||||
|
||||
n_head = model["model.layers.0.self_attn.q_proj.weight"].shape[1] // 128
|
||||
for i in itertools.count():
|
||||
if f"model.layers.{i}.self_attn.q_proj.weight" not in model:
|
||||
break
|
||||
out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], n_head)
|
||||
out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], n_head)
|
||||
out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head)
|
||||
out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head)
|
||||
out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
|
||||
out[f"layers.{i}.attention.wo.weight"] = model[f"model.layers.{i}.self_attn.o_proj.weight"]
|
||||
|
||||
@@ -920,7 +967,7 @@ class OutputFile:
|
||||
def __init__(self, fname_out: Path) -> None:
|
||||
self.fout = open(fname_out, "wb")
|
||||
|
||||
def write_file_header(self, params: Params) -> None:
|
||||
def write_file_header(self, params: Params, file_type: GGMLFileType) -> None:
|
||||
self.fout.write(b"ggjt"[::-1]) # magic
|
||||
values = [
|
||||
1, # file version
|
||||
@@ -930,7 +977,7 @@ class OutputFile:
|
||||
params.n_head,
|
||||
params.n_layer,
|
||||
params.n_embd // params.n_head, # rot (obsolete)
|
||||
params.file_type.value,
|
||||
file_type.value,
|
||||
]
|
||||
self.fout.write(struct.pack("i" * len(values), *values))
|
||||
|
||||
@@ -951,17 +998,17 @@ class OutputFile:
|
||||
def write_vocab_only(fname_out: Path, vocab: Vocab) -> None:
|
||||
of = OutputFile(fname_out)
|
||||
params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0,
|
||||
n_head=1, n_layer=0, file_type=GGMLFileType.AllF32)
|
||||
n_head=1, n_layer=0)
|
||||
of = OutputFile(fname_out)
|
||||
of.write_file_header(params)
|
||||
of.write_file_header(params, file_type=GGMLFileType.AllF32)
|
||||
of.write_vocab(vocab)
|
||||
of.fout.close()
|
||||
|
||||
@staticmethod
|
||||
def write_all(fname_out: Path, params: Params, model: LazyModel, vocab: Vocab) -> None:
|
||||
def write_all(fname_out: Path, params: Params, file_type: GGMLFileType, model: LazyModel, vocab: Vocab) -> None:
|
||||
check_vocab_size(params, vocab)
|
||||
of = OutputFile(fname_out)
|
||||
of.write_file_header(params)
|
||||
of.write_file_header(params, file_type)
|
||||
print("Writing vocab...")
|
||||
of.write_vocab(vocab)
|
||||
|
||||
@@ -997,11 +1044,11 @@ def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> GGMLFi
|
||||
raise Exception(f"Unexpected combination of types: {name_to_type}")
|
||||
|
||||
|
||||
def do_necessary_conversions(model: LazyModel) -> LazyModel:
|
||||
def do_necessary_conversions(model: LazyModel, params: Params) -> LazyModel:
|
||||
model = handle_quantization(model)
|
||||
|
||||
if "lm_head.weight" in model:
|
||||
model = convert_transformers_to_orig(model)
|
||||
model = convert_transformers_to_orig(model, params)
|
||||
model = filter_and_sort_tensors(model)
|
||||
|
||||
return model
|
||||
@@ -1107,14 +1154,14 @@ def load_vocab(path: Path) -> SentencePieceVocab:
|
||||
return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
|
||||
|
||||
|
||||
def default_outfile(model_paths: List[Path], params: Params) -> Path:
|
||||
def default_outfile(model_paths: List[Path], file_type: GGMLFileType) -> Path:
|
||||
namestr = {
|
||||
GGMLFileType.AllF32: "f32",
|
||||
GGMLFileType.MostlyF16: "f16",
|
||||
GGMLFileType.MostlyQ4_0: "q4_0",
|
||||
GGMLFileType.MostlyQ4_1: "q4_1",
|
||||
GGMLFileType.PerLayerIsQ4_1: "q4_1",
|
||||
}[params.file_type]
|
||||
}[file_type]
|
||||
ret = model_paths[0].parent / f"ggml-model-{namestr}.bin"
|
||||
if ret in model_paths:
|
||||
sys.stderr.write(
|
||||
@@ -1164,13 +1211,13 @@ def main(args_in: Optional[List[str]] = None) -> None:
|
||||
else:
|
||||
vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
|
||||
vocab = load_vocab(vocab_dir)
|
||||
params = Params.load(model_plus)
|
||||
model = model_plus.model
|
||||
model = do_necessary_conversions(model)
|
||||
model = do_necessary_conversions(model, params)
|
||||
output_type = pick_output_type(model, args.outtype)
|
||||
model = convert_to_output_type(model, output_type)
|
||||
params = Params.guessed(model, output_type)
|
||||
outfile = args.outfile or default_outfile(model_plus.paths, params)
|
||||
OutputFile.write_all(outfile, params, model, vocab)
|
||||
outfile = args.outfile or default_outfile(model_plus.paths, output_type)
|
||||
OutputFile.write_all(outfile, params, output_type, model, vocab)
|
||||
print(f"Wrote {outfile}")
|
||||
|
||||
|
||||
|
||||
@@ -536,7 +536,7 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
|
||||
return res;
|
||||
}
|
||||
|
||||
struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
|
||||
std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params) {
|
||||
auto lparams = llama_context_default_params();
|
||||
|
||||
lparams.n_ctx = params.n_ctx;
|
||||
@@ -552,25 +552,33 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
|
||||
lparams.logits_all = params.perplexity;
|
||||
lparams.embedding = params.embedding;
|
||||
|
||||
llama_context * lctx = llama_init_from_file(params.model.c_str(), lparams);
|
||||
|
||||
if (lctx == NULL) {
|
||||
llama_model * model = llama_load_model_from_file(params.model.c_str(), lparams);
|
||||
if (model == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
|
||||
return NULL;
|
||||
return std::make_tuple(nullptr, nullptr);
|
||||
}
|
||||
|
||||
llama_context * lctx = llama_new_context_with_model(model, lparams);
|
||||
if (lctx == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
|
||||
llama_free_model(model);
|
||||
return std::make_tuple(nullptr, nullptr);
|
||||
}
|
||||
|
||||
if (!params.lora_adapter.empty()) {
|
||||
int err = llama_apply_lora_from_file(lctx,
|
||||
int err = llama_model_apply_lora_from_file(model,
|
||||
params.lora_adapter.c_str(),
|
||||
params.lora_base.empty() ? NULL : params.lora_base.c_str(),
|
||||
params.n_threads);
|
||||
if (err != 0) {
|
||||
fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
|
||||
return NULL;
|
||||
llama_free(lctx);
|
||||
llama_free_model(model);
|
||||
return std::make_tuple(nullptr, nullptr);
|
||||
}
|
||||
}
|
||||
|
||||
return lctx;
|
||||
return std::make_tuple(model, lctx);
|
||||
}
|
||||
|
||||
void console_init(console_state & con_st) {
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
#include <random>
|
||||
#include <thread>
|
||||
#include <unordered_map>
|
||||
#include <tuple>
|
||||
|
||||
#if !defined (_WIN32)
|
||||
#include <stdio.h>
|
||||
@@ -95,7 +96,7 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
|
||||
// Model utils
|
||||
//
|
||||
|
||||
struct llama_context * llama_init_from_gpt_params(const gpt_params & params);
|
||||
std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params);
|
||||
|
||||
//
|
||||
// Console utils
|
||||
|
||||
@@ -37,11 +37,12 @@ int main(int argc, char ** argv) {
|
||||
|
||||
llama_init_backend();
|
||||
|
||||
llama_model * model;
|
||||
llama_context * ctx;
|
||||
|
||||
// load the model
|
||||
ctx = llama_init_from_gpt_params(params);
|
||||
if (ctx == NULL) {
|
||||
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||
if (model == NULL) {
|
||||
fprintf(stderr, "%s: error: unable to load model\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
@@ -90,6 +91,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
llama_print_timings(ctx);
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -107,12 +107,13 @@ int main(int argc, char ** argv) {
|
||||
|
||||
llama_init_backend();
|
||||
|
||||
llama_model * model;
|
||||
llama_context * ctx;
|
||||
g_ctx = &ctx;
|
||||
|
||||
// load the model and apply lora adapter, if any
|
||||
ctx = llama_init_from_gpt_params(params);
|
||||
if (ctx == NULL) {
|
||||
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||
if (model == NULL) {
|
||||
fprintf(stderr, "%s: error: unable to load model\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
@@ -139,6 +140,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
llama_print_timings(ctx);
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -147,6 +149,7 @@ int main(int argc, char ** argv) {
|
||||
if (params.export_cgraph) {
|
||||
llama_eval_export(ctx, "llama.ggml");
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -666,6 +669,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
llama_print_timings(ctx);
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -149,11 +149,12 @@ int main(int argc, char ** argv) {
|
||||
|
||||
llama_init_backend();
|
||||
|
||||
llama_model * model;
|
||||
llama_context * ctx;
|
||||
|
||||
// load the model and apply lora adapter, if any
|
||||
ctx = llama_init_from_gpt_params(params);
|
||||
if (ctx == NULL) {
|
||||
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||
if (model == NULL) {
|
||||
fprintf(stderr, "%s: error: unable to load model\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
@@ -169,6 +170,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
llama_print_timings(ctx);
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -320,6 +320,7 @@ int main(int argc, char ** argv) {
|
||||
fprintf(stderr, "Loading model\n");
|
||||
|
||||
const int64_t t_main_start_us = ggml_time_us();
|
||||
llama_model * model;
|
||||
llama_context * ctx;
|
||||
|
||||
{
|
||||
@@ -330,10 +331,18 @@ int main(int argc, char ** argv) {
|
||||
lparams.f16_kv = false;
|
||||
lparams.use_mlock = false;
|
||||
|
||||
ctx = llama_init_from_file(params.model.c_str(), lparams);
|
||||
model = llama_load_model_from_file(params.model.c_str(), lparams);
|
||||
|
||||
if (model == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
|
||||
return 1;
|
||||
}
|
||||
|
||||
ctx = llama_new_context_with_model(model, lparams);
|
||||
|
||||
if (ctx == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
|
||||
fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
|
||||
llama_free_model(model);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
@@ -357,6 +366,7 @@ int main(int argc, char ** argv) {
|
||||
fprintf(stderr, "%s: error: Quantization should be tested with a float model, "
|
||||
"this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type);
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
return 1;
|
||||
}
|
||||
included_layers++;
|
||||
@@ -415,6 +425,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
// report timing
|
||||
{
|
||||
const int64_t t_main_end_us = ggml_time_us();
|
||||
|
||||
@@ -35,12 +35,22 @@ int main(int argc, char ** argv) {
|
||||
auto last_n_tokens_data = std::vector<llama_token>(params.repeat_last_n, 0);
|
||||
|
||||
// init
|
||||
auto ctx = llama_init_from_file(params.model.c_str(), lparams);
|
||||
auto model = llama_load_model_from_file(params.model.c_str(), lparams);
|
||||
if (model == nullptr) {
|
||||
return 1;
|
||||
}
|
||||
auto ctx = llama_new_context_with_model(model, lparams);
|
||||
if (ctx == nullptr) {
|
||||
llama_free_model(model);
|
||||
return 1;
|
||||
}
|
||||
auto tokens = std::vector<llama_token>(params.n_ctx);
|
||||
auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), int(tokens.size()), true);
|
||||
|
||||
if (n_prompt_tokens < 1) {
|
||||
fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -84,6 +94,8 @@ int main(int argc, char ** argv) {
|
||||
printf("%s", next_token_str);
|
||||
if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
|
||||
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
return 1;
|
||||
}
|
||||
n_past += 1;
|
||||
@@ -91,23 +103,27 @@ int main(int argc, char ** argv) {
|
||||
|
||||
printf("\n\n");
|
||||
|
||||
// free old model
|
||||
// free old context
|
||||
llama_free(ctx);
|
||||
|
||||
// load new model
|
||||
auto ctx2 = llama_init_from_file(params.model.c_str(), lparams);
|
||||
// make new context
|
||||
auto ctx2 = llama_new_context_with_model(model, lparams);
|
||||
|
||||
// Load state (rng, logits, embedding and kv_cache) from file
|
||||
{
|
||||
FILE *fp_read = fopen("dump_state.bin", "rb");
|
||||
if (state_size != llama_get_state_size(ctx2)) {
|
||||
fprintf(stderr, "\n%s : failed to validate state size\n", __func__);
|
||||
llama_free(ctx2);
|
||||
llama_free_model(model);
|
||||
return 1;
|
||||
}
|
||||
|
||||
const size_t ret = fread(state_mem, 1, state_size, fp_read);
|
||||
if (ret != state_size) {
|
||||
fprintf(stderr, "\n%s : failed to read state\n", __func__);
|
||||
llama_free(ctx2);
|
||||
llama_free_model(model);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -138,6 +154,8 @@ int main(int argc, char ** argv) {
|
||||
printf("%s", next_token_str);
|
||||
if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
|
||||
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
|
||||
llama_free(ctx2);
|
||||
llama_free_model(model);
|
||||
return 1;
|
||||
}
|
||||
n_past += 1;
|
||||
@@ -145,5 +163,8 @@ int main(int argc, char ** argv) {
|
||||
|
||||
printf("\n\n");
|
||||
|
||||
llama_free(ctx2);
|
||||
llama_free_model(model);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -115,6 +115,7 @@ struct llama_server_context {
|
||||
std::vector<llama_token> embd;
|
||||
std::vector<llama_token> last_n_tokens;
|
||||
|
||||
llama_model * model = nullptr;
|
||||
llama_context * ctx = nullptr;
|
||||
gpt_params params;
|
||||
|
||||
@@ -130,6 +131,10 @@ struct llama_server_context {
|
||||
llama_free(ctx);
|
||||
ctx = nullptr;
|
||||
}
|
||||
if (model) {
|
||||
llama_free_model(model);
|
||||
model = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
void rewind() {
|
||||
@@ -150,8 +155,8 @@ struct llama_server_context {
|
||||
|
||||
bool loadModel(const gpt_params & params_) {
|
||||
params = params_;
|
||||
ctx = llama_init_from_gpt_params(params);
|
||||
if (ctx == nullptr) {
|
||||
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||
if (model == nullptr) {
|
||||
LOG_ERROR("unable to load model", { { "model", params_.model } });
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -68,11 +68,12 @@ int main(int argc, char ** argv)
|
||||
|
||||
llama_init_backend();
|
||||
|
||||
llama_context * ctx ;
|
||||
llama_model * model;
|
||||
llama_context * ctx;
|
||||
|
||||
ctx = llama_init_from_gpt_params( params );
|
||||
std::tie(model, ctx) = llama_init_from_gpt_params( params );
|
||||
|
||||
if ( ctx == NULL )
|
||||
if ( model == NULL )
|
||||
{
|
||||
fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
|
||||
return 1;
|
||||
@@ -170,6 +171,7 @@ int main(int argc, char ** argv)
|
||||
} // wend of main loop
|
||||
|
||||
llama_free( ctx );
|
||||
llama_free_model( model );
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -3054,7 +3054,8 @@ int main(int argc, char ** argv) {
|
||||
struct llama_context_params llama_params = llama_context_default_params();
|
||||
llama_params.vocab_only = true;
|
||||
|
||||
struct llama_context * lctx = llama_init_from_file(params.fn_vocab_model, llama_params);
|
||||
struct llama_model * lmodel = llama_load_model_from_file(params.fn_vocab_model, llama_params);
|
||||
struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);
|
||||
|
||||
struct llama_vocab vocab;
|
||||
{
|
||||
@@ -3395,6 +3396,8 @@ int main(int argc, char ** argv) {
|
||||
delete[] compute_addr;
|
||||
delete[] compute_buf_0;
|
||||
delete[] compute_buf_1;
|
||||
llama_free(lctx);
|
||||
llama_free_model(lmodel);
|
||||
ggml_free(model.ctx);
|
||||
|
||||
return 0;
|
||||
|
||||
50
flake.nix
50
flake.nix
@@ -9,27 +9,33 @@
|
||||
inherit (pkgs.stdenv) isAarch64 isDarwin;
|
||||
inherit (pkgs.lib) optionals;
|
||||
isM1 = isAarch64 && isDarwin;
|
||||
osSpecific =
|
||||
if isM1 then with pkgs.darwin.apple_sdk_11_0.frameworks; [ Accelerate MetalKit MetalPerformanceShaders MetalPerformanceShadersGraph ]
|
||||
else if isDarwin then with pkgs.darwin.apple_sdk.frameworks; [ Accelerate CoreGraphics CoreVideo ]
|
||||
else [ ];
|
||||
pkgs = import nixpkgs {
|
||||
inherit system;
|
||||
};
|
||||
llama-python = pkgs.python310.withPackages (ps: with ps; [
|
||||
numpy
|
||||
sentencepiece
|
||||
]);
|
||||
in
|
||||
{
|
||||
osSpecific = if isM1 then
|
||||
with pkgs.darwin.apple_sdk_11_0.frameworks; [
|
||||
Accelerate
|
||||
MetalKit
|
||||
MetalPerformanceShaders
|
||||
MetalPerformanceShadersGraph
|
||||
]
|
||||
else if isDarwin then
|
||||
with pkgs.darwin.apple_sdk.frameworks; [
|
||||
Accelerate
|
||||
CoreGraphics
|
||||
CoreVideo
|
||||
]
|
||||
else
|
||||
[ ];
|
||||
pkgs = import nixpkgs { inherit system; };
|
||||
llama-python =
|
||||
pkgs.python310.withPackages (ps: with ps; [ numpy sentencepiece ]);
|
||||
in {
|
||||
packages.default = pkgs.stdenv.mkDerivation {
|
||||
name = "llama.cpp";
|
||||
src = ./.;
|
||||
postPatch =
|
||||
if isM1 then ''
|
||||
substituteInPlace ./ggml-metal.m \
|
||||
--replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/ggml-metal.metal\";"
|
||||
'' else "";
|
||||
postPatch = if isM1 then ''
|
||||
substituteInPlace ./ggml-metal.m \
|
||||
--replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
|
||||
'' else
|
||||
"";
|
||||
nativeBuildInputs = with pkgs; [ cmake ];
|
||||
buildInputs = osSpecific;
|
||||
cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" ] ++ (optionals isM1 [
|
||||
@@ -62,11 +68,7 @@
|
||||
};
|
||||
apps.default = self.apps.${system}.llama;
|
||||
devShells.default = pkgs.mkShell {
|
||||
packages = with pkgs; [
|
||||
cmake
|
||||
llama-python
|
||||
] ++ osSpecific;
|
||||
packages = with pkgs; [ cmake llama-python ] ++ osSpecific;
|
||||
};
|
||||
}
|
||||
);
|
||||
});
|
||||
}
|
||||
|
||||
135
ggml.c
135
ggml.c
@@ -24,6 +24,7 @@
|
||||
#include <stdio.h>
|
||||
#include <float.h>
|
||||
#include <limits.h>
|
||||
#include <stdarg.h>
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
#include <unistd.h>
|
||||
@@ -4734,10 +4735,19 @@ struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * nam
|
||||
return tensor;
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...) {
|
||||
va_list args;
|
||||
va_start(args, fmt);
|
||||
vsnprintf(tensor->name, sizeof(tensor->name), fmt, args);
|
||||
va_end(args);
|
||||
return tensor;
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_view_tensor(
|
||||
struct ggml_context * ctx,
|
||||
const struct ggml_tensor * src) {
|
||||
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data);
|
||||
ggml_format_name(result, "%s (view)", src->name);
|
||||
|
||||
result->nb[0] = src->nb[0];
|
||||
result->nb[1] = src->nb[1];
|
||||
@@ -5899,6 +5909,11 @@ struct ggml_tensor * ggml_cpy_impl(
|
||||
|
||||
// make a view of the destination
|
||||
struct ggml_tensor * result = ggml_view_tensor(ctx, b);
|
||||
if (strlen(b->name) > 0) {
|
||||
ggml_format_name(result, "%s (copy of %s)", b->name, a->name);
|
||||
} else {
|
||||
ggml_format_name(result, "%s (copy)", a->name);
|
||||
}
|
||||
|
||||
result->op = GGML_OP_CPY;
|
||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||
@@ -5935,6 +5950,7 @@ struct ggml_tensor * ggml_cont_impl(
|
||||
}
|
||||
|
||||
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
||||
ggml_format_name(result, "%s (cont)", a->name);
|
||||
|
||||
result->op = GGML_OP_CONT;
|
||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||
@@ -5978,6 +5994,7 @@ struct ggml_tensor * ggml_reshape(
|
||||
}
|
||||
|
||||
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a->data);
|
||||
ggml_format_name(result, "%s (reshaped)", a->name);
|
||||
|
||||
result->op = GGML_OP_RESHAPE;
|
||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||
@@ -6002,6 +6019,7 @@ struct ggml_tensor * ggml_reshape_1d(
|
||||
|
||||
const int64_t ne[1] = { ne0 };
|
||||
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a->data);
|
||||
ggml_format_name(result, "%s (reshaped)", a->name);
|
||||
|
||||
result->op = GGML_OP_RESHAPE;
|
||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||
@@ -6027,6 +6045,7 @@ struct ggml_tensor * ggml_reshape_2d(
|
||||
|
||||
const int64_t ne[2] = { ne0, ne1 };
|
||||
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a->data);
|
||||
ggml_format_name(result, "%s (reshaped)", a->name);
|
||||
|
||||
result->op = GGML_OP_RESHAPE;
|
||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||
@@ -6053,6 +6072,7 @@ struct ggml_tensor * ggml_reshape_3d(
|
||||
|
||||
const int64_t ne[3] = { ne0, ne1, ne2 };
|
||||
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a->data);
|
||||
ggml_format_name(result, "%s (reshaped)", a->name);
|
||||
|
||||
result->op = GGML_OP_RESHAPE;
|
||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||
@@ -6081,6 +6101,7 @@ struct ggml_tensor * ggml_reshape_4d(
|
||||
|
||||
const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
|
||||
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a->data);
|
||||
ggml_format_name(result, "%s (reshaped)", a->name);
|
||||
|
||||
result->op = GGML_OP_RESHAPE;
|
||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||
@@ -6105,10 +6126,12 @@ struct ggml_tensor * ggml_view_1d(
|
||||
}
|
||||
|
||||
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset);
|
||||
ggml_format_name(result, "%s (view)", a->name);
|
||||
|
||||
ggml_scratch_save(ctx);
|
||||
|
||||
struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
|
||||
ggml_set_name(offs, "offset");
|
||||
memcpy(offs->data, &offset, 2*sizeof(int32_t));
|
||||
|
||||
ggml_scratch_load(ctx);
|
||||
@@ -6141,10 +6164,12 @@ struct ggml_tensor * ggml_view_2d(
|
||||
const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
|
||||
|
||||
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
|
||||
ggml_format_name(result, "%s (view)", a->name);
|
||||
|
||||
ggml_scratch_save(ctx);
|
||||
|
||||
struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
|
||||
ggml_set_name(offs, "offset");
|
||||
memcpy(offs->data, &offset, 2*sizeof(int32_t));
|
||||
|
||||
ggml_scratch_load(ctx);
|
||||
@@ -6183,10 +6208,12 @@ struct ggml_tensor * ggml_view_3d(
|
||||
const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
|
||||
|
||||
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
|
||||
ggml_format_name(result, "%s (view)", a->name);
|
||||
|
||||
ggml_scratch_save(ctx);
|
||||
|
||||
struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
|
||||
ggml_set_name(offs, "offset");
|
||||
memcpy(offs->data, &offset, 2*sizeof(int32_t));
|
||||
|
||||
ggml_scratch_load(ctx);
|
||||
@@ -6227,10 +6254,12 @@ struct ggml_tensor * ggml_view_4d(
|
||||
const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 };
|
||||
|
||||
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset);
|
||||
ggml_format_name(result, "%s (view)", a->name);
|
||||
|
||||
ggml_scratch_save(ctx);
|
||||
|
||||
struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
|
||||
ggml_set_name(offs, "offset");
|
||||
memcpy(offs->data, &offset, 2*sizeof(int32_t));
|
||||
|
||||
ggml_scratch_load(ctx);
|
||||
@@ -6276,6 +6305,7 @@ struct ggml_tensor * ggml_permute(
|
||||
}
|
||||
|
||||
struct ggml_tensor * result = ggml_view_tensor(ctx, a);
|
||||
ggml_format_name(result, "%s (permuted)", a->name);
|
||||
|
||||
int ne[GGML_MAX_DIMS];
|
||||
int nb[GGML_MAX_DIMS];
|
||||
@@ -6335,6 +6365,7 @@ struct ggml_tensor * ggml_transpose(
|
||||
}
|
||||
|
||||
struct ggml_tensor * result = ggml_view_tensor(ctx, a);
|
||||
ggml_format_name(result, "%s (transposed)", a->name);
|
||||
|
||||
result->ne[0] = a->ne[1];
|
||||
result->ne[1] = a->ne[0];
|
||||
@@ -16004,7 +16035,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
|
||||
GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES);
|
||||
|
||||
if (strlen(node->name) == 0) {
|
||||
snprintf(node->name, sizeof(node->name), "leaf_%d", cgraph->n_leafs);
|
||||
ggml_format_name(node, "leaf_%d", cgraph->n_leafs);
|
||||
}
|
||||
|
||||
cgraph->leafs[cgraph->n_leafs] = node;
|
||||
@@ -16013,7 +16044,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
|
||||
GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES);
|
||||
|
||||
if (strlen(node->name) == 0) {
|
||||
snprintf(node->name, sizeof(node->name), "node_%d", cgraph->n_nodes);
|
||||
ggml_format_name(node, "node_%d", cgraph->n_nodes);
|
||||
}
|
||||
|
||||
cgraph->nodes[cgraph->n_nodes] = node;
|
||||
@@ -17397,6 +17428,26 @@ static struct ggml_tensor * ggml_graph_get_parent(const struct ggml_cgraph * cgr
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void ggml_graph_dump_dot_node_edge(FILE * fp, const struct ggml_cgraph * gb, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) {
|
||||
struct ggml_tensor * gparent = ggml_graph_get_parent(gb, node);
|
||||
struct ggml_tensor * gparent0 = ggml_graph_get_parent(gb, parent);
|
||||
fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"%s\"; ]\n",
|
||||
gparent0 ? (void *) gparent0 : (void *) parent,
|
||||
gparent0 ? "g" : "x",
|
||||
gparent ? (void *) gparent : (void *) node,
|
||||
gparent ? "g" : "x",
|
||||
gparent ? "empty" : "vee",
|
||||
gparent ? "dashed" : "solid",
|
||||
label);
|
||||
}
|
||||
|
||||
static void ggml_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) {
|
||||
fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"%s\"; ]\n",
|
||||
(void *) parent, "x",
|
||||
(void *) node, "x",
|
||||
label);
|
||||
}
|
||||
|
||||
void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) {
|
||||
char color[16];
|
||||
|
||||
@@ -17432,7 +17483,9 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
|
||||
(void *) node, color);
|
||||
|
||||
if (strlen(node->name) > 0) {
|
||||
fprintf(fp, "%s |", node->name);
|
||||
fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type));
|
||||
} else {
|
||||
fprintf(fp, "(%s)|", ggml_type_name(node->type));
|
||||
}
|
||||
|
||||
if (node->n_dims == 2) {
|
||||
@@ -17441,7 +17494,6 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
|
||||
fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], GGML_OP_SYMBOL[node->op]);
|
||||
}
|
||||
|
||||
|
||||
if (node->grad) {
|
||||
fprintf(fp, " | <g>%s\"; ]\n", GGML_OP_SYMBOL[node->grad->op]);
|
||||
} else {
|
||||
@@ -17460,18 +17512,29 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
|
||||
(void *) node, color);
|
||||
|
||||
if (strlen(node->name) > 0) {
|
||||
fprintf(fp, "%s | ", node->name);
|
||||
fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type));
|
||||
} else {
|
||||
fprintf(fp, "(%s)|", ggml_type_name(node->type));
|
||||
}
|
||||
if (ggml_nelements(node) == 1) {
|
||||
if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
|
||||
fprintf(fp, "%d", ggml_get_i32_1d(node, 0));
|
||||
|
||||
fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
|
||||
if (ggml_nelements(node) < 5) {
|
||||
fprintf(fp, " | (");
|
||||
for (int j = 0; j < ggml_nelements(node); j++) {
|
||||
if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
|
||||
fprintf(fp, "%d", ggml_get_i32_1d(node, j));
|
||||
}
|
||||
else if (node->type == GGML_TYPE_F32 || node->type == GGML_TYPE_F16) {
|
||||
fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, j));
|
||||
}
|
||||
else {
|
||||
fprintf(fp, "#");
|
||||
}
|
||||
if (j < ggml_nelements(node) - 1) {
|
||||
fprintf(fp, ", ");
|
||||
}
|
||||
}
|
||||
else {
|
||||
fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, 0));
|
||||
}
|
||||
}
|
||||
else {
|
||||
fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
|
||||
fprintf(fp, ")");
|
||||
}
|
||||
fprintf(fp, "\"; ]\n");
|
||||
}
|
||||
@@ -17479,30 +17542,20 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
|
||||
for (int i = 0; i < gb->n_nodes; i++) {
|
||||
struct ggml_tensor * node = gb->nodes[i];
|
||||
|
||||
struct ggml_tensor * parent = ggml_graph_get_parent(gb, node);
|
||||
|
||||
if (node->src0) {
|
||||
struct ggml_tensor * parent0 = ggml_graph_get_parent(gb, node->src0);
|
||||
|
||||
fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"x\"; ]\n",
|
||||
parent0 ? (void *) parent0 : (void *) node->src0,
|
||||
parent0 ? "g" : "x",
|
||||
parent ? (void *) parent : (void *) node,
|
||||
parent ? "g" : "x",
|
||||
parent ? "empty" : "vee",
|
||||
parent ? "dashed" : "solid");
|
||||
ggml_graph_dump_dot_node_edge(fp, gb, node, node->src0, "x");
|
||||
}
|
||||
|
||||
if (node->src1) {
|
||||
struct ggml_tensor * parent1 = ggml_graph_get_parent(gb, node->src1);
|
||||
ggml_graph_dump_dot_node_edge(fp, gb, node, node->src1, "y");
|
||||
}
|
||||
|
||||
fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"y\"; ]\n",
|
||||
parent1 ? (void *) parent1 : (void *) node->src1,
|
||||
parent1 ? "g" : "x",
|
||||
parent ? (void *) parent : (void *) node,
|
||||
parent ? "g" : "x",
|
||||
parent ? "empty" : "vee",
|
||||
parent ? "dashed" : "solid");
|
||||
for (int j = 0; j < GGML_MAX_OPT; j++) {
|
||||
if (node->opt[j]) {
|
||||
char label[16];
|
||||
snprintf(label, sizeof(label), "opt %d", j);
|
||||
ggml_graph_dump_dot_node_edge(fp, gb, node, node->opt[j], label);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -17510,15 +17563,19 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
|
||||
struct ggml_tensor * node = gb->leafs[i];
|
||||
|
||||
if (node->src0) {
|
||||
fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"x\"; ]\n",
|
||||
(void *) node->src0, "x",
|
||||
(void *) node, "x");
|
||||
ggml_graph_dump_dot_leaf_edge(fp, node, node->src0, "x");
|
||||
}
|
||||
|
||||
if (node->src1) {
|
||||
fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"y\"; ]\n",
|
||||
(void *) node->src1, "x",
|
||||
(void *) node, "x");
|
||||
ggml_graph_dump_dot_leaf_edge(fp, node, node->src1, "y");
|
||||
}
|
||||
|
||||
for (int j = 0; j < GGML_MAX_OPT; j++) {
|
||||
if (node->opt[j]) {
|
||||
char label[16];
|
||||
snprintf(label, sizeof(label), "opt %d", j);
|
||||
ggml_graph_dump_dot_leaf_edge(fp, node, node->opt[j], label);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
1
ggml.h
1
ggml.h
@@ -563,6 +563,7 @@ extern "C" {
|
||||
|
||||
GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor);
|
||||
GGML_API struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name);
|
||||
GGML_API struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...);
|
||||
|
||||
//
|
||||
// operations on tensors with backpropagation
|
||||
|
||||
179
llama.cpp
179
llama.cpp
@@ -182,6 +182,19 @@ struct llama_kv_cache {
|
||||
}
|
||||
};
|
||||
|
||||
struct llama_vocab {
|
||||
using id = int32_t;
|
||||
using token = std::string;
|
||||
|
||||
struct token_score {
|
||||
token tok;
|
||||
float score;
|
||||
};
|
||||
|
||||
std::unordered_map<token, id> token_to_id;
|
||||
std::vector<token_score> id_to_token;
|
||||
};
|
||||
|
||||
struct llama_model {
|
||||
e_model type = MODEL_UNKNOWN;
|
||||
|
||||
@@ -198,10 +211,6 @@ struct llama_model {
|
||||
// context
|
||||
struct ggml_context * ctx = NULL;
|
||||
|
||||
// key + value cache for the self attention
|
||||
// TODO: move to llama_state
|
||||
struct llama_kv_cache kv_self;
|
||||
|
||||
// the model memory buffer
|
||||
llama_ctx_buffer buf;
|
||||
|
||||
@@ -215,6 +224,11 @@ struct llama_model {
|
||||
// for quantize-stats only
|
||||
std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
|
||||
|
||||
int64_t t_load_us = 0;
|
||||
int64_t t_start_us = 0;
|
||||
|
||||
llama_vocab vocab;
|
||||
|
||||
~llama_model() {
|
||||
if (ctx) {
|
||||
ggml_free(ctx);
|
||||
@@ -233,24 +247,11 @@ struct llama_model {
|
||||
}
|
||||
};
|
||||
|
||||
struct llama_vocab {
|
||||
using id = int32_t;
|
||||
using token = std::string;
|
||||
|
||||
struct token_score {
|
||||
token tok;
|
||||
float score;
|
||||
};
|
||||
|
||||
std::unordered_map<token, id> token_to_id;
|
||||
std::vector<token_score> id_to_token;
|
||||
};
|
||||
|
||||
struct llama_context {
|
||||
llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
|
||||
|
||||
std::mt19937 rng;
|
||||
|
||||
int64_t t_load_us = 0;
|
||||
int64_t t_start_us = 0;
|
||||
bool has_evaluated_once = false;
|
||||
|
||||
int64_t t_sample_us = 0;
|
||||
@@ -261,8 +262,16 @@ struct llama_context {
|
||||
int32_t n_eval = 0; // number of eval calls
|
||||
int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
|
||||
|
||||
llama_model model;
|
||||
llama_vocab vocab;
|
||||
const llama_model & model;
|
||||
const llama_vocab & vocab;
|
||||
|
||||
bool model_owner = false;
|
||||
|
||||
int64_t t_load_us;
|
||||
int64_t t_start_us;
|
||||
|
||||
// key + value cache for the self attention
|
||||
struct llama_kv_cache kv_self;
|
||||
|
||||
size_t mem_per_token = 0;
|
||||
|
||||
@@ -1033,7 +1042,8 @@ static const char *llama_model_type_name(e_model type) {
|
||||
|
||||
static void llama_model_load_internal(
|
||||
const std::string & fname,
|
||||
llama_context & lctx,
|
||||
llama_model & model,
|
||||
llama_vocab & vocab,
|
||||
int n_ctx,
|
||||
int n_batch,
|
||||
int n_gpu_layers,
|
||||
@@ -1047,12 +1057,11 @@ static void llama_model_load_internal(
|
||||
llama_progress_callback progress_callback,
|
||||
void * progress_callback_user_data) {
|
||||
|
||||
lctx.t_start_us = ggml_time_us();
|
||||
model.t_start_us = ggml_time_us();
|
||||
|
||||
std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only));
|
||||
|
||||
lctx.vocab = std::move(ml->file_loaders.at(0)->vocab);
|
||||
auto & model = lctx.model;
|
||||
vocab = std::move(ml->file_loaders.at(0)->vocab);
|
||||
model.hparams = ml->file_loaders.at(0)->hparams;
|
||||
model.n_gpu_layers = n_gpu_layers;
|
||||
llama_file_version file_version = ml->file_loaders.at(0)->file_version;
|
||||
@@ -1122,15 +1131,15 @@ static void llama_model_load_internal(
|
||||
|
||||
// create the ggml context
|
||||
{
|
||||
lctx.model.buf.resize(ctx_size);
|
||||
model.buf.resize(ctx_size);
|
||||
if (use_mlock) {
|
||||
lctx.model.mlock_buf.init(lctx.model.buf.addr);
|
||||
lctx.model.mlock_buf.grow_to(lctx.model.buf.size);
|
||||
model.mlock_buf.init(model.buf.addr);
|
||||
model.mlock_buf.grow_to(model.buf.size);
|
||||
}
|
||||
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ lctx.model.buf.size,
|
||||
/*.mem_buffer =*/ lctx.model.buf.addr,
|
||||
/*.mem_size =*/ model.buf.size,
|
||||
/*.mem_buffer =*/ model.buf.addr,
|
||||
/*.no_alloc =*/ ml->use_mmap,
|
||||
};
|
||||
|
||||
@@ -1311,7 +1320,7 @@ static void llama_model_load_internal(
|
||||
}
|
||||
#endif
|
||||
|
||||
ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
|
||||
ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);
|
||||
|
||||
if (progress_callback) {
|
||||
progress_callback(1.0f, progress_callback_user_data);
|
||||
@@ -1321,12 +1330,13 @@ static void llama_model_load_internal(
|
||||
|
||||
// loading time will be recalculate after the first eval, so
|
||||
// we take page faults deferred by mmap() into consideration
|
||||
lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
|
||||
model.t_load_us = ggml_time_us() - model.t_start_us;
|
||||
}
|
||||
|
||||
static bool llama_model_load(
|
||||
const std::string & fname,
|
||||
llama_context & lctx,
|
||||
llama_model & model,
|
||||
llama_vocab & vocab,
|
||||
int n_ctx,
|
||||
int n_batch,
|
||||
int n_gpu_layers,
|
||||
@@ -1340,7 +1350,7 @@ static bool llama_model_load(
|
||||
llama_progress_callback progress_callback,
|
||||
void *progress_callback_user_data) {
|
||||
try {
|
||||
llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
|
||||
llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
|
||||
use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
|
||||
return true;
|
||||
} catch (const std::exception & err) {
|
||||
@@ -1378,7 +1388,7 @@ static bool llama_eval_internal(
|
||||
const auto & model = lctx.model;
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const auto & kv_self = model.kv_self;
|
||||
const auto & kv_self = lctx.kv_self;
|
||||
|
||||
LLAMA_ASSERT(!!kv_self.ctx);
|
||||
|
||||
@@ -1726,7 +1736,7 @@ static bool llama_eval_internal(
|
||||
//memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);
|
||||
|
||||
// update kv token count
|
||||
lctx.model.kv_self.n = n_past + N;
|
||||
lctx.kv_self.n = n_past + N;
|
||||
|
||||
// extract logits
|
||||
{
|
||||
@@ -2005,9 +2015,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
|
||||
for (size_t i = 0; i < candidates->size; ++i) {
|
||||
cum_sum += candidates->data[i].p;
|
||||
|
||||
// Check if the running sum is greater than p or if we have kept at least min_keep tokens
|
||||
if (cum_sum > p && i >= min_keep) {
|
||||
last_idx = i;
|
||||
// Check if the running sum is at least p or if we have kept at least min_keep tokens
|
||||
// we set the last index to i+1 to indicate that the current iterate should be included in the set
|
||||
if (cum_sum >= p && i + 1 >= min_keep) {
|
||||
last_idx = i + 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -2634,12 +2645,39 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
// interface implementation
|
||||
//
|
||||
|
||||
struct llama_context * llama_init_from_file(
|
||||
struct llama_model * llama_load_model_from_file(
|
||||
const char * path_model,
|
||||
struct llama_context_params params) {
|
||||
ggml_time_init();
|
||||
|
||||
llama_context * ctx = new llama_context;
|
||||
llama_model * model = new llama_model;
|
||||
|
||||
ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
||||
|
||||
if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
|
||||
params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
|
||||
params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
|
||||
delete model;
|
||||
fprintf(stderr, "%s: failed to load model\n", __func__);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return model;
|
||||
}
|
||||
|
||||
void llama_free_model(struct llama_model * model) {
|
||||
delete model;
|
||||
}
|
||||
|
||||
struct llama_context * llama_new_context_with_model(
|
||||
struct llama_model * model,
|
||||
struct llama_context_params params) {
|
||||
|
||||
if (!model) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
llama_context * ctx = new llama_context(*model, model->vocab);
|
||||
|
||||
if (params.seed < 0) {
|
||||
params.seed = time(NULL);
|
||||
@@ -2667,24 +2705,16 @@ struct llama_context * llama_init_from_file(
|
||||
|
||||
ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
||||
|
||||
if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu,
|
||||
params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
|
||||
params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
|
||||
fprintf(stderr, "%s: failed to load model\n", __func__);
|
||||
llama_free(ctx);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// reserve memory for context buffers
|
||||
if (!params.vocab_only) {
|
||||
if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
|
||||
if (!kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
|
||||
fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
|
||||
llama_free(ctx);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
{
|
||||
const size_t memory_size = ggml_nbytes(ctx->model.kv_self.k) + ggml_nbytes(ctx->model.kv_self.v);
|
||||
const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
|
||||
fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
|
||||
}
|
||||
|
||||
@@ -2736,8 +2766,8 @@ struct llama_context * llama_init_from_file(
|
||||
|
||||
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
|
||||
|
||||
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
|
||||
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size, 0));
|
||||
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
|
||||
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.addr, ctx->kv_self.buf.size, 0));
|
||||
|
||||
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
|
||||
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
|
||||
@@ -2748,7 +2778,23 @@ struct llama_context * llama_init_from_file(
|
||||
return ctx;
|
||||
}
|
||||
|
||||
struct llama_context * llama_init_from_file(
|
||||
const char * path_model,
|
||||
struct llama_context_params params) {
|
||||
|
||||
struct llama_model * model = llama_load_model_from_file(path_model, params);
|
||||
if (!model) {
|
||||
return nullptr;
|
||||
}
|
||||
struct llama_context * ctx = llama_new_context_with_model(model, params);
|
||||
ctx->model_owner = true;
|
||||
return ctx;
|
||||
}
|
||||
|
||||
void llama_free(struct llama_context * ctx) {
|
||||
if (ctx->model_owner) {
|
||||
delete &ctx->model;
|
||||
}
|
||||
delete ctx;
|
||||
}
|
||||
|
||||
@@ -2765,11 +2811,9 @@ int llama_model_quantize(
|
||||
}
|
||||
}
|
||||
|
||||
int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
|
||||
int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) {
|
||||
fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
|
||||
|
||||
auto & model = ctx->model;
|
||||
|
||||
const int64_t t_start_lora_us = ggml_time_us();
|
||||
|
||||
auto fin = std::ifstream(path_lora, std::ios::binary);
|
||||
@@ -3012,7 +3056,16 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
|
||||
|
||||
int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
|
||||
try {
|
||||
return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
|
||||
return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads);
|
||||
} catch (const std::exception & err) {
|
||||
fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, const char * path_base_model, int n_threads) {
|
||||
try {
|
||||
return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads);
|
||||
} catch (const std::exception & err) {
|
||||
fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
|
||||
return 1;
|
||||
@@ -3020,7 +3073,7 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
|
||||
}
|
||||
|
||||
int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
|
||||
return ctx->model.kv_self.n;
|
||||
return ctx->kv_self.n;
|
||||
}
|
||||
|
||||
#define LLAMA_MAX_RNG_STATE (64*1024)
|
||||
@@ -3045,7 +3098,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
|
||||
const size_t s_embedding = ctx->embedding.size() * sizeof(float);
|
||||
const size_t s_kv_size = sizeof(size_t);
|
||||
const size_t s_kv_ntok = sizeof(int);
|
||||
const size_t s_kv = ctx->model.kv_self.buf.size;
|
||||
const size_t s_kv = ctx->kv_self.buf.size;
|
||||
|
||||
const size_t s_total = (
|
||||
+ s_rng_size
|
||||
@@ -3111,7 +3164,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
|
||||
|
||||
// copy kv cache
|
||||
{
|
||||
const auto & kv_self = ctx->model.kv_self;
|
||||
const auto & kv_self = ctx->kv_self;
|
||||
const auto & hparams = ctx->model.hparams;
|
||||
const int n_layer = hparams.n_layer;
|
||||
const int n_embd = hparams.n_embd;
|
||||
@@ -3215,7 +3268,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
||||
|
||||
// set kv cache
|
||||
{
|
||||
const auto & kv_self = ctx->model.kv_self;
|
||||
const auto & kv_self = ctx->kv_self;
|
||||
const auto & hparams = ctx->model.hparams;
|
||||
const int n_layer = hparams.n_layer;
|
||||
const int n_embd = hparams.n_embd;
|
||||
@@ -3259,7 +3312,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
||||
ggml_free(cpy_ctx);
|
||||
}
|
||||
|
||||
ctx->model.kv_self.n = kv_ntok;
|
||||
ctx->kv_self.n = kv_ntok;
|
||||
}
|
||||
|
||||
const size_t nread = inp - src;
|
||||
@@ -3506,6 +3559,6 @@ const char * llama_print_system_info(void) {
|
||||
}
|
||||
|
||||
// For internal test use
|
||||
std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
|
||||
const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
|
||||
return ctx->model.tensors_by_name;
|
||||
}
|
||||
|
||||
35
llama.h
35
llama.h
@@ -26,6 +26,14 @@
|
||||
# define LLAMA_API
|
||||
#endif
|
||||
|
||||
#ifdef __GNUC__
|
||||
# define DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
|
||||
#elif defined(_MSC_VER)
|
||||
# define DEPRECATED(func, hint) __declspec(deprecated(hint)) func
|
||||
#else
|
||||
# define DEPRECATED(func, hint) func
|
||||
#endif
|
||||
|
||||
#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
|
||||
#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
|
||||
#define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf'
|
||||
@@ -53,6 +61,7 @@ extern "C" {
|
||||
// TODO: show sample usage
|
||||
//
|
||||
|
||||
struct llama_model;
|
||||
struct llama_context;
|
||||
|
||||
typedef int llama_token;
|
||||
@@ -136,12 +145,23 @@ extern "C" {
|
||||
|
||||
LLAMA_API int64_t llama_time_us();
|
||||
|
||||
LLAMA_API struct llama_model * llama_load_model_from_file(
|
||||
const char * path_model,
|
||||
struct llama_context_params params);
|
||||
|
||||
LLAMA_API void llama_free_model(struct llama_model * model);
|
||||
|
||||
LLAMA_API struct llama_context * llama_new_context_with_model(
|
||||
struct llama_model * model,
|
||||
struct llama_context_params params);
|
||||
|
||||
// Various functions for loading a ggml llama model.
|
||||
// Allocate (almost) all memory needed for the model.
|
||||
// Return NULL on failure
|
||||
LLAMA_API struct llama_context * llama_init_from_file(
|
||||
LLAMA_API DEPRECATED(struct llama_context * llama_init_from_file(
|
||||
const char * path_model,
|
||||
struct llama_context_params params);
|
||||
struct llama_context_params params),
|
||||
"please use llama_load_model_from_file combined with llama_new_context_with_model instead");
|
||||
|
||||
// Frees all allocated memory
|
||||
LLAMA_API void llama_free(struct llama_context * ctx);
|
||||
@@ -158,8 +178,15 @@ extern "C" {
|
||||
// The model needs to be reloaded before applying a new adapter, otherwise the adapter
|
||||
// will be applied on top of the previous one
|
||||
// Returns 0 on success
|
||||
LLAMA_API int llama_apply_lora_from_file(
|
||||
LLAMA_API DEPRECATED(int llama_apply_lora_from_file(
|
||||
struct llama_context * ctx,
|
||||
const char * path_lora,
|
||||
const char * path_base_model,
|
||||
int n_threads),
|
||||
"please use llama_model_apply_lora_from_file instead");
|
||||
|
||||
LLAMA_API int llama_model_apply_lora_from_file(
|
||||
const struct llama_model * model,
|
||||
const char * path_lora,
|
||||
const char * path_base_model,
|
||||
int n_threads);
|
||||
@@ -310,7 +337,7 @@ extern "C" {
|
||||
#include <string>
|
||||
struct ggml_tensor;
|
||||
|
||||
std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
|
||||
const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows
|
||||
#include "ggml.h"
|
||||
|
||||
#include <math.h>
|
||||
@@ -5,6 +6,10 @@
|
||||
#include <stdlib.h>
|
||||
#include <assert.h>
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
#define MAX_NARGS 3
|
||||
|
||||
#undef MIN
|
||||
@@ -197,8 +202,23 @@ bool check_gradient(
|
||||
float max_error_abs,
|
||||
float max_error_rel) {
|
||||
|
||||
static int n_threads = -1;
|
||||
if (n_threads < 0) {
|
||||
n_threads = GGML_DEFAULT_N_THREADS;
|
||||
|
||||
const char *env = getenv("GGML_N_THREADS");
|
||||
if (env) {
|
||||
n_threads = atoi(env);
|
||||
}
|
||||
|
||||
printf("GGML_N_THREADS = %d\n", n_threads);
|
||||
}
|
||||
|
||||
struct ggml_cgraph gf = ggml_build_forward (f);
|
||||
gf.n_threads = n_threads;
|
||||
|
||||
struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false);
|
||||
gb.n_threads = n_threads;
|
||||
|
||||
ggml_graph_compute(ctx0, &gf);
|
||||
ggml_graph_reset (&gf);
|
||||
|
||||
@@ -181,6 +181,7 @@ int main(void) {
|
||||
|
||||
test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f}, 0);
|
||||
test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f}, 0.7f);
|
||||
test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f}, 0.8f);
|
||||
test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1);
|
||||
|
||||
test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f}, 0.25f);
|
||||
|
||||
@@ -28,6 +28,7 @@ int main(int argc, char **argv) {
|
||||
|
||||
fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
|
||||
|
||||
llama_model * model;
|
||||
llama_context * ctx;
|
||||
|
||||
// load the vocab
|
||||
@@ -36,10 +37,18 @@ int main(int argc, char **argv) {
|
||||
|
||||
lparams.vocab_only = true;
|
||||
|
||||
ctx = llama_init_from_file(fname.c_str(), lparams);
|
||||
model = llama_load_model_from_file(fname.c_str(), lparams);
|
||||
|
||||
if (model == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
|
||||
return 1;
|
||||
}
|
||||
|
||||
ctx = llama_new_context_with_model(model, lparams);
|
||||
|
||||
if (ctx == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
|
||||
llama_free_model(model);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
@@ -48,6 +57,8 @@ int main(int argc, char **argv) {
|
||||
|
||||
if (n_vocab != 32000) {
|
||||
fprintf(stderr, "%s : expected 32000 tokens, got %d\n", __func__, n_vocab);
|
||||
llama_free_model(model);
|
||||
llama_free(ctx);
|
||||
return 2;
|
||||
}
|
||||
|
||||
@@ -77,10 +88,13 @@ int main(int argc, char **argv) {
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
llama_free_model(model);
|
||||
llama_free(ctx);
|
||||
return 3;
|
||||
}
|
||||
}
|
||||
|
||||
llama_free_model(model);
|
||||
llama_free(ctx);
|
||||
|
||||
return 0;
|
||||
|
||||
Reference in New Issue
Block a user