mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-02-05 13:53:23 +02:00
Compare commits
11 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
feff4aa846 | ||
|
|
0abc6a2c25 | ||
|
|
bd35cb0ae3 | ||
|
|
78203641fe | ||
|
|
e6b7801bd1 | ||
|
|
e665744317 | ||
|
|
d4c3c10fad | ||
|
|
2a825116b6 | ||
|
|
4dc4f5f14a | ||
|
|
c837981bba | ||
|
|
3c26a1644d |
16
.github/workflows/build.yml
vendored
16
.github/workflows/build.yml
vendored
@@ -375,7 +375,7 @@ jobs:
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
@@ -401,7 +401,7 @@ jobs:
|
||||
continue-on-error: true
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: add oneAPI to apt
|
||||
shell: bash
|
||||
@@ -442,7 +442,7 @@ jobs:
|
||||
continue-on-error: true
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: add oneAPI to apt
|
||||
shell: bash
|
||||
@@ -546,7 +546,7 @@ jobs:
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v1
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
@@ -576,7 +576,7 @@ jobs:
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v1
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
@@ -610,7 +610,7 @@ jobs:
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v1
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
@@ -969,14 +969,14 @@ jobs:
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Install
|
||||
id: depends
|
||||
run: |
|
||||
$ErrorActionPreference = "Stop"
|
||||
write-host "Downloading AMD HIP SDK Installer"
|
||||
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-23.Q4-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
|
||||
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
|
||||
write-host "Installing AMD HIP SDK"
|
||||
Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
|
||||
write-host "Completed AMD HIP SDK installation"
|
||||
|
||||
1
.github/workflows/server.yml
vendored
1
.github/workflows/server.yml
vendored
@@ -173,6 +173,7 @@ jobs:
|
||||
if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
|
||||
run: |
|
||||
cd examples/server/tests
|
||||
$env:PYTHONIOENCODING = ":replace"
|
||||
behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
|
||||
|
||||
- name: Slow tests
|
||||
|
||||
@@ -139,10 +139,16 @@ set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location o
|
||||
# determining _precisely_ which defines are necessary for the llama-config
|
||||
# package.
|
||||
#
|
||||
set(GGML_TRANSIENT_DEFINES)
|
||||
get_target_property(GGML_DIRECTORY ggml SOURCE_DIR)
|
||||
get_directory_property(GGML_DIR_DEFINES DIRECTORY ${GGML_DIRECTORY} COMPILE_DEFINITIONS)
|
||||
if (GGML_DIR_DEFINES)
|
||||
list(APPEND GGML_TRANSIENT_DEFINES ${GGML_DIR_DEFINES})
|
||||
endif()
|
||||
get_target_property(GGML_TARGET_DEFINES ggml COMPILE_DEFINITIONS)
|
||||
set(GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES} ${GGML_DIR_DEFINES})
|
||||
if (GGML_TARGET_DEFINES)
|
||||
list(APPEND GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES})
|
||||
endif()
|
||||
get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)
|
||||
|
||||
set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h)
|
||||
|
||||
1
Makefile
1
Makefile
@@ -1440,6 +1440,7 @@ llama-server: \
|
||||
examples/server/system-prompts.js.hpp \
|
||||
examples/server/prompt-formats.js.hpp \
|
||||
examples/server/json-schema-to-grammar.mjs.hpp \
|
||||
examples/server/loading.html.hpp \
|
||||
common/json.hpp \
|
||||
common/stb_image.h \
|
||||
$(OBJ_ALL)
|
||||
|
||||
@@ -720,6 +720,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
||||
params.prompt = value;
|
||||
}
|
||||
));
|
||||
add_opt(llama_arg(
|
||||
{"--no-perf"},
|
||||
format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
|
||||
[](gpt_params & params) {
|
||||
params.no_perf = true;
|
||||
params.sparams.no_perf = true;
|
||||
}
|
||||
).set_env("LLAMA_ARG_NO_PERF"));
|
||||
add_opt(llama_arg(
|
||||
{"-f", "--file"}, "FNAME",
|
||||
"a file containing the prompt (default: none)",
|
||||
|
||||
@@ -820,7 +820,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
|
||||
}
|
||||
llama_kv_cache_clear(lctx);
|
||||
llama_synchronize(lctx);
|
||||
llama_perf_reset(lctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
llama_perf_context_reset(lctx);
|
||||
}
|
||||
|
||||
iparams.model = model;
|
||||
@@ -916,6 +916,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
|
||||
cparams.cb_eval_user_data = params.cb_eval_user_data;
|
||||
cparams.offload_kqv = !params.no_kv_offload;
|
||||
cparams.flash_attn = params.flash_attn;
|
||||
cparams.no_perf = params.no_perf;
|
||||
|
||||
cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
|
||||
cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
|
||||
|
||||
@@ -124,6 +124,7 @@ struct gpt_sampler_params {
|
||||
float mirostat_eta = 0.10f; // learning rate
|
||||
bool penalize_nl = false; // consider newlines as a repeatable token
|
||||
bool ignore_eos = false;
|
||||
bool no_perf = false; // disable performance metrics
|
||||
|
||||
std::vector<enum gpt_sampler_type> samplers = {
|
||||
GPT_SAMPLER_TYPE_TOP_K,
|
||||
@@ -246,6 +247,7 @@ struct gpt_params {
|
||||
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
|
||||
bool cont_batching = true; // insert new sequences for decoding on-the-fly
|
||||
bool flash_attn = false; // flash attention
|
||||
bool no_perf = false; // disable performance metrics
|
||||
|
||||
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
|
||||
bool logits_all = false; // return logits for all tokens in the batch
|
||||
|
||||
@@ -142,7 +142,7 @@ std::string gpt_sampler_params::print() const {
|
||||
struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) {
|
||||
llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
|
||||
|
||||
lparams.no_perf = false; // TODO: control via params
|
||||
lparams.no_perf = params.no_perf;
|
||||
|
||||
auto * result = new gpt_sampler {
|
||||
/* .params = */ params,
|
||||
@@ -257,10 +257,10 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
|
||||
// TODO: measure grammar performance
|
||||
|
||||
if (gsmpl) {
|
||||
llama_perf_print(gsmpl->chain, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
|
||||
llama_perf_sampler_print(gsmpl->chain);
|
||||
}
|
||||
if (ctx) {
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
llama_perf_context_print(ctx);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -626,6 +626,9 @@ class Model:
|
||||
if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae":
|
||||
# ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct
|
||||
res = "exaone"
|
||||
if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085":
|
||||
# ref: https://huggingface.co/microsoft/phi-2
|
||||
res = "phi-2"
|
||||
|
||||
if res is None:
|
||||
logger.warning("\n")
|
||||
|
||||
@@ -98,6 +98,7 @@ models = [
|
||||
{'name': "bloom", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom", },
|
||||
{'name': "gpt3-finnish", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", },
|
||||
{"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
|
||||
{"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
|
||||
]
|
||||
|
||||
|
||||
|
||||
@@ -363,7 +363,13 @@ if __name__ == '__main__':
|
||||
yield (name, cast(torch.Tensor, LoraTorchTensor(tensor.A, tensor.B)))
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
dest = super().modify_tensors(data_torch, name, bid)
|
||||
dest = list(super().modify_tensors(data_torch, name, bid))
|
||||
# some archs may have the same tensor for lm_head and output (tie word embeddings)
|
||||
# in this case, adapters targeting lm_head will fail when using llama-export-lora
|
||||
# therefore, we ignore them for now
|
||||
# see: https://github.com/ggerganov/llama.cpp/issues/9065
|
||||
if name == "lm_head.weight" and len(dest) == 0:
|
||||
raise ValueError("lm_head is present in adapter, but is ignored in base model")
|
||||
for dest_name, dest_data in dest:
|
||||
assert isinstance(dest_data, LoraTorchTensor)
|
||||
lora_a, lora_b = dest_data.get_lora_A_B()
|
||||
|
||||
@@ -187,7 +187,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
LOG_TEE("\n");
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
llama_perf_context_print(ctx);
|
||||
|
||||
llama_batch_free(batch);
|
||||
|
||||
|
||||
@@ -200,8 +200,8 @@ let t_main_end = ggml_time_us()
|
||||
|
||||
print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n\n")
|
||||
|
||||
llama_perf_print(UnsafeRawPointer(context), LLAMA_PERF_TYPE_CONTEXT)
|
||||
llama_perf_print(UnsafeRawPointer(smpl), LLAMA_PERF_TYPE_SAMPLER_CHAIN)
|
||||
llama_perf_sampler_print(smpl)
|
||||
llama_perf_context_print(context)
|
||||
|
||||
private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
|
||||
let utf8Count = text.utf8.count
|
||||
|
||||
@@ -229,8 +229,8 @@ int main(int argc, char ** argv) {
|
||||
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
|
||||
|
||||
LOG_TEE("\n");
|
||||
llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
llama_perf_sampler_print(smpl);
|
||||
llama_perf_context_print(ctx);
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
|
||||
@@ -306,7 +306,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
LOG_TEE("\n");
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
llama_perf_context_print(ctx);
|
||||
|
||||
// clean up
|
||||
llama_batch_free(batch);
|
||||
|
||||
@@ -182,7 +182,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
LOG_TEE("\n");
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
llama_perf_context_print(ctx);
|
||||
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
@@ -637,7 +637,7 @@ int main(int argc, char ** argv) {
|
||||
g_collector.save_imatrix();
|
||||
|
||||
LOG_TEE("\n");
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
llama_perf_context_print(ctx);
|
||||
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
@@ -1630,7 +1630,7 @@ int main(int argc, char ** argv) {
|
||||
fflush(p_err->fout);
|
||||
}
|
||||
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
llama_perf_context_print(ctx);
|
||||
|
||||
llama_free(ctx);
|
||||
|
||||
|
||||
@@ -39,7 +39,7 @@ python ./examples/llava/llava_surgery.py -m path/to/MobileVLM-1.7B
|
||||
3. Use `convert_image_encoder_to_gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:
|
||||
|
||||
```sh
|
||||
python ./examples/llava/convert_image_encoder_to_gguf \
|
||||
python ./examples/llava/convert_image_encoder_to_gguf.py \
|
||||
-m path/to/clip-vit-large-patch14-336 \
|
||||
--llava-projector path/to/MobileVLM-1.7B/llava.projector \
|
||||
--output-dir path/to/MobileVLM-1.7B \
|
||||
@@ -47,7 +47,7 @@ python ./examples/llava/convert_image_encoder_to_gguf \
|
||||
```
|
||||
|
||||
```sh
|
||||
python ./examples/llava/convert_image_encoder_to_gguf \
|
||||
python ./examples/llava/convert_image_encoder_to_gguf.py \
|
||||
-m path/to/clip-vit-large-patch14-336 \
|
||||
--llava-projector path/to/MobileVLM-1.7B_V2/llava.projector \
|
||||
--output-dir path/to/MobileVLM-1.7B_V2 \
|
||||
@@ -57,12 +57,12 @@ python ./examples/llava/convert_image_encoder_to_gguf \
|
||||
4. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF:
|
||||
|
||||
```sh
|
||||
python ./examples/convert_legacy_llama.py path/to/MobileVLM-1.7B
|
||||
python ./examples/convert_legacy_llama.py path/to/MobileVLM-1.7B --skip-unknown
|
||||
```
|
||||
|
||||
5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k`
|
||||
5. Use `quantize` to convert LLaMA part's DataType from `fp32` to `q4_k`
|
||||
```sh
|
||||
./llama-quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
|
||||
./llama-quantize path/to/MobileVLM-1.7B/ggml-model-F32.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
|
||||
```
|
||||
|
||||
Now both the LLaMA part and the image encoder are in the `MobileVLM-1.7B` directory.
|
||||
|
||||
@@ -308,7 +308,7 @@ int main(int argc, char ** argv) {
|
||||
// process the prompt
|
||||
process_prompt(ctx_llava, image_embed, ¶ms, params.prompt);
|
||||
|
||||
llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
|
||||
llama_perf_context_print(ctx_llava->ctx_llama);
|
||||
llava_image_embed_free(image_embed);
|
||||
ctx_llava->model = NULL;
|
||||
llava_free(ctx_llava);
|
||||
@@ -325,7 +325,7 @@ int main(int argc, char ** argv) {
|
||||
// process the prompt
|
||||
process_prompt(ctx_llava, image_embed, ¶ms, params.prompt);
|
||||
|
||||
llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
|
||||
llama_perf_context_print(ctx_llava->ctx_llama);
|
||||
llava_image_embed_free(image_embed);
|
||||
ctx_llava->model = NULL;
|
||||
llava_free(ctx_llava);
|
||||
|
||||
@@ -319,7 +319,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
}
|
||||
printf("\n");
|
||||
llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
|
||||
llama_perf_context_print(ctx_llava->ctx_llama);
|
||||
|
||||
ctx_llava->model = NULL;
|
||||
llava_free(ctx_llava);
|
||||
|
||||
@@ -240,8 +240,7 @@ int main(int argc, char ** argv){
|
||||
LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
|
||||
|
||||
LOG_TEE("\ntarget:\n\n");
|
||||
llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
gpt_perf_print(ctx, smpl);
|
||||
|
||||
gpt_sampler_free(smpl);
|
||||
|
||||
|
||||
@@ -415,7 +415,7 @@ int main(int argc, char ** argv) {
|
||||
LOG_TEE("\n");
|
||||
|
||||
// TODO: print sampling/grammar timings for all clients
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
llama_perf_context_print(ctx);
|
||||
|
||||
llama_batch_free(batch);
|
||||
|
||||
|
||||
@@ -256,7 +256,7 @@ int main(int argc, char ** argv) {
|
||||
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
|
||||
|
||||
LOG_TEE("\n");
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
llama_perf_context_print(ctx);
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
|
||||
@@ -2047,7 +2047,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
LOG_TEE("\n");
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
llama_perf_context_print(ctx);
|
||||
write_logfile(ctx, params, model, results);
|
||||
|
||||
llama_free(ctx);
|
||||
|
||||
@@ -292,7 +292,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
LOG_TEE("\n");
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
llama_perf_context_print(ctx);
|
||||
|
||||
// clean up
|
||||
llama_batch_free(query_batch);
|
||||
|
||||
@@ -30,6 +30,7 @@ set(PUBLIC_ASSETS
|
||||
system-prompts.js
|
||||
prompt-formats.js
|
||||
json-schema-to-grammar.mjs
|
||||
loading.html
|
||||
)
|
||||
|
||||
foreach(asset ${PUBLIC_ASSETS})
|
||||
|
||||
@@ -407,9 +407,44 @@ Notice that each `probs` is an array of length `n_probs`.
|
||||
|
||||
*Options:*
|
||||
|
||||
`content`: Set the text to tokenize.
|
||||
`content`: (Required) The text to tokenize.
|
||||
|
||||
`add_special`: Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false`
|
||||
`add_special`: (Optional) Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false`
|
||||
|
||||
`with_pieces`: (Optional) Boolean indicating whether to return token pieces along with IDs. Default: `false`
|
||||
|
||||
**Response:**
|
||||
|
||||
Returns a JSON object with a `tokens` field containing the tokenization result. The `tokens` array contains either just token IDs or objects with `id` and `piece` fields, depending on the `with_pieces` parameter. The `piece` field is a string if the piece is valid UTF-8, or a list of byte values otherwise.
|
||||
|
||||
|
||||
If `with_pieces` is `false`:
|
||||
```json
|
||||
{
|
||||
"tokens": [123, 456, 789]
|
||||
}
|
||||
```
|
||||
|
||||
If `with_pieces` is `true`:
|
||||
```json
|
||||
{
|
||||
"tokens": [
|
||||
{"id": 123, "piece": "Hello"},
|
||||
{"id": 456, "piece": " world"},
|
||||
{"id": 789, "piece": "!"}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
With input 'á' (UTF-8 hex: C3 A1) on tinyllama/stories260k:
|
||||
```json
|
||||
{
|
||||
"tokens": [
|
||||
{"id": 198, "piece": [195]}, // hex C3
|
||||
{"id": 164, "piece": [161]} // hex A1
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
### POST `/detokenize`: Convert tokens to text
|
||||
|
||||
|
||||
12
examples/server/public/loading.html
Normal file
12
examples/server/public/loading.html
Normal file
@@ -0,0 +1,12 @@
|
||||
<!DOCTYPE html>
<!-- Placeholder page telling the user that the model is still loading.
     The meta refresh below reloads the page every 5 seconds. -->
<html lang="en">
<head>
    <meta charset="utf-8">
    <meta http-equiv="refresh" content="5">
    <title>Loading model</title>
</head>
<body>
    <div id="loading">
        The model is loading. Please wait.<br/>
        The user interface will appear soon.
    </div>
</body>
</html>
|
||||
@@ -28,6 +28,7 @@
|
||||
#include "system-prompts.js.hpp"
|
||||
#include "prompt-formats.js.hpp"
|
||||
#include "json-schema-to-grammar.mjs.hpp"
|
||||
#include "loading.html.hpp"
|
||||
|
||||
#include <atomic>
|
||||
#include <chrono>
|
||||
@@ -2592,10 +2593,16 @@ int main(int argc, char ** argv) {
|
||||
return false;
|
||||
};
|
||||
|
||||
auto middleware_server_state = [&res_error, &state](const httplib::Request &, httplib::Response & res) {
|
||||
auto middleware_server_state = [&res_error, &state](const httplib::Request & req, httplib::Response & res) {
|
||||
server_state current_state = state.load();
|
||||
if (current_state == SERVER_STATE_LOADING_MODEL) {
|
||||
res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE));
|
||||
auto tmp = string_split(req.path, '.');
|
||||
if (req.path == "/" || tmp.back() == "html") {
|
||||
res.set_content(reinterpret_cast<const char*>(loading_html), loading_html_len, "text/html; charset=utf-8");
|
||||
res.status = 503;
|
||||
} else {
|
||||
res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE));
|
||||
}
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
@@ -3013,12 +3020,39 @@ int main(int argc, char ** argv) {
|
||||
const auto handle_tokenize = [&ctx_server, &res_ok](const httplib::Request & req, httplib::Response & res) {
|
||||
const json body = json::parse(req.body);
|
||||
|
||||
std::vector<llama_token> tokens;
|
||||
json tokens_response = json::array();
|
||||
if (body.count("content") != 0) {
|
||||
const bool add_special = json_value(body, "add_special", false);
|
||||
tokens = ctx_server.tokenize(body.at("content"), add_special);
|
||||
const bool with_pieces = json_value(body, "with_pieces", false);
|
||||
std::vector<llama_token> tokens = ctx_server.tokenize(body.at("content"), add_special);
|
||||
|
||||
if (with_pieces) {
|
||||
for (const auto& token : tokens) {
|
||||
std::string piece = llama_token_to_piece(ctx_server.ctx, token);
|
||||
json piece_json;
|
||||
|
||||
// Check if the piece is valid UTF-8
|
||||
if (is_valid_utf8(piece)) {
|
||||
piece_json = piece;
|
||||
} else {
|
||||
// If not valid UTF-8, store as array of byte values
|
||||
piece_json = json::array();
|
||||
for (unsigned char c : piece) {
|
||||
piece_json.push_back(static_cast<int>(c));
|
||||
}
|
||||
}
|
||||
|
||||
tokens_response.push_back({
|
||||
{"id", token},
|
||||
{"piece", piece_json}
|
||||
});
|
||||
}
|
||||
} else {
|
||||
tokens_response = tokens;
|
||||
}
|
||||
}
|
||||
const json data = format_tokenizer_response(tokens);
|
||||
|
||||
const json data = format_tokenizer_response(tokens_response);
|
||||
res_ok(res, data);
|
||||
};
|
||||
|
||||
|
||||
@@ -105,6 +105,14 @@ Feature: llama.cpp server
|
||||
Given first token is removed
|
||||
Then tokens can be detokenized
|
||||
|
||||
Scenario: Tokenize with pieces
|
||||
When tokenizing with pieces:
|
||||
"""
|
||||
What is the capital of Germany?
|
||||
媽
|
||||
"""
|
||||
Then tokens are given with pieces
|
||||
|
||||
Scenario: Models available
|
||||
Given available models
|
||||
Then 1 models are supported
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
@@ -697,6 +700,32 @@ def step_tokenize_set_add_special(context):
|
||||
context.tokenize_add_special = True
|
||||
|
||||
|
||||
@step("tokenizing with pieces")
@async_run_until_complete
async def step_tokenize_with_pieces(context):
    """POST the step text to ``/tokenize`` with ``with_pieces`` enabled.

    The text is remembered on ``context.tokenized_text`` and the ``tokens``
    field of the JSON response is stored on ``context.tokens_with_pieces``.
    ``add_special`` is only sent when ``context.tokenize_add_special`` has
    been set to something other than ``None``.
    """
    text = context_text(context)
    context.tokenized_text = text

    # Request body: the optional flag is appended after the mandatory keys.
    payload = {"content": text, "with_pieces": True}
    add_special = getattr(context, "tokenize_add_special", None)
    if add_special is not None:
        payload["add_special"] = add_special

    async with aiohttp.ClientSession() as session:
        async with session.post(
            f"{context.base_url}/tokenize", json=payload
        ) as response:
            assert response.status == 200
            body = await response.json()
            context.tokens_with_pieces = body["tokens"]
|
||||
|
||||
|
||||
@step("tokens are given with pieces")
@async_run_until_complete
async def step_tokenize_with_pieces(context):
    """Check that the stored tokenization result carries IDs and pieces.

    Reads ``context.tokens_with_pieces`` and fails unless it is a non-empty
    sequence whose every entry is a mapping containing both an ``id`` and a
    ``piece`` key.
    """
    tokens = context.tokens_with_pieces

    # all() over an empty sequence is vacuously true, so an empty result
    # must be rejected explicitly.
    assert len(tokens) > 0, "tokenize response contains no tokens"

    # The isinstance guard turns a plain-ID response (with_pieces ignored)
    # into an assertion failure instead of a TypeError on the `in` test.
    assert all(
        isinstance(token, dict) and "id" in token and "piece" in token
        for token in tokens
    )
|
||||
|
||||
|
||||
@step('tokenizing')
|
||||
@async_run_until_complete
|
||||
async def step_tokenize(context):
|
||||
|
||||
@@ -616,7 +616,40 @@ static json format_embeddings_response_oaicompat(const json & request, const jso
|
||||
return res;
|
||||
}
|
||||
|
||||
static json format_tokenizer_response(const std::vector<llama_token> & tokens) {
|
||||
/**
 * @brief Checks whether a byte string is well-formed UTF-8.
 *
 * Besides the lead/continuation byte structure, this rejects sequences that
 * are structurally plausible but not legal UTF-8:
 *   - overlong encodings (lead bytes 0xC0/0xC1, 0xE0 followed by < 0xA0,
 *     0xF0 followed by < 0x90),
 *   - UTF-16 surrogates U+D800..U+DFFF (0xED followed by > 0x9F),
 *   - code points above U+10FFFF (0xF4 followed by > 0x8F, lead bytes 0xF5+).
 *
 * @param str Bytes to validate; the empty string is considered valid.
 * @return true if every byte belongs to a well-formed UTF-8 sequence.
 */
static bool is_valid_utf8(const std::string & str) {
    const unsigned char * bytes = reinterpret_cast<const unsigned char *>(str.data());
    const unsigned char * end   = bytes + str.length();

    // continuation bytes have the form 10xxxxxx
    const auto is_cont = [](unsigned char b) { return (b & 0xC0) == 0x80; };

    while (bytes < end) {
        const unsigned char lead = *bytes;

        if (lead <= 0x7F) {
            // 1-byte sequence (0xxxxxxx)
            bytes += 1;
        } else if (lead >= 0xC2 && lead <= 0xDF) {
            // 2-byte sequence (110xxxxx 10xxxxxx); 0xC0 and 0xC1 would be overlong
            if (end - bytes < 2 || !is_cont(bytes[1])) {
                return false;
            }
            bytes += 2;
        } else if ((lead & 0xF0) == 0xE0) {
            // 3-byte sequence (1110xxxx 10xxxxxx 10xxxxxx)
            if (end - bytes < 3 || !is_cont(bytes[1]) || !is_cont(bytes[2])) {
                return false;
            }
            if (lead == 0xE0 && bytes[1] < 0xA0) {
                return false; // overlong: value would fit in two bytes
            }
            if (lead == 0xED && bytes[1] > 0x9F) {
                return false; // surrogate range U+D800..U+DFFF
            }
            bytes += 3;
        } else if (lead >= 0xF0 && lead <= 0xF4) {
            // 4-byte sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
            if (end - bytes < 4 || !is_cont(bytes[1]) || !is_cont(bytes[2]) || !is_cont(bytes[3])) {
                return false;
            }
            if (lead == 0xF0 && bytes[1] < 0x90) {
                return false; // overlong: value would fit in three bytes
            }
            if (lead == 0xF4 && bytes[1] > 0x8F) {
                return false; // beyond U+10FFFF
            }
            bytes += 4;
        } else {
            // stray continuation byte or invalid lead byte (0xC0, 0xC1, 0xF5..0xFF)
            return false;
        }
    }

    return true;
}
|
||||
|
||||
static json format_tokenizer_response(const json & tokens) {
|
||||
return json {
|
||||
{"tokens", tokens}
|
||||
};
|
||||
|
||||
@@ -154,8 +154,8 @@ int main(int argc, char ** argv) {
|
||||
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
|
||||
|
||||
LOG_TEE("\n");
|
||||
llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
llama_perf_sampler_print(smpl);
|
||||
llama_perf_context_print(ctx);
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
|
||||
@@ -616,7 +616,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
LOG_TEE("\ndraft:\n\n");
|
||||
// TODO: print sampling/grammar timings for all drafts
|
||||
llama_perf_print(ctx_dft, LLAMA_PERF_TYPE_CONTEXT);
|
||||
llama_perf_context_print(ctx_dft);
|
||||
|
||||
LOG_TEE("\ntarget:\n\n");
|
||||
gpt_perf_print(ctx_tgt, smpl);
|
||||
|
||||
@@ -80,6 +80,13 @@ ggml_backend_cann_buffer_type(int32_t device);
|
||||
*/
|
||||
GGML_API GGML_CALL int32_t ggml_backend_cann_get_device_count(void);
|
||||
|
||||
/**
|
||||
* @brief pinned host buffer for use with the CPU backend for faster copies between CPU and NPU.
|
||||
*
|
||||
* @return A pointer to the host buffer type interface.
|
||||
*/
|
||||
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
|
||||
|
||||
/**
|
||||
* @brief Retrieves the description of a specific CANN device.
|
||||
*
|
||||
|
||||
@@ -1221,6 +1221,116 @@ ggml_backend_cann_buffer_type(int32_t device) {
|
||||
return &ggml_backend_cann_buffer_types[device];
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Retrieves the name associated with a CANN host buffer type.
|
||||
*
|
||||
* This function returns the descriptive name associated with the specified
|
||||
* CANN host buffer type context.
|
||||
*
|
||||
* @param buft Pointer to the host buffer type context.
|
||||
* @return Const pointer to the C-style string containing the name.
|
||||
*/
|
||||
GGML_CALL static const char * ggml_backend_cann_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
||||
return "CANN_Host";
|
||||
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Retrieves the name associated with a CANN host buffer.
|
||||
*
|
||||
* This function returns the descriptive name associated with the specified
|
||||
* CANN host buffer context.
|
||||
*
|
||||
* @param buft Pointer to the host buffer context.
|
||||
* @return Const pointer to the C-style string containing the name.
|
||||
*/
|
||||
GGML_CALL static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buffer) {
|
||||
return "CANN_Host";
|
||||
|
||||
GGML_UNUSED(buffer);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Free resources associated with a CANN host buffer.
|
||||
*
|
||||
* This function frees the resources associated with a CANN host buffer, including
|
||||
* its context.
|
||||
*
|
||||
* @param buffer The CANN host buffer to free.
|
||||
*/
|
||||
GGML_CALL static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) {
|
||||
ACL_CHECK(aclrtFreeHost(buffer->context));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Allocates a new CANN host buffer of the specified size.
|
||||
*
|
||||
* This function allocates a new CANN host buffer with the given size.
|
||||
* @param size Size in bytes of the host buffer to allocate.
|
||||
* @return Pointer to the allocated host buffer, or nullptr if allocation fails.
|
||||
*/
|
||||
static void * ggml_cann_host_malloc(size_t size) {
|
||||
if (getenv("GGML_CANN_NO_PINNED") != nullptr) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void * hostPtr = nullptr;
|
||||
aclError err = aclrtMallocHost((void **) &hostPtr, size);
|
||||
if (err != ACL_SUCCESS) {
|
||||
|
||||
GGML_CANN_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
|
||||
size / 1024.0 / 1024.0, aclGetRecentErrMsg());
|
||||
return nullptr;
|
||||
}
|
||||
return hostPtr;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Allocates a new CANN host buffer of the specified type and size.
|
||||
*
|
||||
* @param buft Pointer to the host buffer type context.
|
||||
* @param size Size in bytes of the host buffer to allocate.
|
||||
* @return Pointer to the allocated host buffer, or CPU buffer pointer if allocation fails.
|
||||
*/
|
||||
GGML_CALL static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
void * hostPtr = ggml_cann_host_malloc(size);
|
||||
|
||||
if (hostPtr == nullptr) {
|
||||
// fallback to cpu buffer
|
||||
return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
|
||||
}
|
||||
|
||||
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(hostPtr, size);
|
||||
buffer->buft = buft;
|
||||
buffer->iface.get_name = ggml_backend_cann_host_buffer_name;
|
||||
buffer->iface.free_buffer = ggml_backend_cann_host_buffer_free;
|
||||
|
||||
return buffer;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Interface for managing CANN host buffer types in the GGML backend.
|
||||
*
|
||||
* Provides function pointers for allocating, querying properties, and managing
|
||||
* memory for CANN buffer types in the GGML backend.
|
||||
*/
|
||||
GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() {
|
||||
static struct ggml_backend_buffer_type ggml_backend_cann_buffer_type_host = {
|
||||
/* .iface = */ {
|
||||
/* .get_name = */ ggml_backend_cann_host_buffer_type_name,
|
||||
/* .alloc_buffer = */ ggml_backend_cann_host_buffer_type_alloc_buffer,
|
||||
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
|
||||
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
||||
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
||||
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
|
||||
},
|
||||
/* .context = */ nullptr,
|
||||
};
|
||||
|
||||
return &ggml_backend_cann_buffer_type_host;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Computes the forward operation for a given tensor using CANN
|
||||
* operations.
|
||||
|
||||
@@ -343,7 +343,7 @@ extern "C" {
|
||||
bool embeddings; // if true, extract embeddings (together with logits)
|
||||
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
|
||||
bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
|
||||
//bool no_perf; // whether to measure performance timings, TODO: implement
|
||||
bool no_perf; // whether to measure performance timings
|
||||
|
||||
// Abort callback
|
||||
// if it returns true, execution of llama_decode() will be aborted
|
||||
@@ -1056,6 +1056,9 @@ extern "C" {
|
||||
LLAMA_API struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i);
|
||||
LLAMA_API int llama_sampler_chain_n (const struct llama_sampler * chain);
|
||||
|
||||
// after removing a sampler, the chain will no longer own it, and it will not be freed when the chain is freed
|
||||
LLAMA_API struct llama_sampler * llama_sampler_chain_remove( struct llama_sampler * chain, int32_t i);
|
||||
|
||||
// available samplers:
|
||||
|
||||
LLAMA_API struct llama_sampler * llama_sampler_init_greedy (void);
|
||||
@@ -1173,13 +1176,30 @@ extern "C" {
|
||||
// NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
|
||||
//
|
||||
|
||||
enum llama_perf_type {
|
||||
LLAMA_PERF_TYPE_CONTEXT = 0,
|
||||
LLAMA_PERF_TYPE_SAMPLER_CHAIN = 1,
|
||||
struct llama_perf_context_data {
|
||||
double t_start_ms;
|
||||
double t_load_ms;
|
||||
double t_p_eval_ms;
|
||||
double t_eval_ms;
|
||||
|
||||
int32_t n_p_eval;
|
||||
int32_t n_eval;
|
||||
};
|
||||
|
||||
LLAMA_API void llama_perf_print(const void * ctx, enum llama_perf_type type);
|
||||
LLAMA_API void llama_perf_reset( void * ctx, enum llama_perf_type type);
|
||||
struct llama_perf_sampler_data {
|
||||
double t_sample_ms;
|
||||
|
||||
int32_t n_sample;
|
||||
};
|
||||
|
||||
LLAMA_API struct llama_perf_context_data llama_perf_context (const struct llama_context * ctx);
|
||||
LLAMA_API void llama_perf_context_print(const struct llama_context * ctx);
|
||||
LLAMA_API void llama_perf_context_reset( struct llama_context * ctx);
|
||||
|
||||
// NOTE: the following work only with samplers constructed via llama_sampler_chain_init
|
||||
LLAMA_API struct llama_perf_sampler_data llama_perf_sampler (const struct llama_sampler * chain);
|
||||
LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain);
|
||||
LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain);
|
||||
|
||||
LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx);
|
||||
|
||||
|
||||
@@ -349,13 +349,26 @@ void llama_sampler_chain_add(struct llama_sampler * chain, struct llama_sampler
|
||||
struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i) {
|
||||
const auto * p = (const llama_sampler_chain *) chain->ctx;
|
||||
|
||||
if (i < 0 || i >= (int32_t) p->samplers.size()) {
|
||||
if (i < 0 || (size_t) i >= p->samplers.size()) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return p->samplers[i];
|
||||
}
|
||||
|
||||
struct llama_sampler * llama_sampler_chain_remove(struct llama_sampler * chain, int32_t i) {
|
||||
auto * p = (llama_sampler_chain *) chain->ctx;
|
||||
|
||||
if (i < 0 || (size_t) i >= p->samplers.size()) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
auto * result = p->samplers[i];
|
||||
p->samplers.erase(p->samplers.begin() + i);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
int llama_sampler_chain_n(const struct llama_sampler * chain) {
|
||||
const auto * p = (const llama_sampler_chain *) chain->ctx;
|
||||
|
||||
@@ -1656,3 +1669,37 @@ uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) {
|
||||
|
||||
return LLAMA_DEFAULT_SEED;
|
||||
}
|
||||
|
||||
// perf
|
||||
|
||||
struct llama_perf_sampler_data llama_perf_sampler(const struct llama_sampler * chain) {
|
||||
struct llama_perf_sampler_data data = {};
|
||||
|
||||
if (chain == nullptr || chain->iface != &llama_sampler_chain_i) {
|
||||
GGML_ABORT("%s: invalid sampler passed - requires a sampler created with llama_sampler_chain_init()\n", __func__);
|
||||
}
|
||||
|
||||
const auto * ctx = (const struct llama_sampler_chain *) chain->ctx;
|
||||
|
||||
data.t_sample_ms = 1e-3 * ctx->t_sample_us;
|
||||
data.n_sample = std::max(0, ctx->n_sample);
|
||||
|
||||
return data;
|
||||
}
|
||||
|
||||
void llama_perf_sampler_print(const struct llama_sampler * chain) {
|
||||
const auto data = llama_perf_sampler(chain);
|
||||
|
||||
LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
|
||||
__func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample);
|
||||
}
|
||||
|
||||
void llama_perf_sampler_reset(struct llama_sampler * chain) {
|
||||
if (chain == nullptr || chain->iface != &llama_sampler_chain_i) {
|
||||
GGML_ABORT("%s: invalid sampler passed - requires a sampler created with llama_sampler_chain_init()\n", __func__);
|
||||
}
|
||||
|
||||
auto * ctx = (struct llama_sampler_chain *) chain->ctx;
|
||||
|
||||
ctx->t_sample_us = ctx->n_sample = 0;
|
||||
}
|
||||
|
||||
107
src/llama.cpp
107
src/llama.cpp
@@ -2156,6 +2156,10 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
|
||||
if (host_buffer) {
|
||||
buft = ggml_backend_sycl_host_buffer_type();
|
||||
}
|
||||
#elif defined(GGML_USE_CANN)
|
||||
if (host_buffer) {
|
||||
buft = ggml_backend_cann_host_buffer_type();
|
||||
}
|
||||
#elif defined(GGML_USE_CPU_HBM)
|
||||
buft = ggml_backend_cpu_hbm_buffer_type();
|
||||
#elif defined(GGML_USE_VULKAN)
|
||||
@@ -2482,6 +2486,7 @@ struct llama_cparams {
|
||||
bool causal_attn;
|
||||
bool offload_kqv;
|
||||
bool flash_attn;
|
||||
bool no_perf;
|
||||
|
||||
enum llama_pooling_type pooling_type;
|
||||
|
||||
@@ -6657,8 +6662,6 @@ static bool llm_load_tensors(
|
||||
bool use_mlock,
|
||||
llama_progress_callback progress_callback,
|
||||
void * progress_callback_user_data) {
|
||||
model.t_start_us = ggml_time_us();
|
||||
|
||||
auto & hparams = model.hparams;
|
||||
|
||||
model.split_mode = split_mode;
|
||||
@@ -8589,14 +8592,13 @@ static bool llm_load_tensors(
|
||||
}
|
||||
}
|
||||
|
||||
// loading time will be recalculated after the first eval, so
|
||||
// we take page faults deferred by mmap() into consideration
|
||||
model.t_load_us = ggml_time_us() - model.t_start_us;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
|
||||
static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
|
||||
model.t_start_us = ggml_time_us();
|
||||
|
||||
try {
|
||||
llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);
|
||||
|
||||
@@ -8658,6 +8660,10 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
|
||||
return -1;
|
||||
}
|
||||
|
||||
// loading time will be recalculated after the first eval, so
|
||||
// we take page faults deferred by mmap() into consideration
|
||||
model.t_load_us = ggml_time_us() - model.t_start_us;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -17945,6 +17951,7 @@ struct llama_context_params llama_context_default_params() {
|
||||
/*.embeddings =*/ false,
|
||||
/*.offload_kqv =*/ true,
|
||||
/*.flash_attn =*/ false,
|
||||
/*.no_perf =*/ true,
|
||||
/*.abort_callback =*/ nullptr,
|
||||
/*.abort_callback_data =*/ nullptr,
|
||||
};
|
||||
@@ -18155,6 +18162,7 @@ struct llama_context * llama_new_context_with_model(
|
||||
cparams.embeddings = params.embeddings;
|
||||
cparams.offload_kqv = params.offload_kqv;
|
||||
cparams.flash_attn = params.flash_attn;
|
||||
cparams.no_perf = params.no_perf;
|
||||
cparams.pooling_type = params.pooling_type;
|
||||
|
||||
cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
|
||||
@@ -20073,10 +20081,14 @@ void llama_synchronize(struct llama_context * ctx) {
|
||||
|
||||
// add the evaluation to the stats
|
||||
if (ctx->n_queued_tokens == 1) {
|
||||
ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us;
|
||||
if (!ctx->cparams.no_perf) {
|
||||
ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us;
|
||||
}
|
||||
ctx->n_eval++;
|
||||
} else if (ctx->n_queued_tokens > 1) {
|
||||
ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us;
|
||||
if (!ctx->cparams.no_perf) {
|
||||
ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us;
|
||||
}
|
||||
ctx->n_p_eval += ctx->n_queued_tokens;
|
||||
}
|
||||
|
||||
@@ -20684,65 +20696,40 @@ const char * llama_print_system_info(void) {
|
||||
return s.c_str();
|
||||
}
|
||||
|
||||
void llama_perf_print(const void * ctx, enum llama_perf_type type) {
|
||||
switch (type) {
|
||||
case LLAMA_PERF_TYPE_CONTEXT:
|
||||
{
|
||||
const auto * p = (const struct llama_context *) ctx;
|
||||
struct llama_perf_context_data llama_perf_context(const struct llama_context * ctx) {
|
||||
struct llama_perf_context_data data = {};
|
||||
|
||||
const double t_start_ms = 1e-3 * p->t_start_us;
|
||||
const double t_end_ms = 1.00 * ggml_time_ms();
|
||||
const double t_load_ms = 1e-3 * p->t_load_us;
|
||||
const double t_p_eval_ms = 1e-3 * p->t_p_eval_us;
|
||||
const double t_eval_ms = 1e-3 * p->t_eval_us;
|
||||
|
||||
const int32_t n_p_eval = std::max(0, p->n_p_eval);
|
||||
const int32_t n_eval = std::max(1, p->n_eval);
|
||||
|
||||
LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, t_load_ms);
|
||||
LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
|
||||
__func__, t_p_eval_ms, n_p_eval, t_p_eval_ms / n_p_eval, 1e3 / t_p_eval_ms * n_p_eval);
|
||||
LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
|
||||
__func__, t_eval_ms, n_eval, t_eval_ms / n_eval, 1e3 / t_eval_ms * n_eval);
|
||||
LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - t_start_ms), (n_p_eval + n_eval));
|
||||
} break;
|
||||
case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
|
||||
{
|
||||
const auto * smpl = (const struct llama_sampler *) ctx;
|
||||
const auto * p = (const struct llama_sampler_chain *) smpl->ctx;
|
||||
|
||||
const double t_sampler_ms = 1e-3 * p->t_sample_us;
|
||||
|
||||
const int32_t n_sampler = std::max(0, p->n_sample);
|
||||
|
||||
LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
|
||||
__func__, t_sampler_ms, n_sampler, t_sampler_ms / n_sampler, 1e3 / t_sampler_ms * n_sampler);
|
||||
} break;
|
||||
default:
|
||||
GGML_ABORT("invalid perf type");
|
||||
if (ctx == nullptr) {
|
||||
return data;
|
||||
}
|
||||
|
||||
data.t_start_ms = 1e-3 * ctx->t_start_us;
|
||||
data.t_load_ms = 1e-3 * ctx->t_load_us;
|
||||
data.t_p_eval_ms = 1e-3 * ctx->t_p_eval_us;
|
||||
data.t_eval_ms = 1e-3 * ctx->t_eval_us;
|
||||
data.n_p_eval = std::max(1, ctx->n_p_eval);
|
||||
data.n_eval = std::max(1, ctx->n_eval);
|
||||
|
||||
return data;
|
||||
}
|
||||
|
||||
void llama_perf_reset(void * ctx, enum llama_perf_type type) {
|
||||
switch (type) {
|
||||
case LLAMA_PERF_TYPE_CONTEXT:
|
||||
{
|
||||
auto * p = (struct llama_context *) ctx;
|
||||
void llama_perf_context_print(const struct llama_context * ctx) {
|
||||
const auto data = llama_perf_context(ctx);
|
||||
|
||||
p->t_start_us = ggml_time_us();
|
||||
p->t_eval_us = p->n_eval = 0;
|
||||
p->t_p_eval_us = p->n_p_eval = 0;
|
||||
} break;
|
||||
case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
|
||||
{
|
||||
auto * smpl = (struct llama_sampler *) ctx;
|
||||
auto * p = (struct llama_sampler_chain *) smpl->ctx;
|
||||
const double t_end_ms = 1e-3 * ggml_time_us();
|
||||
|
||||
p->t_sample_us = p->n_sample = 0;
|
||||
} break;
|
||||
default:
|
||||
GGML_ABORT("invalid perf type");
|
||||
}
|
||||
LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, data.t_load_ms);
|
||||
LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
|
||||
__func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
|
||||
LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
|
||||
__func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
|
||||
LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
|
||||
}
|
||||
|
||||
// Restarts the performance measurement of a context.
//
// ctx : context whose counters are reset (dereferenced without a null check)
//
// The start timestamp is set to the current ggml_time_us() value and the
// eval / prompt-eval time and token counters are zeroed.
void llama_perf_context_reset(struct llama_context * ctx) {
    ctx->t_start_us  = ggml_time_us();

    ctx->n_eval      = 0;
    ctx->t_eval_us   = 0;

    ctx->n_p_eval    = 0;
    ctx->t_p_eval_us = 0;
}
|
||||
|
||||
void llama_perf_dump_yaml(FILE * stream, const llama_context * ctx) {
|
||||
|
||||
Reference in New Issue
Block a user