Compare commits


4 Commits

Author SHA1 Message Date
Andrei
1c5eba6f8e llama: Add attention and final logit soft-capping, update scaling factor to Gemma2 (#8197)
* Add attention and final logit softcapping.

* fix

* Add custom add_ functions

* Disable flash attention for Gemma2

* Update src/llama.cpp

Co-authored-by: slaren <slarengh@gmail.com>

* Add default value for attention and final logit softcap value

* Add custom kq scaling from Gemma2Attention

* Remove custom pre attention scaling and use computed value instead.

---------

Co-authored-by: slaren <slarengh@gmail.com>
2024-06-29 23:44:08 -04:00
Xuan Son Nguyen
72272b83a3 fix code typo in llama-cli (#8198) 2024-06-29 00:14:20 +02:00
Olivier Chafik
8748d8ac6f json: attempt to skip slow tests when running under emulator (#8189) 2024-06-28 18:02:05 +01:00
Xuan Son Nguyen
26a39bbd6b Add MiniCPM, Deepseek V2 chat template + clean up llama_chat_apply_template_internal (#8172)
* tmp_contains

* minicpm chat template

* add DeepSeek Lite template

* change deepseek-lite to deepseek2

* correct code comment

* correct code from master branch
2024-06-28 15:11:44 +02:00
9 changed files with 127 additions and 41 deletions

View File

@@ -799,6 +799,7 @@ jobs:
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
$sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
cd build
$env:LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR = 1
& $sde -future -- ctest -L main -C Release --verbose --timeout 900
- name: Determine tag name

View File

@@ -2363,6 +2363,12 @@ class Gemma2Model(Model):
self.gguf_writer.add_key_length(hparams["head_dim"])
self.gguf_writer.add_value_length(hparams["head_dim"])
self.gguf_writer.add_file_type(self.ftype)
self.gguf_writer.add_attn_logit_softcapping(
self.hparams["attn_logit_softcapping"]
)
self.gguf_writer.add_final_logit_softcapping(
self.hparams["final_logit_softcapping"]
)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid  # unused
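
For context: the soft-capping values that this converter change stores in GGUF metadata describe a scaled tanh, x' = cap * tanh(x / cap), which squashes a logit smoothly into (-cap, +cap). The llama_hparams defaults added further down are 50.0 for attention logits and 30.0 for final logits. A minimal standalone sketch of the formula (not code from the patch; the example values are made up):

#include <cmath>
#include <cstdio>

// Soft-capping: scale down by the cap, squash with tanh, scale back up.
// The output stays strictly inside (-cap, +cap) yet is nearly linear for small inputs.
static float soft_cap(float x, float cap) {
    return cap * std::tanh(x / cap);
}

int main() {
    std::printf("%.2f\n", soft_cap(90.0f, 30.0f)); // large logit: 90 -> ~29.85
    std::printf("%.2f\n", soft_cap( 5.0f, 30.0f)); // small logit:  5 -> ~4.95 (barely changed)
    return 0;
}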

View File

@@ -1390,7 +1390,7 @@ int main(int argc, char ** argv) {
test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
}
if (t.n_gen > 0) {
test_gen(ctx, t.n_gen, 0, t.n_threads);
test_gen(ctx, 1, 0, t.n_threads);
}
for (int i = 0; i < params.reps; i++) {

View File

@@ -810,7 +810,7 @@ int main(int argc, char ** argv) {
is_antiprompt = true;
}
chat_add_and_format(model, chat_msgs, "system", assistant_ss.str());
chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str());
is_interacting = true;
printf("\n");
}

View File

@@ -50,6 +50,8 @@ class Keys:
POOLING_TYPE = "{arch}.pooling_type"
LOGIT_SCALE = "{arch}.logit_scale"
DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id"
ATTN_LOGIT_SOFTCAPPING = "{arch}.attn_logit_softcapping"
FINAL_LOGIT_SOFTCAPPING = "{arch}.final_logit_softcapping"
class Attention:
HEAD_COUNT = "{arch}.attention.head_count"
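
The "{arch}" placeholder here (and the matching "%s" format in the C++ LLM_KV_NAMES table further down) expands to the architecture name, which for these models is "gemma2", so the two new metadata keys come out as gemma2.attn_logit_softcapping and gemma2.final_logit_softcapping. A tiny illustrative sketch, not part of the patch:

#include <cstdio>

int main() {
    const char * arch = "gemma2";  // architecture string registered for Gemma 2 models
    char key[128];
    std::snprintf(key, sizeof(key), "%s.attn_logit_softcapping", arch);
    std::printf("%s\n", key);      // gemma2.attn_logit_softcapping
    std::snprintf(key, sizeof(key), "%s.final_logit_softcapping", arch);
    std::printf("%s\n", key);      // gemma2.final_logit_softcapping
    return 0;
}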

View File

@@ -516,6 +516,12 @@ class GGUFWriter:
def add_logit_scale(self, value: float) -> None:
self.add_float32(Keys.LLM.LOGIT_SCALE.format(arch=self.arch), value)
def add_attn_logit_softcapping(self, value: float) -> None:
self.add_float32(Keys.LLM.ATTN_LOGIT_SOFTCAPPING.format(arch=self.arch), value)
def add_final_logit_softcapping(self, value: float) -> None:
self.add_float32(Keys.LLM.FINAL_LOGIT_SOFTCAPPING.format(arch=self.arch), value)
def add_expert_count(self, count: int) -> None:
self.add_uint32(Keys.LLM.EXPERT_COUNT.format(arch=self.arch), count)

View File

@@ -302,6 +302,8 @@ enum llm_kv {
LLM_KV_POOLING_TYPE,
LLM_KV_LOGIT_SCALE,
LLM_KV_DECODER_START_TOKEN_ID,
LLM_KV_ATTN_LOGIT_SOFTCAPPING,
LLM_KV_FINAL_LOGIT_SOFTCAPPING,
LLM_KV_ATTENTION_HEAD_COUNT,
LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -392,6 +394,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_POOLING_TYPE , "%s.pooling_type" },
{ LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
{ LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
{ LLM_KV_ATTN_LOGIT_SOFTCAPPING, "%s.attn_logit_softcapping" },
{ LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" },
{ LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
{ LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -2099,6 +2103,9 @@ struct llama_hparams {
float f_norm_eps;
float f_norm_rms_eps;
float f_attn_logit_softcapping = 50.0f;
float f_final_logit_softcapping = 30.0f;
float rope_attn_factor = 1.0f;
float rope_freq_base_train;
float rope_freq_scale_train;
@@ -2115,8 +2122,9 @@ struct llama_hparams {
float f_max_alibi_bias = 0.0f;
float f_logit_scale = 0.0f;
bool causal_attn = true;
bool use_alibi = false;
bool causal_attn = true;
bool use_alibi = false;
bool attn_soft_cap = false;
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
@@ -4702,6 +4710,9 @@ static void llm_load_hparams(
case LLM_ARCH_GEMMA2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
hparams.attn_soft_cap = true;
switch (hparams.n_layer) {
case 42: model.type = e_model::MODEL_9B; break;
@@ -7579,6 +7590,12 @@ static struct ggml_tensor * llm_build_kqv(
kq = ggml_scale(ctx, kq, 30);
}
if (hparams.attn_soft_cap) {
kq = ggml_scale(ctx, kq, 1.0f / hparams.f_attn_logit_softcapping);
kq = ggml_tanh(ctx, kq);
kq = ggml_scale(ctx, kq, hparams.f_attn_logit_softcapping);
}
kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias);
cb(kq, "kq_soft_max_ext", il);
@@ -11039,7 +11056,7 @@ struct llm_build_context {
ext_factor, attn_factor, beta_fast, beta_slow);
cb(Qcur, "Qcur", il);
Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head)));
cb(Qcur, "Qcur_scaled", il);
Kcur = ggml_rope_ext(
@@ -11106,6 +11123,12 @@ struct llm_build_context {
// lm_head
cur = ggml_mul_mat(ctx0, model.output, cur);
// final logit soft-capping
cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
cur = ggml_tanh(ctx0, cur);
cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
cb(cur, "result_output", -1);
ggml_build_forward_expand(gf, cur);
@@ -17379,6 +17402,12 @@ struct llama_context * llama_new_context_with_model(
params.flash_attn = false;
}
if (params.flash_attn && model->hparams.attn_soft_cap) {
LLAMA_LOG_WARN("%s: flash_attn is not compatible with attn_soft_cap - forcing off\n", __func__);
params.flash_attn = false;
}
if (params.flash_attn && model->hparams.n_embd_head_k != model->hparams.n_embd_head_v) {
LLAMA_LOG_WARN("%s: flash_attn requires n_embd_head_k == n_embd_head_v - forcing off\n", __func__);
params.flash_attn = false;
@@ -19613,7 +19642,10 @@ static int32_t llama_chat_apply_template_internal(
std::string & dest, bool add_ass) {
// Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
std::stringstream ss;
if (tmpl == "chatml" || tmpl.find("<|im_start|>") != std::string::npos) {
auto tmpl_contains = [&tmpl](std::string haystack) -> bool {
return tmpl.find(haystack) != std::string::npos;
};
if (tmpl == "chatml" || tmpl_contains("<|im_start|>")) {
// chatml template
for (auto message : chat) {
ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
@@ -19621,16 +19653,16 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) {
ss << "<|im_start|>assistant\n";
}
} else if (tmpl == "llama2" || tmpl == "mistral" || tmpl.find("[INST]") != std::string::npos) {
} else if (tmpl == "llama2" || tmpl == "mistral" || tmpl_contains("[INST]")) {
// llama2 template and its variants
// [variant] support system message
bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos || tmpl == "mistral";
bool support_system_message = tmpl_contains("<<SYS>>") || tmpl == "mistral";
// [variant] space before + after response
bool space_around_response = tmpl.find("' ' + eos_token") != std::string::npos;
bool space_around_response = tmpl_contains("' ' + eos_token");
// [variant] add BOS inside history
bool add_bos_inside_history = tmpl.find("bos_token + '[INST]") != std::string::npos;
bool add_bos_inside_history = tmpl_contains("bos_token + '[INST]");
// [variant] trim spaces from the input message
bool strip_message = tmpl.find("content.strip()") != std::string::npos;
bool strip_message = tmpl_contains("content.strip()");
// construct the prompt
bool is_inside_turn = true; // skip BOS at the beginning
ss << "[INST] ";
@@ -19656,7 +19688,7 @@ static int32_t llama_chat_apply_template_internal(
}
}
// llama2 templates seem to not care about "add_generation_prompt"
} else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos)) {
} else if (tmpl == "phi3" || (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>"))) {
// Phi 3
for (auto message : chat) {
std::string role(message->role);
@@ -19665,7 +19697,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) {
ss << "<|assistant|>\n";
}
} else if (tmpl == "zephyr" || tmpl.find("<|user|>") != std::string::npos) {
} else if (tmpl == "zephyr" || tmpl_contains("<|user|>")) {
// zephyr template
for (auto message : chat) {
ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
@@ -19673,7 +19705,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) {
ss << "<|assistant|>\n";
}
} else if (tmpl == "monarch" || tmpl.find("bos_token + message['role']") != std::string::npos) {
} else if (tmpl == "monarch" || tmpl_contains("bos_token + message['role']")) {
// mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
for (auto message : chat) {
std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
@@ -19682,7 +19714,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) {
ss << "<s>assistant\n";
}
} else if (tmpl == "gemma" || tmpl == "gemma2" || tmpl.find("<start_of_turn>") != std::string::npos) {
} else if (tmpl == "gemma" || tmpl == "gemma2" || tmpl_contains("<start_of_turn>")) {
// google/gemma-7b-it
std::string system_prompt = "";
for (auto message : chat) {
@@ -19704,7 +19736,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) {
ss << "<start_of_turn>model\n";
}
} else if (tmpl == "orion" || tmpl.find("'\\n\\nAssistant: ' + eos_token") != std::string::npos) {
} else if (tmpl == "orion" || tmpl_contains("'\\n\\nAssistant: ' + eos_token")) {
// OrionStarAI/Orion-14B-Chat
std::string system_prompt = "";
for (auto message : chat) {
@@ -19724,7 +19756,7 @@ static int32_t llama_chat_apply_template_internal(
ss << message->content << "</s>";
}
}
} else if (tmpl == "openchat" || tmpl.find("GPT4 Correct ") != std::string::npos) {
} else if (tmpl == "openchat" || tmpl_contains("GPT4 Correct ")) {
// openchat/openchat-3.5-0106,
for (auto message : chat) {
std::string role(message->role);
@@ -19738,13 +19770,13 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) {
ss << "GPT4 Correct Assistant:";
}
} else if (tmpl == "vicuna" || tmpl == "vicuna-orca" || (tmpl.find("USER: ") != std::string::npos && tmpl.find("ASSISTANT: ") != std::string::npos)) {
} else if (tmpl == "vicuna" || tmpl == "vicuna-orca" || (tmpl_contains("USER: ") && tmpl_contains("ASSISTANT: "))) {
// eachadea/vicuna-13b-1.1 (and Orca variant)
for (auto message : chat) {
std::string role(message->role);
if (role == "system") {
// Orca-Vicuna variant uses a system prefix
if (tmpl == "vicuna-orca" || tmpl.find("SYSTEM: ") != std::string::npos) {
if (tmpl == "vicuna-orca" || tmpl_contains("SYSTEM: ")) {
ss << "SYSTEM: " << message->content << "\n";
} else {
ss << message->content << "\n\n";
@@ -19758,7 +19790,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) {
ss << "ASSISTANT:";
}
} else if (tmpl == "deepseek" || (tmpl.find("### Instruction:") != std::string::npos && tmpl.find("<|EOT|>") != std::string::npos)) {
} else if (tmpl == "deepseek" || (tmpl_contains("### Instruction:") && tmpl_contains("<|EOT|>"))) {
// deepseek-ai/deepseek-coder-33b-instruct
for (auto message : chat) {
std::string role(message->role);
@@ -19773,7 +19805,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) {
ss << "### Response:\n";
}
} else if (tmpl == "command-r" || (tmpl.find("<|START_OF_TURN_TOKEN|>") != std::string::npos && tmpl.find("<|USER_TOKEN|>") != std::string::npos)) {
} else if (tmpl == "command-r" || (tmpl_contains("<|START_OF_TURN_TOKEN|>") && tmpl_contains("<|USER_TOKEN|>"))) {
// CohereForAI/c4ai-command-r-plus
for (auto message : chat) {
std::string role(message->role);
@@ -19788,7 +19820,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) {
ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>";
}
} else if (tmpl == "llama3" || (tmpl.find("<|start_header_id|>") != std::string::npos && tmpl.find("<|end_header_id|>") != std::string::npos)) {
} else if (tmpl == "llama3" || (tmpl_contains("<|start_header_id|>") && tmpl_contains("<|end_header_id|>"))) {
// Llama 3
for (auto message : chat) {
std::string role(message->role);
@@ -19797,6 +19829,33 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) {
ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
}
} else if (tmpl == "minicpm" || tmpl_contains(u8"<用户>")) {
// MiniCPM-3B-OpenHermes-2.5-v2-GGUF
for (auto message : chat) {
std::string role(message->role);
if (role == "user") {
ss << u8"<用户>";
ss << trim(message->content);
ss << "<AI>";
} else {
ss << trim(message->content);
}
}
} else if (tmpl == "deepseek2" || tmpl_contains("'Assistant: ' + message['content'] + eos_token")) {
// DeepSeek-V2
for (auto message : chat) {
std::string role(message->role);
if (role == "system") {
ss << message->content << "\n\n";
} else if (role == "user") {
ss << "User: " << message->content << "\n\n";
} else if (role == "assistant") {
ss << "Assistant: " << message->content << u8"<end▁of▁sentence>";
}
}
if (add_ass) {
ss << "Assistant:";
}
} else {
// template not supported
return -1;
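
Putting the llm_build_kqv hunk above into plain scalar terms: the raw K·Q scores are soft-capped before the masked softmax, while the 1/sqrt(n_embd/n_head) scaling is applied to Q earlier in the Gemma 2 graph (see the Qcur hunk). A toy sketch with made-up scores, omitting the KQ mask and ALiBi handling done by ggml_soft_max_ext:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const std::vector<float> kq = { 12.0f, 64.0f, -3.0f, 130.0f };  // made-up raw K.Q scores
    const float cap = 50.0f;  // f_attn_logit_softcapping default for Gemma 2

    // soft-cap: scale down, tanh, scale back up (the ggml_scale / ggml_tanh / ggml_scale ops above)
    std::vector<float> capped(kq.size());
    for (size_t i = 0; i < kq.size(); ++i) {
        capped[i] = cap * std::tanh(kq[i] / cap);  // every score now lies strictly inside (-50, 50)
    }

    // plain softmax over the capped scores
    const float max_v = *std::max_element(capped.begin(), capped.end());
    std::vector<float> p(capped.size());
    float sum = 0.0f;
    for (size_t i = 0; i < capped.size(); ++i) { p[i] = std::exp(capped[i] - max_v); sum += p[i]; }

    for (size_t i = 0; i < kq.size(); ++i) {
        std::printf("raw %8.2f -> capped %8.2f -> p %.4f\n", kq[i], capped[i], p[i] / sum);
    }
    return 0;
}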

View File

@@ -57,7 +57,11 @@ int main(void) {
//Phi-3-medium
"{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}",
//Phi-3-vision
"{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{- '<|assistant|>\n' -}}{% endif %}"
"{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{- '<|assistant|>\n' -}}{% endif %}",
// MiniCPM-3B-OpenHermes-2.5-v2-GGUF
u8"{% for message in messages %}{% if message['role'] == 'user' %}{{'<用户>' + message['content'].strip() + '<AI>'}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}",
// DeepSeek-V2
"{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}",
};
std::vector<std::string> expected_output = {
// teknium/OpenHermes-2.5-Mistral-7B
@@ -94,6 +98,10 @@ int main(void) {
"<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n I am an assistant <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
//Phi-3-vision
"<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n I am an assistant <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
// MiniCPM-3B-OpenHermes-2.5-v2-GGUF
u8"You are a helpful assistant<用户>Hello<AI>Hi there<用户>Who are you<AI>I am an assistant<用户>Another question<AI>",
// DeepSeek-V2
u8"You are a helpful assistant\n\nUser: Hello\n\nAssistant: Hi there<end▁of▁sentence>User: Who are you\n\nAssistant: I am an assistant <end▁of▁sentence>User: Another question\n\nAssistant:",
};
std::vector<char> formatted_chat(1024);
int32_t res;
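
The new templates are reachable through the public API as well. A minimal sketch, assuming the llama_chat_apply_template() declaration in llama.h at this revision (an explicit template name is passed, so no model handle is needed); the messages are abridged from the test above and the snippet is illustrative rather than part of the patch:

#include <cstdio>
#include <vector>
#include "llama.h"

int main() {
    // abridged conversation from the chat-template test
    std::vector<llama_chat_message> chat = {
        { "system",    "You are a helpful assistant" },
        { "user",      "Hello"                       },
        { "assistant", "Hi there"                    },
        { "user",      "Who are you"                 },
    };

    std::vector<char> buf(1024);
    // "deepseek2" selects the DeepSeek-V2 template added in #8172
    const int32_t n = llama_chat_apply_template(
        /*model=*/nullptr, "deepseek2", chat.data(), chat.size(),
        /*add_ass=*/true, buf.data(), (int32_t) buf.size());

    if (n < 0) {
        std::fprintf(stderr, "template not supported\n");
    } else if (n <= (int32_t) buf.size()) {
        std::printf("%.*s\n", n, buf.data());  // ends with "User: Who are you\n\nAssistant:"
    }
    return 0;
}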

View File

@@ -1239,26 +1239,30 @@ int main() {
}
});
if (getenv("LLAMA_PYTHON_AVAILABLE") || (std::system("python -c \"import sys; exit(1) if sys.version_info < (3, 8) else print('Python version is sufficient')\"") == 0)) {
test_all("Python", [](const TestCase & tc) {
write("test-json-schema-input.tmp", tc.schema);
tc.verify_status(std::system(
"python ./examples/json_schema_to_grammar.py test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE);
tc.verify(read("test-grammar-output.tmp"));
});
if (getenv("LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR")) {
fprintf(stderr, "\033[33mWARNING: Skipping slow tests on emulator.\n\033[0m");
} else {
fprintf(stderr, "\033[33mWARNING: Python not found (min version required is 3.8), skipping Python JSON schema -> grammar tests.\n\033[0m");
}
if (getenv("LLAMA_PYTHON_AVAILABLE") || (std::system("python -c \"import sys; exit(1) if sys.version_info < (3, 8) else print('Python version is sufficient')\"") == 0)) {
test_all("Python", [](const TestCase & tc) {
write("test-json-schema-input.tmp", tc.schema);
tc.verify_status(std::system(
"python ./examples/json_schema_to_grammar.py test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE);
tc.verify(read("test-grammar-output.tmp"));
});
} else {
fprintf(stderr, "\033[33mWARNING: Python not found (min version required is 3.8), skipping Python JSON schema -> grammar tests.\n\033[0m");
}
if (getenv("LLAMA_NODE_AVAILABLE") || (std::system("node --version") == 0)) {
test_all("JavaScript", [](const TestCase & tc) {
write("test-json-schema-input.tmp", tc.schema);
tc.verify_status(std::system(
"node ./tests/run-json-schema-to-grammar.mjs test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE);
tc.verify(read("test-grammar-output.tmp"));
});
} else {
fprintf(stderr, "\033[33mWARNING: Node not found, skipping JavaScript JSON schema -> grammar tests.\n\033[0m");
if (getenv("LLAMA_NODE_AVAILABLE") || (std::system("node --version") == 0)) {
test_all("JavaScript", [](const TestCase & tc) {
write("test-json-schema-input.tmp", tc.schema);
tc.verify_status(std::system(
"node ./tests/run-json-schema-to-grammar.mjs test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE);
tc.verify(read("test-grammar-output.tmp"));
});
} else {
fprintf(stderr, "\033[33mWARNING: Node not found, skipping JavaScript JSON schema -> grammar tests.\n\033[0m");
}
}
test_all("Check Expectations Validity", [](const TestCase & tc) {