mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-02-26 14:23:22 +02:00
Compare commits
23 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
94b0200a01 | ||
|
|
b908baf182 | ||
|
|
492bc31978 | ||
|
|
77d6ae4ac8 | ||
|
|
10b26ee23a | ||
|
|
3dadc88b58 | ||
|
|
39e4b1dc9b | ||
|
|
11c325c6e0 | ||
|
|
237958db33 | ||
|
|
abb9f3c42b | ||
|
|
da348c9dfb | ||
|
|
e6267a9359 | ||
|
|
2bf318fd2f | ||
|
|
c78e682245 | ||
|
|
c5897995a7 | ||
|
|
03fd9d3bb4 | ||
|
|
8004f3a8d1 | ||
|
|
eacb4b67a2 | ||
|
|
c0d0430340 | ||
|
|
3bb2fcc856 | ||
|
|
27326bfce1 | ||
|
|
ad9f692f8f | ||
|
|
8a70973557 |
@@ -893,23 +893,6 @@ static void common_chat_parse_minimax_m2(common_chat_msg_parser & builder) {
|
||||
builder.consume_reasoning_with_xml_tool_calls(form, "<think>", "</think>");
|
||||
}
|
||||
|
||||
static void common_chat_parse_qwen3_coder_xml(common_chat_msg_parser & builder) {
|
||||
static const xml_tool_call_format form = ([]() {
|
||||
xml_tool_call_format form {};
|
||||
form.scope_start = "<tool_call>";
|
||||
form.tool_start = "<function=";
|
||||
form.tool_sep = ">";
|
||||
form.key_start = "<parameter=";
|
||||
form.key_val_sep = ">";
|
||||
form.val_end = "</parameter>";
|
||||
form.tool_end = "</function>";
|
||||
form.scope_end = "</tool_call>";
|
||||
form.trim_raw_argval = true;
|
||||
return form;
|
||||
})();
|
||||
builder.consume_reasoning_with_xml_tool_calls(form);
|
||||
}
|
||||
|
||||
static void common_chat_parse_kimi_k2(common_chat_msg_parser & builder) {
|
||||
static const xml_tool_call_format form = ([]() {
|
||||
xml_tool_call_format form {};
|
||||
@@ -1590,9 +1573,6 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
|
||||
case COMMON_CHAT_FORMAT_KIMI_K2:
|
||||
common_chat_parse_kimi_k2(builder);
|
||||
break;
|
||||
case COMMON_CHAT_FORMAT_QWEN3_CODER_XML:
|
||||
common_chat_parse_qwen3_coder_xml(builder);
|
||||
break;
|
||||
case COMMON_CHAT_FORMAT_APRIEL_1_5:
|
||||
common_chat_parse_apriel_1_5(builder);
|
||||
break;
|
||||
|
||||
@@ -65,14 +65,25 @@ json common_chat_msg::to_json_oaicompat(bool concat_typed_text) const {
|
||||
} else if (!content_parts.empty()) {
|
||||
if (concat_typed_text) {
|
||||
std::string text;
|
||||
bool last_was_media_marker = false;
|
||||
// join parts with newline, do not add newline before or after media markers
|
||||
for (const auto & part : content_parts) {
|
||||
if (part.type != "text") {
|
||||
bool add_new_line = true;
|
||||
if (part.type == "text") {
|
||||
add_new_line = !last_was_media_marker && !text.empty();
|
||||
last_was_media_marker = false;
|
||||
} else if (part.type == "media_marker") {
|
||||
add_new_line = false;
|
||||
last_was_media_marker = true;
|
||||
} else {
|
||||
LOG_WRN("Ignoring content part type: %s\n", part.type.c_str());
|
||||
continue;
|
||||
}
|
||||
if (!text.empty()) {
|
||||
|
||||
if (add_new_line) {
|
||||
text += '\n';
|
||||
}
|
||||
|
||||
text += part.text;
|
||||
}
|
||||
jmsg["content"] = text;
|
||||
@@ -319,7 +330,7 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messa
|
||||
throw std::invalid_argument("Missing content part type: " + part.dump());
|
||||
}
|
||||
const auto & type = part.at("type");
|
||||
if (type != "text") {
|
||||
if (type != "text" && type != "media_marker") {
|
||||
throw std::invalid_argument("Unsupported content part type: " + type.dump());
|
||||
}
|
||||
common_chat_msg_content_part msg_part;
|
||||
@@ -725,7 +736,6 @@ const char * common_chat_format_name(common_chat_format format) {
|
||||
case COMMON_CHAT_FORMAT_MINIMAX_M2: return "MiniMax-M2";
|
||||
case COMMON_CHAT_FORMAT_GLM_4_5: return "GLM 4.5";
|
||||
case COMMON_CHAT_FORMAT_KIMI_K2: return "Kimi K2";
|
||||
case COMMON_CHAT_FORMAT_QWEN3_CODER_XML: return "Qwen3 Coder";
|
||||
case COMMON_CHAT_FORMAT_APRIEL_1_5: return "Apriel 1.5";
|
||||
case COMMON_CHAT_FORMAT_XIAOMI_MIMO: return "Xiaomi MiMo";
|
||||
case COMMON_CHAT_FORMAT_SOLAR_OPEN: return "Solar Open";
|
||||
@@ -1511,14 +1521,17 @@ static common_chat_params common_chat_params_init_nemotron_v2(const common_chat_
|
||||
return data;
|
||||
}
|
||||
|
||||
static common_chat_params common_chat_params_init_nemotron_v3(const common_chat_template & tmpl, const struct templates_params & inputs) {
|
||||
static common_chat_params common_chat_params_init_qwen3_coder(const common_chat_template & tmpl, const struct templates_params & inputs) {
|
||||
common_chat_params data;
|
||||
|
||||
data.prompt = apply(tmpl, inputs);
|
||||
data.format = COMMON_CHAT_FORMAT_PEG_CONSTRUCTED;
|
||||
|
||||
// Nemotron Nano 3 and Step-3.5-Flash use the Qwen3 Coder tool calling with thinking
|
||||
bool supports_reasoning = (tmpl.source().find("<think>") != std::string::npos);
|
||||
|
||||
// Handle thinking tags appropriately based on inputs.enable_thinking
|
||||
if (string_ends_with(data.prompt, "<think>\n")) {
|
||||
if (supports_reasoning && string_ends_with(data.prompt, "<think>\n")) {
|
||||
if (!inputs.enable_thinking) {
|
||||
data.prompt += "</think>";
|
||||
} else {
|
||||
@@ -1527,19 +1540,21 @@ static common_chat_params common_chat_params_init_nemotron_v3(const common_chat_
|
||||
}
|
||||
|
||||
data.preserved_tokens = {
|
||||
"<think>",
|
||||
"</think>",
|
||||
"<tool_call>",
|
||||
"</tool_call>",
|
||||
};
|
||||
|
||||
if (supports_reasoning) {
|
||||
data.preserved_tokens.insert(data.preserved_tokens.end(), {"<think>", "</think>"});
|
||||
}
|
||||
|
||||
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
|
||||
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
|
||||
auto include_grammar = true;
|
||||
|
||||
auto parser = build_chat_peg_constructed_parser([&](auto & p) {
|
||||
auto reasoning = p.eps();
|
||||
if (inputs.enable_thinking && extract_reasoning) {
|
||||
if (supports_reasoning && inputs.enable_thinking && extract_reasoning) {
|
||||
auto reasoning_content = p.reasoning(p.until("</think>")) + ("</think>" | p.end());
|
||||
if (data.thinking_forced_open) {
|
||||
reasoning = reasoning_content;
|
||||
@@ -1877,38 +1892,6 @@ static common_chat_params common_chat_params_init_minimax_m2(const common_chat_t
|
||||
return data;
|
||||
}
|
||||
|
||||
static common_chat_params common_chat_params_init_qwen3_coder_xml(const common_chat_template & tmpl, const struct templates_params & params) {
|
||||
common_chat_params data;
|
||||
data.grammar_lazy = params.tools.is_array() && !params.tools.empty() && params.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
|
||||
|
||||
data.prompt = apply(tmpl, params);
|
||||
data.format = COMMON_CHAT_FORMAT_QWEN3_CODER_XML;
|
||||
|
||||
data.preserved_tokens = {
|
||||
"<tool_call>",
|
||||
"</tool_call>",
|
||||
"<function=",
|
||||
"</function>",
|
||||
"<parameter=",
|
||||
"</parameter>",
|
||||
};
|
||||
|
||||
// build grammar for tool call
|
||||
static const xml_tool_call_format form {
|
||||
/* form.scope_start = */ "<tool_call>\n",
|
||||
/* form.tool_start = */ "<function=",
|
||||
/* form.tool_sep = */ ">\n",
|
||||
/* form.key_start = */ "<parameter=",
|
||||
/* form.key_val_sep = */ ">\n",
|
||||
/* form.val_end = */ "\n</parameter>\n",
|
||||
/* form.tool_end = */ "</function>\n",
|
||||
/* form.scope_end = */ "</tool_call>",
|
||||
};
|
||||
build_grammar_xml_tool_call(data, params.tools, form);
|
||||
|
||||
return data;
|
||||
}
|
||||
|
||||
static common_chat_params common_chat_params_init_kimi_k2(const common_chat_template & tmpl, const struct templates_params & params) {
|
||||
common_chat_params data;
|
||||
data.grammar_lazy = params.tools.is_array() && !params.tools.empty() && params.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
|
||||
@@ -2032,6 +2015,7 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
|
||||
if (has_reasoning_content && has_tool_calls) {
|
||||
auto adjusted_message = msg;
|
||||
adjusted_message["thinking"] = msg.at("reasoning_content");
|
||||
adjusted_message.erase("content");
|
||||
adjusted_messages.push_back(adjusted_message);
|
||||
} else {
|
||||
adjusted_messages.push_back(msg);
|
||||
@@ -3129,19 +3113,13 @@ static common_chat_params common_chat_templates_apply_jinja(
|
||||
}
|
||||
|
||||
// Qwen3-Coder XML format detection (must come before Hermes 2 Pro)
|
||||
// Detect via explicit XML markers unique to Qwen3-Coder to avoid false positives in other templates.
|
||||
// Require presence of <tool_call>, <function=...>, and <parameter=...> blocks.
|
||||
// Detect via XML markers: <tool_call>, <function=...>, and <parameter=...> blocks.
|
||||
// Also matches Step-3.5-Flash and Nemotron 3 Nano which use the same output format.
|
||||
if (src.find("<tool_call>") != std::string::npos &&
|
||||
src.find("<function>") != std::string::npos &&
|
||||
src.find("<function=") != std::string::npos &&
|
||||
src.find("<parameters>") != std::string::npos &&
|
||||
src.find("<parameter=") != std::string::npos) {
|
||||
workaround::func_args_not_string(params.messages);
|
||||
// Nemotron 3 Nano 30B A3B
|
||||
if (src.find("<think>") != std::string::npos) {
|
||||
return common_chat_params_init_nemotron_v3(tmpl, params);
|
||||
}
|
||||
return common_chat_params_init_qwen3_coder_xml(tmpl, params);
|
||||
return common_chat_params_init_qwen3_coder(tmpl, params);
|
||||
}
|
||||
|
||||
// Xiaomi MiMo format detection (must come before Hermes 2 Pro)
|
||||
@@ -3307,7 +3285,7 @@ static common_chat_params common_chat_templates_apply_legacy(
|
||||
for (const auto & msg : inputs.messages) {
|
||||
auto content = msg.content;
|
||||
for (const auto & part : msg.content_parts) {
|
||||
if (part.type != "text") {
|
||||
if (part.type != "text" && part.type != "media_marker") {
|
||||
LOG_WRN("Ignoring non-text content part: %s\n", part.type.c_str());
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -128,7 +128,6 @@ enum common_chat_format {
|
||||
COMMON_CHAT_FORMAT_GLM_4_5,
|
||||
COMMON_CHAT_FORMAT_MINIMAX_M2,
|
||||
COMMON_CHAT_FORMAT_KIMI_K2,
|
||||
COMMON_CHAT_FORMAT_QWEN3_CODER_XML,
|
||||
COMMON_CHAT_FORMAT_APRIEL_1_5,
|
||||
COMMON_CHAT_FORMAT_XIAOMI_MIMO,
|
||||
COMMON_CHAT_FORMAT_SOLAR_OPEN,
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
// for converting from JSON to jinja values
|
||||
#include <nlohmann/json.hpp>
|
||||
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <cctype>
|
||||
#include <vector>
|
||||
@@ -715,8 +716,46 @@ const func_builtins & value_string_t::get_builtins() const {
|
||||
return args.get_pos(0);
|
||||
}},
|
||||
{"tojson", tojson},
|
||||
{"indent", [](const func_args &) -> value {
|
||||
throw not_implemented_exception("String indent builtin not implemented");
|
||||
{"indent", [](const func_args &args) -> value {
|
||||
args.ensure_count(1, 4);
|
||||
value val_input = args.get_pos(0);
|
||||
value val_width = args.get_kwarg_or_pos("width", 1);
|
||||
const bool first = args.get_kwarg_or_pos("first", 2)->as_bool(); // undefined == false
|
||||
const bool blank = args.get_kwarg_or_pos("blank", 3)->as_bool(); // undefined == false
|
||||
if (!is_val<value_string>(val_input)) {
|
||||
throw raised_exception("indent() first argument must be a string");
|
||||
}
|
||||
std::string indent;
|
||||
if (is_val<value_int>(val_width)) {
|
||||
indent.assign(val_width->as_int(), ' ');
|
||||
} else if (is_val<value_string>(val_width)) {
|
||||
indent = val_width->as_string().str();
|
||||
} else {
|
||||
indent = " ";
|
||||
}
|
||||
std::string indented;
|
||||
std::string input = val_input->as_string().str();
|
||||
std::istringstream iss = std::istringstream(input);
|
||||
std::string line;
|
||||
while (std::getline(iss, line)) {
|
||||
if (!indented.empty()) {
|
||||
indented.push_back('\n');
|
||||
}
|
||||
if ((indented.empty() ? first : (!line.empty() || blank))) {
|
||||
indented += indent;
|
||||
}
|
||||
indented += line;
|
||||
}
|
||||
if (!input.empty() && input.back() == '\n') {
|
||||
indented.push_back('\n');
|
||||
if (blank) {
|
||||
indented += indent;
|
||||
}
|
||||
}
|
||||
|
||||
auto res = mk_val<value_string>(indented);
|
||||
res->val_str.mark_input_based_on(val_input->as_string());
|
||||
return res;
|
||||
}},
|
||||
{"join", [](const func_args &) -> value {
|
||||
throw not_implemented_exception("String join builtin not implemented");
|
||||
|
||||
@@ -1163,6 +1163,9 @@ class TextModel(ModelBase):
|
||||
if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901":
|
||||
# ref: https://huggingface.co/core42/jais-13b
|
||||
res = "jais"
|
||||
if chkhsh == "bc5108ee1eb6a3d600cadd065f63190fbd0554dbc9e4bbd6a0d977970afc8d2a":
|
||||
# ref: https://huggingface.co/inceptionai/Jais-2-8B-Chat
|
||||
res = "jais-2"
|
||||
if chkhsh == "7b3e7548e4308f52a76e8229e4e6cc831195d0d1df43aed21ac6c93da05fec5f":
|
||||
# ref: https://huggingface.co/WisdomShell/CodeShell-7B
|
||||
res = "codeshell"
|
||||
@@ -3730,6 +3733,13 @@ class Ernie4_5Model(TextModel):
|
||||
def set_vocab(self):
|
||||
self._set_vocab_sentencepiece()
|
||||
|
||||
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
|
||||
if tokenizer_config_file.is_file():
|
||||
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
|
||||
tokenizer_config_json = json.load(f)
|
||||
if "add_prefix_space" in tokenizer_config_json:
|
||||
self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
|
||||
@@ -3739,6 +3749,10 @@ class Ernie4_5Model(TextModel):
|
||||
if (head_dim := self.hparams.get("head_dim")) is None:
|
||||
head_dim = self.hparams["hidden_size"] // num_heads
|
||||
|
||||
if "mlp_AR" in name or "vision_model" in name:
|
||||
# skip vision model and projector tensors
|
||||
return
|
||||
|
||||
if "ernie." in name:
|
||||
name = name.replace("ernie.", "model.")
|
||||
# split the qkv weights
|
||||
@@ -3848,6 +3862,48 @@ class Ernie4_5MoeModel(Ernie4_5Model):
|
||||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
@ModelBase.register("PaddleOCRVLForConditionalGeneration")
|
||||
class PaddleOCRModel(Ernie4_5Model):
|
||||
model_arch = gguf.MODEL_ARCH.PADDLEOCR
|
||||
|
||||
|
||||
@ModelBase.register("PaddleOCRVisionModel")
|
||||
class PaddleOCRVisionModel(MmprojModel):
|
||||
# PaddleOCR-VL uses a modified version of Siglip
|
||||
min_pixels: int = 0
|
||||
max_pixels: int = 0
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
assert self.hparams_vision is not None
|
||||
self.min_pixels = self.preprocessor_config["min_pixels"]
|
||||
self.max_pixels = self.preprocessor_config["max_pixels"]
|
||||
self.hparams_vision["image_size"] = int(math.sqrt(self.max_pixels))
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
assert self.hparams_vision is not None
|
||||
hparams = self.hparams_vision
|
||||
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PADDLEOCR)
|
||||
self.gguf_writer.add_vision_max_pixels(self.max_pixels)
|
||||
self.gguf_writer.add_vision_min_pixels(self.min_pixels)
|
||||
self.gguf_writer.add_vision_use_gelu(True)
|
||||
self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("rms_norm_eps", 1e-6))
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
name = name.replace("visual.", "model.")
|
||||
|
||||
if "vision_model" in name or "mlp_AR" in name:
|
||||
if "packing_position_embedding" in name:
|
||||
return # unused
|
||||
elif "vision_model.head" in name:
|
||||
# we don't yet support image embeddings for this model
|
||||
return
|
||||
else:
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
return # skip other tensors
|
||||
|
||||
|
||||
@ModelBase.register(
|
||||
"Qwen2VLModel",
|
||||
"Qwen2VLForConditionalGeneration",
|
||||
@@ -8633,6 +8689,17 @@ class T5EncoderModel(TextModel):
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("Jais2ForCausalLM")
|
||||
class Jais2Model(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.JAIS2
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
hparams = self.hparams
|
||||
head_dim = hparams.get("head_dim", hparams["hidden_size"] // hparams["num_attention_heads"])
|
||||
self.gguf_writer.add_rope_dimension_count(head_dim)
|
||||
|
||||
|
||||
@ModelBase.register("JAISLMHeadModel")
|
||||
class JaisModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.JAIS
|
||||
@@ -10726,7 +10793,7 @@ class LFM2Model(TextModel):
|
||||
def set_gguf_parameters(self):
|
||||
# set num_key_value_heads only for attention layers
|
||||
self.hparams["num_key_value_heads"] = [
|
||||
self.hparams["num_key_value_heads"] if layer_type == "full_attention" else 0
|
||||
self.hparams["num_key_value_heads"] if layer_type != "conv" else 0
|
||||
for layer_type in self.hparams["layer_types"]
|
||||
]
|
||||
|
||||
@@ -10912,6 +10979,28 @@ class LFM2AudioModel(ConformerAudioModel):
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("Lfm25AudioTokenizer")
|
||||
class LFM25AudioTokenizer(LFM2Model):
|
||||
model_arch = gguf.MODEL_ARCH.LFM2
|
||||
|
||||
def set_vocab(self):
|
||||
self._set_vocab_none()
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
|
||||
self.gguf_writer.add_embedding_length_out(self.hparams["output_size"])
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
if name == "istft.window" or name.startswith("emb.emb"):
|
||||
return
|
||||
|
||||
if name.startswith("lin"):
|
||||
name = name.replace("lin", "dense_2_out")
|
||||
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("SmallThinkerForCausalLM")
|
||||
class SmallThinkerModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.SMALLTHINKER
|
||||
@@ -11003,13 +11092,17 @@ class ModernBertModel(BertModel):
|
||||
self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
# these layers act as MLM head, so we don't need them
|
||||
if name.startswith("decoder."):
|
||||
return
|
||||
|
||||
if name.startswith("model."):
|
||||
name = name[6:]
|
||||
|
||||
if self.cls_out_labels:
|
||||
# For BertForSequenceClassification (direct projection layer)
|
||||
if name == "classifier.weight":
|
||||
name = "classifier.out_proj.weight"
|
||||
|
||||
if name == "classifier.bias":
|
||||
name = "classifier.out_proj.bias"
|
||||
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
|
||||
@@ -114,6 +114,7 @@ models = [
|
||||
{"name": "gemma", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2b", },
|
||||
{"name": "gemma-2", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", },
|
||||
{"name": "jais", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", },
|
||||
{"name": "jais-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inceptionai/Jais-2-8B-Chat", },
|
||||
{"name": "t5", "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", },
|
||||
{"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", },
|
||||
{"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
|
||||
|
||||
@@ -31,7 +31,7 @@ Legend:
|
||||
| CONV_3D | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||
| COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ |
|
||||
| COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
|
||||
| CROSS_ENTROPY_LOSS | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
@@ -96,13 +96,13 @@ Legend:
|
||||
| SIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||
| SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ |
|
||||
| SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
|
||||
| SOLVE_TRI | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
|
||||
| SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ | ❌ | ❌ |
|
||||
| SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ | ❌ | ❌ |
|
||||
| SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| SSM_CONV | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
|
||||
| STEP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
|
||||
@@ -8760,22 +8760,14 @@
|
||||
"WebGPU: WebGPU","ADD_ID","type_a=f32,type_b=f32,n_embd=129,n_experts=8,n_experts_used=4,n_token=1","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","ADD_ID","type_a=f32,type_b=f32,n_embd=129,n_experts=8,n_experts_used=4,n_token=32","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","ADD_ID","type_a=f32,type_b=f32,n_embd=129,n_experts=8,n_experts_used=4,n_token=129","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","SQR","type=f16,ne=[10,5,4,3]","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","SQRT","type=f16,ne=[10,3,3,2]","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","LOG","type=f16,ne=[10,5,4,3]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","SIN","type=f16,ne=[10,2,2,2]","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","COS","type=f16,ne=[10,2,2,2]","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","CLAMP","type=f16,ne=[10,5,4,3],min=-0.500000,max=0.500000","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","LEAKY_RELU","type=f16,ne_a=[10,5,4,3],negative_slope=0.100000","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","FLOOR","type=f16,ne=[10,2,2,2]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","CEIL","type=f16,ne=[10,2,2,2]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","ROUND","type=f16,ne=[10,2,2,2]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","TRUNC","type=f16,ne=[10,2,2,2]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","SQR","type=f16,ne=[7,1,5,3]","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","SQRT","type=f16,ne=[7,1,5,3]","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","LOG","type=f16,ne=[7,1,5,3]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","SIN","type=f16,ne=[7,1,5,3]","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","COS","type=f16,ne=[7,1,5,3]","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","CLAMP","type=f16,ne=[7,1,5,3],min=-0.500000,max=0.500000","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","LEAKY_RELU","type=f16,ne_a=[7,1,5,3],negative_slope=0.100000","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","FLOOR","type=f16,ne=[7,1,5,3]","support","1","yes","WebGPU"
|
||||
@@ -8786,22 +8778,14 @@
|
||||
"WebGPU: WebGPU","ROUND","type=f16,ne=[1024,1024,1,1]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","TRUNC","type=f16,ne=[7,1,5,3]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","TRUNC","type=f16,ne=[1024,1024,1,1]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","SQR","type=f32,ne=[10,5,4,3]","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","SQRT","type=f32,ne=[10,3,3,2]","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","LOG","type=f32,ne=[10,5,4,3]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","SIN","type=f32,ne=[10,2,2,2]","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","COS","type=f32,ne=[10,2,2,2]","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","CLAMP","type=f32,ne=[10,5,4,3],min=-0.500000,max=0.500000","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","LEAKY_RELU","type=f32,ne_a=[10,5,4,3],negative_slope=0.100000","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","FLOOR","type=f32,ne=[10,2,2,2]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","CEIL","type=f32,ne=[10,2,2,2]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","ROUND","type=f32,ne=[10,2,2,2]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","TRUNC","type=f32,ne=[10,2,2,2]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","SQR","type=f32,ne=[7,1,5,3]","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","SQRT","type=f32,ne=[7,1,5,3]","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","LOG","type=f32,ne=[7,1,5,3]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","SIN","type=f32,ne=[7,1,5,3]","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","COS","type=f32,ne=[7,1,5,3]","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","CLAMP","type=f32,ne=[7,1,5,3],min=-0.500000,max=0.500000","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","LEAKY_RELU","type=f32,ne_a=[7,1,5,3],negative_slope=0.100000","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","FLOOR","type=f32,ne=[7,1,5,3]","support","1","yes","WebGPU"
|
||||
@@ -18901,3 +18885,27 @@
|
||||
"WebGPU: WebGPU","CROSS_ENTROPY_LOSS_BACK","type=f32,ne=[30000,1,1,1]","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","OPT_STEP_ADAMW","type=f32,ne=[10,5,4,3]","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","OPT_STEP_SGD","type=f32,ne=[10,5,4,3]","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","SQR","type=f16,ne=[10,5,4,3]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","SQRT","type=f16,ne=[10,3,3,2]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","SIN","type=f16,ne=[10,2,2,2]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","COS","type=f16,ne=[10,2,2,2]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","SQR","type=f16,ne=[7,1,5,3]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","SQR","type=f16,ne=[1024,1024,1,1]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","SQRT","type=f16,ne=[7,1,5,3]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","SQRT","type=f16,ne=[1024,1024,1,1]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","SIN","type=f16,ne=[7,1,5,3]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","SIN","type=f16,ne=[1024,1024,1,1]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","COS","type=f16,ne=[7,1,5,3]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","COS","type=f16,ne=[1024,1024,1,1]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","SQR","type=f32,ne=[10,5,4,3]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","SQRT","type=f32,ne=[10,3,3,2]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","SIN","type=f32,ne=[10,2,2,2]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","COS","type=f32,ne=[10,2,2,2]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","SQR","type=f32,ne=[7,1,5,3]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","SQR","type=f32,ne=[1024,1024,1,1]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","SQRT","type=f32,ne=[7,1,5,3]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","SQRT","type=f32,ne=[1024,1024,1,1]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","SIN","type=f32,ne=[7,1,5,3]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","SIN","type=f32,ne=[1024,1024,1,1]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","COS","type=f32,ne=[7,1,5,3]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","COS","type=f32,ne=[1024,1024,1,1]","support","1","yes","WebGPU"
|
||||
|
||||
|
Can't render this file because it is too large.
|
@@ -171,15 +171,9 @@
|
||||
#elif defined(__riscv)
|
||||
// quants.c
|
||||
#define quantize_row_q8_K_generic quantize_row_q8_K
|
||||
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
|
||||
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
|
||||
#define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K
|
||||
#define ggml_vec_dot_iq2_xs_q8_K_generic ggml_vec_dot_iq2_xs_q8_K
|
||||
#define ggml_vec_dot_iq2_s_q8_K_generic ggml_vec_dot_iq2_s_q8_K
|
||||
#define ggml_vec_dot_iq3_xxs_q8_K_generic ggml_vec_dot_iq3_xxs_q8_K
|
||||
#define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
|
||||
#define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
|
||||
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
|
||||
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
|
||||
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
|
||||
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
|
||||
|
||||
@@ -1954,3 +1954,773 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
#endif
|
||||
}
|
||||
|
||||
static const uint8_t sign_gather_indices_arr[64] = {
|
||||
0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1, 2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,
|
||||
4,4,4,4,4,4,4,4, 5,5,5,5,5,5,5,5, 6,6,6,6,6,6,6,6, 7,7,7,7,7,7,7,7
|
||||
};
|
||||
|
||||
static const uint8_t sign_bit_masks_arr[64] = {
|
||||
1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128,
|
||||
1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128
|
||||
};
|
||||
|
||||
static void ggml_vec_dot_iq2_s_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
assert(n % QK_K == 0);
|
||||
UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs);
|
||||
|
||||
const block_iq2_s * GGML_RESTRICT x = vx;
|
||||
const block_q8_K * GGML_RESTRICT y = vy;
|
||||
|
||||
const int nb = n / QK_K;
|
||||
const uint64_t * grid64 = (const uint64_t *)iq2s_grid;
|
||||
|
||||
// --- Pre-load Constants ---
|
||||
uint16_t gather_qh_arr[8] = {0, 0, 0, 0, 1, 1, 1, 1};
|
||||
vuint16mf2_t v_gather_qh = __riscv_vle16_v_u16mf2(gather_qh_arr, 8);
|
||||
uint16_t shift_qh_arr[8] = {11, 9, 7, 5, 11, 9, 7, 5};
|
||||
vuint16mf2_t v_shift_qh = __riscv_vle16_v_u16mf2(shift_qh_arr, 8);
|
||||
|
||||
// Constants for sign extraction
|
||||
vuint8m2_t v_sign_gather_indices = __riscv_vle8_v_u8m2(sign_gather_indices_arr, 64);
|
||||
vuint8m2_t v_sign_masks = __riscv_vle8_v_u8m2(sign_bit_masks_arr, 64);
|
||||
|
||||
float sumf = 0.0f;
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float combined_scale = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
|
||||
const uint8_t * GGML_RESTRICT qs = x[i].qs;
|
||||
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
||||
const uint8_t * GGML_RESTRICT scales = x[i].scales;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
|
||||
const uint8_t * signs_ptr = qs + 32;
|
||||
|
||||
float sum_block = 0.0f;
|
||||
|
||||
for (int ib = 0; ib < 4; ++ib) {
|
||||
// Combine low + high bits
|
||||
vuint8mf4_t v_qs_u8 = __riscv_vle8_v_u8mf4(qs, 8);
|
||||
qs += 8;
|
||||
uint16_t qh_val;
|
||||
memcpy(&qh_val, qh, 2);
|
||||
qh += 2;
|
||||
vuint8mf8_t v_qh_raw = __riscv_vle8_v_u8mf8((const uint8_t*)&qh_val, 2);
|
||||
vuint16mf4_t v_qh_u16 = __riscv_vwcvtu_x_x_v_u16mf4(v_qh_raw, 2);
|
||||
vuint16mf2_t v_qh_u16_ext = __riscv_vlmul_ext_v_u16mf4_u16mf2(v_qh_u16);
|
||||
vuint16mf2_t v_qh_expanded = __riscv_vrgather_vv_u16mf2(v_qh_u16_ext, v_gather_qh, 8);
|
||||
v_qh_expanded = __riscv_vsll_vv_u16mf2(v_qh_expanded, v_shift_qh, 8);
|
||||
|
||||
// Mask: We want bits 11-12. 0x1800 = 0001 1000 0000 0000
|
||||
v_qh_expanded = __riscv_vand_vx_u16mf2(v_qh_expanded, 0x1800, 8);
|
||||
vuint16mf2_t v_qs_u16 = __riscv_vwcvtu_x_x_v_u16mf2(v_qs_u8, 8);
|
||||
|
||||
// Multiply by 8 to get byte offset, instead of element offset
|
||||
v_qs_u16 = __riscv_vsll_vx_u16mf2(v_qs_u16, 3, 8);
|
||||
vuint16mf2_t v_grid_offsets = __riscv_vor_vv_u16mf2(v_qs_u16, v_qh_expanded, 8);
|
||||
|
||||
// Lookup Grid using Byte Offsets
|
||||
vuint64m2_t v_grid_vals = __riscv_vluxei16_v_u64m2(grid64, v_grid_offsets, 8);
|
||||
|
||||
vuint8m2_t v_grid_u8 = __riscv_vreinterpret_v_u64m2_u8m2(v_grid_vals);
|
||||
vint8m2_t v_grid_i8 = __riscv_vreinterpret_v_u8m2_i8m2(v_grid_u8);
|
||||
|
||||
// Load signs and generate sign mask
|
||||
vuint8mf4_t v_signs_raw = __riscv_vle8_v_u8mf4(signs_ptr, 8);
|
||||
signs_ptr += 8;
|
||||
|
||||
vuint8m2_t v_signs_source = __riscv_vlmul_ext_v_u8mf4_u8m2(v_signs_raw);
|
||||
vuint8m2_t v_signs_bcast = __riscv_vrgather_vv_u8m2(v_signs_source, v_sign_gather_indices, 64);
|
||||
|
||||
vuint8m2_t v_sign_bits = __riscv_vand_vv_u8m2(v_signs_bcast, v_sign_masks, 64);
|
||||
vbool4_t m_negative = __riscv_vmsne_vx_u8m2_b4(v_sign_bits, 0, 64);
|
||||
|
||||
vint8m2_t v_q8 = __riscv_vle8_v_i8m2(q8, 64);
|
||||
q8 += 64;
|
||||
|
||||
vint8m2_t v_q8_signed = __riscv_vrsub_vx_i8m2_mu(m_negative, v_q8, v_q8, 0, 64);
|
||||
vint16m4_t v_dot = __riscv_vwmul_vv_i16m4(v_grid_i8, v_q8_signed, 64);
|
||||
|
||||
vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, 1);
|
||||
|
||||
int32_t s0 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1(
|
||||
__riscv_vget_v_i16m4_i16m1(v_dot, 0), v_zero, 16));
|
||||
int32_t s1 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1(
|
||||
__riscv_vget_v_i16m4_i16m1(v_dot, 1), v_zero, 16));
|
||||
int32_t s2 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1(
|
||||
__riscv_vget_v_i16m4_i16m1(v_dot, 2), v_zero, 16));
|
||||
int32_t s3 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1(
|
||||
__riscv_vget_v_i16m4_i16m1(v_dot, 3), v_zero, 16));
|
||||
|
||||
uint8_t sc0 = scales[0];
|
||||
uint8_t sc1 = scales[1];
|
||||
scales += 2;
|
||||
|
||||
sum_block += s0 * (2 * (sc0 & 0xF) + 1);
|
||||
sum_block += s1 * (2 * (sc0 >> 4) + 1);
|
||||
sum_block += s2 * (2 * (sc1 & 0xF) + 1);
|
||||
sum_block += s3 * (2 * (sc1 >> 4) + 1);
|
||||
}
|
||||
sumf += sum_block * combined_scale;
|
||||
}
|
||||
*s = 0.125f * sumf;
|
||||
}
|
||||
|
||||
// Dot product of one iq2_s row with one q8_K row, tuned for VLEN == 128.
// Each superblock of QK_K weights is handled in 8 chunks of 32 weights:
// per chunk, 10-bit grid indices are rebuilt from qs (low 8 bits) and qh
// (high 2 bits), 8-byte grid entries are gathered from iq2s_grid, sign
// bits are expanded to a lane mask that negates the matching q8 values,
// and the widened products are reduced and weighted by the 4-bit
// sub-scales. Result is scaled by the shared 0.125f grid factor.
static void ggml_vec_dot_iq2_s_q8_K_vl128(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs);

    const block_iq2_s * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;  // number of superblocks
    // View the grid as 64-bit entries: one entry packs 8 int8 weights.
    const uint64_t * grid64 = (const uint64_t *)iq2s_grid;

    // Pre-load Constants
    // v_sign_gather_indices[i] = i/8 : broadcast each sign byte over 8 lanes.
    // v_sign_masks[i] = 1 << (i%8)   : selects that lane's sign bit.
    vuint8m2_t v_ids = __riscv_vid_v_u8m2(32);
    vuint8m2_t v_sign_gather_indices = __riscv_vsrl_vx_u8m2(v_ids, 3, 32);
    vuint8m2_t v_ones = __riscv_vmv_v_x_u8m2(1, 32);
    vuint8m2_t v_shift_amts = __riscv_vand_vx_u8m2(v_ids, 7, 32);
    vuint8m2_t v_sign_masks = __riscv_vsll_vv_u8m2(v_ones, v_shift_amts, 32);
    // Left shifts that move each mini-block's 2 high bits of qh into
    // bit positions 11..12 (see the 0x1800 mask below).
    uint16_t shift_qh_arr[4] = {11, 9, 7, 5};
    vuint16mf2_t v_shift_qh = __riscv_vle16_v_u16mf2(shift_qh_arr, 4);

    float sumf = 0.0f;

    for (int i = 0; i < nb; ++i) {
        // Per-superblock scale: fp16 block scale times the q8_K scale.
        const float combined_scale = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;

        const uint8_t * GGML_RESTRICT qs = x[i].qs;
        const uint8_t * GGML_RESTRICT qh = x[i].qh;
        const uint8_t * GGML_RESTRICT scales = x[i].scales;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;

        // Sign bytes are stored after the 32 qs bytes of the block.
        const uint8_t * signs_ptr = qs + 32;
        float sum_block = 0.0f;

        for (int ib = 0; ib < 8; ++ib) {

            // Load Low Bits [4 bytes]
            vuint8mf4_t v_qs_u8 = __riscv_vle8_v_u8mf4(qs, 4);
            qs += 4;

            // Load 1 byte. It contains bits for 4 mini-blocks.
            uint8_t qh_val = *qh++;

            // Combine Low + High bits of 10bit indices
            vuint8mf4_t v_qh_raw = __riscv_vmv_v_x_u8mf4(qh_val, 4);
            vuint16mf2_t v_qh_u16 = __riscv_vwcvtu_x_x_v_u16mf2(v_qh_raw, 4);
            vuint16mf2_t v_qh_mf2 = __riscv_vsll_vv_u16mf2(v_qh_u16, v_shift_qh, 4);
            // Keep only bits 11..12 (the high index bits, pre-scaled by 8).
            v_qh_mf2 = __riscv_vand_vx_u16mf2(v_qh_mf2, 0x1800, 4);
            vuint16mf2_t v_qs_u16_mf2 = __riscv_vwcvtu_x_x_v_u16mf2(v_qs_u8, 4);
            // << 3: convert the element index into a byte offset (8-byte entries).
            vuint16mf2_t v_qs_u16 = __riscv_vsll_vx_u16mf2(v_qs_u16_mf2, 3, 4);
            vuint16mf2_t v_grid_offsets = __riscv_vor_vv_u16mf2(v_qs_u16, v_qh_mf2, 4);

            // Lookup Grid (indexed gather of 64-bit entries, i.e. 8 weights each)
            vint8m2_t v_grid_i8 = __riscv_vreinterpret_v_u8m2_i8m2(__riscv_vreinterpret_v_u64m2_u8m2(__riscv_vluxei16_v_u64m2(grid64, v_grid_offsets, 4)));

            vuint8mf4_t v_signs_raw = __riscv_vle8_v_u8mf4(signs_ptr, 4);
            signs_ptr += 4;
            vuint8m2_t v_signs_source = __riscv_vlmul_ext_v_u8mf4_u8m2(v_signs_raw);
            vuint8m2_t v_signs_bcast = __riscv_vrgather_vv_u8m2(v_signs_source, v_sign_gather_indices, 32);

            // generating sign mask: lane is "negative" when its sign bit is set
            vuint8m2_t v_sign_bits = __riscv_vand_vv_u8m2(v_signs_bcast, v_sign_masks, 32);
            vbool4_t m_negative = __riscv_vmsne_vx_u8m2_b4(v_sign_bits, 0, 32);

            vint8m2_t v_q8 = __riscv_vle8_v_i8m2(q8, 32);
            q8 += 32;

            // apply signs: masked 0 - q8 negates only the flagged lanes
            vint8m2_t v_q8_signed = __riscv_vrsub_vx_i8m2_mu(m_negative,v_q8, v_q8, 0, 32);
            vint16m4_t v_dot = __riscv_vwmul_vv_i16m4(v_grid_i8, v_q8_signed, 32);

            // Reduction
            vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, 1);

            // Reduce 0-15 (First Half)
            int32_t s0 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(
                __riscv_vget_v_i16m4_i16m2(v_dot, 0), v_zero, 16));

            // Reduce 16-31 (Second Half)
            int32_t s1 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(
                __riscv_vget_v_i16m4_i16m2(v_dot, 1), v_zero, 16));

            // Apply sub Scales: each nibble encodes scale as (2*nibble + 1)
            uint8_t sc = *scales++;

            sum_block += s0 * (2 * (sc & 0xF) + 1);
            sum_block += s1 * (2 * (sc >> 4) + 1);
        }
        sumf += sum_block * combined_scale;
    }
    // 0.125f compensates for the fixed scaling baked into iq2s_grid.
    *s = 0.125f * sumf;
}
|
||||
|
||||
// Dispatch the iq2_s x q8_K dot product to the kernel specialized for the
// runtime vector length; fall back to the generic scalar implementation
// when RVV intrinsics are unavailable or the VLEN has no specialization.
void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
#if defined __riscv_v_intrinsic
    const size_t vlen_bits = __riscv_vlenb() * 8;
    if (vlen_bits == 128) {
        ggml_vec_dot_iq2_s_q8_K_vl128(n, s, bs, vx, bx, vy, by, nrc);
    } else if (vlen_bits == 256) {
        ggml_vec_dot_iq2_s_q8_K_vl256(n, s, bs, vx, bx, vy, by, nrc);
    } else {
        ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
    }
#else
    ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
|
||||
|
||||
// Dot product of one iq3_s row with one q8_K row, tuned for VLEN == 256.
// Per superblock, 4 chunks of 64 weights are processed: 9-bit grid indices
// are rebuilt from qs (8 low bits) and one qh bit each, 4-byte grid entries
// are gathered from iq3s_grid, the packed sign bits negate the matching q8
// lanes, and the reduced products are weighted by the 4-bit sub-scales.
static void ggml_vec_dot_iq3_s_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_iq3_s * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;  // number of superblocks

    const uint64_t * grid64 = (const uint64_t *)iq3s_grid;

    // --- Pre-load Constants ---
    // Per-lane bit positions used to isolate one qh bit per mini-block.
    const uint16_t qh_bit_shifts_arr[16] = {
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
    };
    // Shared tables (defined elsewhere in this file): broadcast index i/8
    // and bit mask 1 << (i%8) for expanding packed sign bytes to 64 lanes.
    vuint8m2_t v_sign_gather_indices = __riscv_vle8_v_u8m2(sign_gather_indices_arr, 64);
    vuint8m2_t v_sign_masks = __riscv_vle8_v_u8m2(sign_bit_masks_arr, 64);
    vuint16m1_t v_qh_shifts = __riscv_vle16_v_u16m1(qh_bit_shifts_arr, 16);

    float sumf = 0.0f;

    for (int i = 0; i < nb; ++i) {
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d);
        // Per-superblock scale: fp16 block scale times the q8_K scale.
        const float combined_scale = d * y[i].d;

        const uint8_t * GGML_RESTRICT qs = x[i].qs;
        const uint8_t * GGML_RESTRICT qh = x[i].qh;
        const uint8_t * GGML_RESTRICT scales = x[i].scales;
        const uint8_t * GGML_RESTRICT signs = x[i].signs;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;

        float sum_block = 0.0f;

        // Loop: Process 64 weights (16 mini-blocks of 4) per iteration
        for (int ib = 0; ib < 4; ++ib) {

            vuint8mf2_t v_qs_u8 = __riscv_vle8_v_u8mf2(qs, 16);
            qs += 16;

            // 16 high bits, one per mini-block, packed into two bytes.
            uint16_t qh_val;
            memcpy(&qh_val, qh, 2);
            qh += 2;

            vuint16m1_t v_qh_val = __riscv_vmv_v_x_u16m1(qh_val, 16);
            // Extract bits: (qh >> i) & 1
            v_qh_val = __riscv_vsrl_vv_u16m1(v_qh_val, v_qh_shifts, 16);
            v_qh_val = __riscv_vand_vx_u16m1(v_qh_val, 1, 16);

            // Byte offset into the grid: (qs | qh<<8) * 4 (4-byte entries).
            vuint16m1_t v_qs_u16 = __riscv_vwcvtu_x_x_v_u16m1(v_qs_u8, 16);
            v_qs_u16 = __riscv_vsll_vx_u16m1(v_qs_u16, 2, 16);
            v_qh_val = __riscv_vsll_vx_u16m1(v_qh_val, 10, 16);
            vuint16m1_t v_grid_offsets = __riscv_vor_vv_u16m1(v_qs_u16, v_qh_val, 16);

            // Grid value is 4xuint8
            vuint32m2_t v_grid_packed = __riscv_vluxei16_v_u32m2((const uint32_t *)grid64, v_grid_offsets, 16);
            vuint8m2_t v_grid_u8 = __riscv_vreinterpret_v_u32m2_u8m2(v_grid_packed);
            vuint8mf4_t v_signs_raw = __riscv_vle8_v_u8mf4(signs, 8);
            signs += 8;

            // Generate sign mask: broadcast each sign byte over 8 lanes,
            // isolate its per-lane bit, flag the set lanes as negative.
            vuint8m2_t v_signs_source = __riscv_vlmul_ext_v_u8mf4_u8m2(v_signs_raw);
            vuint8m2_t v_signs_bcast = __riscv_vrgather_vv_u8m2(v_signs_source, v_sign_gather_indices, 64);
            vuint8m2_t v_sign_bits = __riscv_vand_vv_u8m2(v_signs_bcast, v_sign_masks, 64);
            vbool4_t m_negative = __riscv_vmsne_vx_u8m2_b4(v_sign_bits, 0, 64);

            vint8m2_t v_q8 = __riscv_vle8_v_i8m2(q8, 64);
            q8 += 64;

            // Apply Signs (masked 0 - q8), then widening signed*unsigned mul
            vint8m2_t v_q8_signed = __riscv_vrsub_vx_i8m2_mu(m_negative, v_q8, v_q8, 0, 64);
            vint16m4_t v_dot = __riscv_vwmulsu_vv_i16m4(v_q8_signed, v_grid_u8, 64);

            // Reduction over each half of 32 products
            vint16m2_t v_dot_lo = __riscv_vget_v_i16m4_i16m2(v_dot, 0);
            vint16m2_t v_dot_hi = __riscv_vget_v_i16m4_i16m2(v_dot, 1);
            vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, 1);

            int32_t s_lo = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(v_dot_lo, v_zero, 32));
            int32_t s_hi = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(v_dot_hi, v_zero, 32));

            // Apply sub-scales: each nibble encodes the scale as 2*nibble + 1
            uint8_t sc_byte = *scales++;
            int sc_lo = (sc_byte & 0xF) * 2 + 1;
            int sc_hi = (sc_byte >> 4) * 2 + 1;

            sum_block += s_lo * sc_lo + s_hi * sc_hi;
        }
        sumf += sum_block * combined_scale;
    }
    // 0.125f compensates for the fixed scaling baked into the grid values.
    *s = 0.125f * sumf;
}
|
||||
|
||||
// Dispatch the iq3_s x q8_K dot product: use the VLEN-256 specialized
// kernel when the runtime vector length matches, otherwise (or when RVV
// intrinsics are not compiled in) run the generic implementation.
void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
#if defined __riscv_v_intrinsic
    if (__riscv_vlenb() * 8 == 256) {
        ggml_vec_dot_iq3_s_q8_K_vl256(n, s, bs, vx, bx, vy, by, nrc);
    } else {
        ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
    }
#else
    ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
|
||||
|
||||
// Dot product of one tq1_0 (ternary, base-3 packed) row with one q8_K row,
// tuned for VLEN == 256. Each packed byte holds several trits; multiplying
// the byte by 3^k and then by 3 with a >> 8 extracts the k-th base-3 digit
// in {0,1,2}, which is shifted to {-1,0,1} by subtracting 1 before the
// multiply-accumulate against q8. The three scopes below cover the three
// packing regions of the block (qs[0..31], qs[32..47], qh).
static void ggml_vec_dot_tq1_0_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_tq1_0 * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;  // number of superblocks

    float sumf = 0.0f;
    // Powers of 3 replicated x4: digit selectors for the qh region, where
    // each of the 4 replicated qh bytes contributes a different digit.
    uint8_t pow[16] = {1, 1, 1, 1, 3, 3, 3, 3, 9, 9, 9, 9, 27, 27, 27, 27};

    for (int i = 0; i < nb; i++) {
        // First loop: qs[0..31], 5 ternary digits per byte -> 160 weights.
        vint32m4_t suml1;
        {
            const int vl = 32;
            vuint8m1_t tq = __riscv_vle8_v_u8m1(x[i].qs, vl);

            // tqK = digit k of each byte: ((tq * 3^k) * 3) >> 8, in {0,1,2}.
            vuint16m2_t tq0 = __riscv_vsrl_vx_u16m2(__riscv_vwmulu_vx_u16m2(tq, 3, vl), 8, vl);
            vuint16m2_t tq1 = __riscv_vsrl_vx_u16m2(__riscv_vwmulu_vx_u16m2(__riscv_vmul_vx_u8m1(tq, 3, vl), 3, vl), 8, vl);
            vuint16m2_t tq2 = __riscv_vsrl_vx_u16m2(__riscv_vwmulu_vx_u16m2(__riscv_vmul_vx_u8m1(tq, 9, vl), 3, vl), 8, vl);
            vuint16m2_t tq3 = __riscv_vsrl_vx_u16m2(__riscv_vwmulu_vx_u16m2(__riscv_vmul_vx_u8m1(tq, 27, vl), 3, vl), 8, vl);
            vuint16m2_t tq4 = __riscv_vsrl_vx_u16m2(__riscv_vwmulu_vx_u16m2(__riscv_vmul_vx_u8m1(tq, 81, vl), 3, vl), 8, vl);

            // Matching q8 lanes, widened to 16 bits.
            vint16m2_t q80 = __riscv_vwcvt_x_x_v_i16m2(__riscv_vle8_v_i8m1(y[i].qs + 0, vl), vl);
            vint16m2_t q81 = __riscv_vwcvt_x_x_v_i16m2(__riscv_vle8_v_i8m1(y[i].qs + 32, vl), vl);
            vint16m2_t q82 = __riscv_vwcvt_x_x_v_i16m2(__riscv_vle8_v_i8m1(y[i].qs + 64, vl), vl);
            vint16m2_t q83 = __riscv_vwcvt_x_x_v_i16m2(__riscv_vle8_v_i8m1(y[i].qs + 96, vl), vl);
            vint16m2_t q84 = __riscv_vwcvt_x_x_v_i16m2(__riscv_vle8_v_i8m1(y[i].qs + 128, vl), vl);

            // (digit - 1) in {-1,0,1} times q8.
            vint16m2_t sum0 = __riscv_vmul_vv_i16m2(__riscv_vreinterpret_v_u16m2_i16m2(__riscv_vsub_vx_u16m2(tq0, 1, vl)), q80, vl);
            vint16m2_t sum1 = __riscv_vmul_vv_i16m2(__riscv_vreinterpret_v_u16m2_i16m2(__riscv_vsub_vx_u16m2(tq1, 1, vl)), q81, vl);
            vint16m2_t sum2 = __riscv_vmul_vv_i16m2(__riscv_vreinterpret_v_u16m2_i16m2(__riscv_vsub_vx_u16m2(tq2, 1, vl)), q82, vl);
            vint16m2_t sum3 = __riscv_vmul_vv_i16m2(__riscv_vreinterpret_v_u16m2_i16m2(__riscv_vsub_vx_u16m2(tq3, 1, vl)), q83, vl);
            vint16m2_t sum4 = __riscv_vmul_vv_i16m2(__riscv_vreinterpret_v_u16m2_i16m2(__riscv_vsub_vx_u16m2(tq4, 1, vl)), q84, vl);

            vint32m4_t sumi0 = __riscv_vwadd_vv_i32m4(sum0, sum1, vl);
            vint32m4_t sumi1 = __riscv_vwadd_vv_i32m4(sum2, sum3, vl);
            suml1 = __riscv_vadd_vv_i32m4(__riscv_vwcvt_x_x_v_i32m4(sum4, vl), __riscv_vadd_vv_i32m4(sumi0, sumi1, vl), vl);
        }

        // Second loop: qs[32..47], same digit extraction at half width -> 80 weights.
        vint32m2_t suml2;
        {
            const int vl = 16;
            vuint8mf2_t tq = __riscv_vle8_v_u8mf2(x[i].qs + 32, vl);

            vuint16m1_t tq0 = __riscv_vsrl_vx_u16m1(__riscv_vwmulu_vx_u16m1(tq, 3 * 1, vl), 8, vl);
            vuint16m1_t tq1 = __riscv_vsrl_vx_u16m1(__riscv_vwmulu_vx_u16m1(__riscv_vmul_vx_u8mf2(tq, 3, vl), 3, vl), 8, vl);
            vuint16m1_t tq2 = __riscv_vsrl_vx_u16m1(__riscv_vwmulu_vx_u16m1(__riscv_vmul_vx_u8mf2(tq, 9, vl), 3, vl), 8, vl);
            vuint16m1_t tq3 = __riscv_vsrl_vx_u16m1(__riscv_vwmulu_vx_u16m1(__riscv_vmul_vx_u8mf2(tq, 27, vl), 3, vl), 8, vl);
            vuint16m1_t tq4 = __riscv_vsrl_vx_u16m1(__riscv_vwmulu_vx_u16m1(__riscv_vmul_vx_u8mf2(tq, 81, vl), 3, vl), 8, vl);

            vint16m1_t q80 = __riscv_vwcvt_x_x_v_i16m1(__riscv_vle8_v_i8mf2(y[i].qs + 160, vl), vl);
            vint16m1_t q81 = __riscv_vwcvt_x_x_v_i16m1(__riscv_vle8_v_i8mf2(y[i].qs + 176, vl), vl);
            vint16m1_t q82 = __riscv_vwcvt_x_x_v_i16m1(__riscv_vle8_v_i8mf2(y[i].qs + 192, vl), vl);
            vint16m1_t q83 = __riscv_vwcvt_x_x_v_i16m1(__riscv_vle8_v_i8mf2(y[i].qs + 208, vl), vl);
            vint16m1_t q84 = __riscv_vwcvt_x_x_v_i16m1(__riscv_vle8_v_i8mf2(y[i].qs + 224, vl), vl);

            vint16m1_t sum0 = __riscv_vmul_vv_i16m1(__riscv_vreinterpret_v_u16m1_i16m1(__riscv_vsub_vx_u16m1(tq0, 1, vl)), q80, vl);
            vint16m1_t sum1 = __riscv_vmul_vv_i16m1(__riscv_vreinterpret_v_u16m1_i16m1(__riscv_vsub_vx_u16m1(tq1, 1, vl)), q81, vl);
            vint16m1_t sum2 = __riscv_vmul_vv_i16m1(__riscv_vreinterpret_v_u16m1_i16m1(__riscv_vsub_vx_u16m1(tq2, 1, vl)), q82, vl);
            vint16m1_t sum3 = __riscv_vmul_vv_i16m1(__riscv_vreinterpret_v_u16m1_i16m1(__riscv_vsub_vx_u16m1(tq3, 1, vl)), q83, vl);
            vint16m1_t sum4 = __riscv_vmul_vv_i16m1(__riscv_vreinterpret_v_u16m1_i16m1(__riscv_vsub_vx_u16m1(tq4, 1, vl)), q84, vl);

            vint32m2_t sumi0 = __riscv_vwadd_vv_i32m2(sum0, sum1, vl);
            vint32m2_t sumi1 = __riscv_vwadd_vv_i32m2(sum2, sum3, vl);
            suml2 = __riscv_vadd_vv_i32m2(__riscv_vwcvt_x_x_v_i32m2(sum4, vl), __riscv_vadd_vv_i32m2(sumi0, sumi1, vl), vl);
        }

        // Third loop: the qh bytes -> the final 16 weights, digits selected
        // per lane by the replicated `pow` table.
        vint32m2_t suml3;
        {
            const int vl = 16;

            uint32_t qh;
            memcpy(&qh, &x[i].qh[0], 4);
            // Prevent fusion with vmv.
            __asm__ __volatile__("" : "+r"(qh));
            // Replicate the 4 qh bytes across the 16 lanes.
            vuint8mf2_t tq = __riscv_vreinterpret_v_u32mf2_u8mf2(__riscv_vmv_v_x_u32mf2(qh, vl / 4));

            vuint8mf2_t p = __riscv_vle8_v_u8mf2(pow, vl);

            vuint16m1_t tq0 = __riscv_vsrl_vx_u16m1(__riscv_vwmulu_vx_u16m1(__riscv_vmul_vv_u8mf2(tq, p, vl), 3, vl), 8, vl);

            vint16m1_t q80 = __riscv_vwcvt_x_x_v_i16m1(__riscv_vle8_v_i8mf2(y[i].qs + 240, vl), vl);

            vint16m1_t sum0 = __riscv_vmul_vv_i16m1(__riscv_vreinterpret_v_u16m1_i16m1(__riscv_vsub_vx_u16m1(tq0, 1, vl)), q80, vl);
            suml3 = __riscv_vwcvt_x_x_v_i32m2(sum0, vl);
        }

        // Combine the three partial accumulators and reduce to a scalar.
        vint32m2_t sumb = __riscv_vadd_vv_i32m2(__riscv_vget_v_i32m4_i32m2(suml1, 0), __riscv_vget_v_i32m4_i32m2(suml1, 1), 16);
        sumb = __riscv_vadd_vv_i32m2(sumb, suml2, 16);
        sumb = __riscv_vadd_vv_i32m2(sumb, suml3, 16);

        vint32m1_t sum = __riscv_vredsum_vs_i32m2_i32m1(sumb, __riscv_vmv_v_x_i32m1(0, 1), 16);
        sumf += __riscv_vmv_x_s_i32m1_i32(sum) * y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
    }

    *s = sumf;
}
|
||||
|
||||
// Dispatch the tq1_0 x q8_K dot product: use the VLEN-256 specialized
// kernel when the runtime vector length matches, otherwise (or when RVV
// intrinsics are not compiled in) run the generic implementation.
void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
#if defined __riscv_v_intrinsic
    if (__riscv_vlenb() * 8 == 256) {
        ggml_vec_dot_tq1_0_q8_K_vl256(n, s, bs, vx, bx, vy, by, nrc);
    } else {
        ggml_vec_dot_tq1_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
    }
#else
    ggml_vec_dot_tq1_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
|
||||
|
||||
// Dot product of one tq2_0 (2-bit ternary) row with one q8_K row, tuned
// for VLEN == 256. Each packed byte carries four 2-bit fields; each field
// minus 1 gives a weight in {-1,0,1}, multiplied against the matching
// 32-byte q8 slice and accumulated with widening MACs.
static void ggml_vec_dot_tq2_0_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_tq2_0 * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;  // number of superblocks

    float sumf = 0.0f;
    for (int i = 0; i < nb; ++i) {
        int32_t sumi = 0;

        // 32 packed bytes -> 128 weights per iteration.
        for (size_t j = 0; j < sizeof(x[0].qs); j += 32) {
            // Four consecutive 32-byte q8 slices, one per 2-bit field l.
            const int8_t * py0 = &y[i].qs[j * 4 + 0 * 32];
            const int8_t * py1 = &y[i].qs[j * 4 + 1 * 32];
            const int8_t * py2 = &y[i].qs[j * 4 + 2 * 32];
            const int8_t * py3 = &y[i].qs[j * 4 + 3 * 32];
            const uint8_t* px = &x[i].qs[j];

            size_t vlmax_16m2 = __riscv_vsetvl_e16m2(32);
            vint16m2_t vacc16 = __riscv_vmv_v_x_i16m2(0, vlmax_16m2);

            size_t vl = __riscv_vsetvl_e8m1(32);

            vuint8m1_t vx_u8 = __riscv_vle8_v_u8m1(px, vl);

            vint8m1_t vy0 = __riscv_vle8_v_i8m1(py0 , vl);
            vint8m1_t vy1 = __riscv_vle8_v_i8m1(py1, vl);
            vint8m1_t vy2 = __riscv_vle8_v_i8m1(py2, vl);
            vint8m1_t vy3 = __riscv_vle8_v_i8m1(py3, vl);

            // l=0 (bits 1:0): field - 1 maps {0,1,2} -> {-1,0,1}
            vuint8m1_t t0 = __riscv_vand_vx_u8m1(vx_u8, 0x03, vl);
            vint8m1_t vq0 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(t0), 1, vl);

            // l=1 (bits 3:2)
            vuint8m1_t t1 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(vx_u8, 2, vl), 0x03, vl);
            vint8m1_t vq1 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(t1), 1, vl);

            // l=2 (bits 5:4)
            vuint8m1_t t2 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(vx_u8, 4, vl), 0x03, vl);
            vint8m1_t vq2 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(t2), 1, vl);

            // l=3 (bits 7:6)
            vuint8m1_t t3 = __riscv_vsrl_vx_u8m1(vx_u8, 6, vl); // No final AND needed as vsrl shifts in zeros
            vint8m1_t vq3 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(t3), 1, vl);

            // 4. Multiply and accumulate (widening int8*int8 -> int16)
            vacc16 = __riscv_vwmacc_vv_i16m2(vacc16, vq0, vy0, vl);
            vacc16 = __riscv_vwmacc_vv_i16m2(vacc16, vq1, vy1, vl);
            vacc16 = __riscv_vwmacc_vv_i16m2(vacc16, vq2, vy2, vl);
            vacc16 = __riscv_vwmacc_vv_i16m2(vacc16, vq3, vy3, vl);

            // Reduce the 16-bit accumulator into a 32-bit scalar.
            vlmax_16m2 = __riscv_vsetvl_e16m2(32);
            vint32m1_t vzero32 = __riscv_vmv_v_x_i32m1(0, 1);
            vint32m1_t vred32 = __riscv_vwredsum_vs_i16m2_i32m1(vacc16, vzero32, vlmax_16m2);

            sumi += __riscv_vmv_x_s_i32m1_i32(vred32);
        }
        // Per-superblock scale: q8_K scale times fp16 block scale.
        const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
        sumf += (float)sumi * d;
    }

    *s = sumf;
}
|
||||
|
||||
// Dispatch the tq2_0 x q8_K dot product: use the VLEN-256 specialized
// kernel when the runtime vector length matches, otherwise (or when RVV
// intrinsics are not compiled in) run the generic implementation.
void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
#if defined __riscv_v_intrinsic
    if (__riscv_vlenb() * 8 == 256) {
        ggml_vec_dot_tq2_0_q8_K_vl256(n, s, bs, vx, bx, vy, by, nrc);
    } else {
        ggml_vec_dot_tq2_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
    }
#else
    ggml_vec_dot_tq2_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
|
||||
|
||||
// Dot product of one iq1_s row with one q8_K row, tuned for VLEN == 256.
// Per superblock: build 11-bit grid indices from qs (8 bits) and qh
// (3 bits per sub-block), gather 8-byte entries from iq1s_grid, reduce
// grid*q8 per sub-block, then combine with the per-sub-block scales, the
// +/-1 delta selected by qh bit 15, and the q8_K bsums for the
// IQ1S_DELTA correction term.
//
// Fix vs. previous revision: the identity element for the two vredsum
// reductions was written as the float literal 0.0f and implicitly
// converted to int32_t by __riscv_vmv_v_x_i32m1; use the integer 0, as
// already done for `one_scalar` above.
static void ggml_vec_dot_iq1_s_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_iq1_s * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;  // number of superblocks

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        // Load qh once for the entire superblock.
        vuint16mf2_t qh = __riscv_vle16_v_u16mf2(x[i].qh, 8);

        // Calculate ls: per-sub-block scale 2*((qh >> 12) & 7) + 1.
        vuint16mf2_t temp = __riscv_vsrl_vx_u16mf2(qh, 12, 8);
        temp = __riscv_vand_vx_u16mf2(temp, 7, 8);
        vint32m1_t ls = __riscv_vreinterpret_v_u32m1_i32m1(__riscv_vwmulu_vx_u32m1(temp, 2, 8));
        ls = __riscv_vadd_vx_i32m1(ls, 1, 8);

        // Calculate delta: +1 when qh bit 15 is clear, -1 when set.
        vbool32_t mask = __riscv_vmseq_vx_u16mf2_b32(__riscv_vand_vx_u16mf2(qh, 0x8000, 8), 0, 8);
        vint32m1_t delta_neg = __riscv_vmv_v_x_i32m1(-1, 8);
        vint32m1_t delta_pos = __riscv_vmv_v_x_i32m1(1, 8);
        vint32m1_t delta = __riscv_vmerge_vvm_i32m1(delta_neg, delta_pos, mask, 8);

        // Load qs.
        vuint8m1_t qs = __riscv_vle8_v_u8m1(x[i].qs, 32);

        // Prepare the indices: lane k takes qh[k/4], shifts by {0,3,6,9}
        // (per the packed `shift` pattern), keeps 3 bits, and combines
        // them with qs as ((hi3 << 8) | qs) << 3 to form byte offsets
        // into the 8-byte-entry grid.
        const uint64_t shift = 0x0009000600030000;
        vuint16m2_t qh_shift = __riscv_vreinterpret_v_u64m2_u16m2(__riscv_vmv_v_x_u64m2(shift, 8));
        vuint16m2_t qh_gather_index = __riscv_vreinterpret_v_i16m2_u16m2(
            __riscv_vdiv_vx_i16m2(__riscv_vreinterpret_v_u16m2_i16m2(__riscv_vid_v_u16m2(32)), 4, 32));
        vuint16m2_t qh_ext = __riscv_vlmul_ext_v_u16m1_u16m2(__riscv_vlmul_ext_v_u16mf2_u16m1(qh));
        vuint16m2_t qh_index = __riscv_vrgather_vv_u16m2(qh_ext, qh_gather_index, 32);
        qh_index = __riscv_vsrl_vv_u16m2(qh_index, qh_shift, 32);
        qh_index = __riscv_vand_vx_u16m2(qh_index, 7, 32);
        qh_index = __riscv_vsll_vx_u16m2(qh_index, 8, 32);
        qh_index = __riscv_vor_vv_u16m2(qh_index, __riscv_vzext_vf2_u16m2(qs, 32), 32);
        vuint16m2_t index = __riscv_vsll_vx_u16m2(qh_index, 3, 32);

        // Final lsums: one 32-bit partial dot product per sub-block.
        int32_t lsums_s[8];
        vint32m1_t one_scalar = __riscv_vmv_v_x_i32m1(0, 1);

        // Sub-blocks 1-4
        {
            vuint16m1_t grid_index0 = __riscv_vget_v_u16m2_u16m1(index, 0);
            vint8m4_t grid0 = __riscv_vreinterpret_v_i64m4_i8m4(__riscv_vluxei16_v_i64m4((const int64_t*)iq1s_grid, grid_index0, 16));
            vint8m4_t q80 = __riscv_vle8_v_i8m4(y[i].qs, 128);
            vint16m8_t lsum0 = __riscv_vwmul_vv_i16m8(grid0, q80, 128);
            lsums_s[0] = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(lsum0, 0), one_scalar, 32));
            lsums_s[1] = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(lsum0, 1), one_scalar, 32));
            lsums_s[2] = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(lsum0, 2), one_scalar, 32));
            lsums_s[3] = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(lsum0, 3), one_scalar, 32));
        }
        // Compiler barrier between the two register-hungry scopes.
        __asm__ __volatile__("" ::: "memory");
        // Sub-blocks 5-8
        {
            vuint16m1_t grid_index1 = __riscv_vget_v_u16m2_u16m1(index, 1);
            vint8m4_t grid1 = __riscv_vreinterpret_v_i64m4_i8m4(__riscv_vluxei16_v_i64m4((const int64_t*)iq1s_grid, grid_index1, 16));
            vint8m4_t q81 = __riscv_vle8_v_i8m4(&y[i].qs[128], 128);
            vint16m8_t lsum1 = __riscv_vwmul_vv_i16m8(grid1, q81, 128);
            lsums_s[4] = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(lsum1, 0), one_scalar, 32));
            lsums_s[5] = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(lsum1, 1), one_scalar, 32));
            lsums_s[6] = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(lsum1, 2), one_scalar, 32));
            lsums_s[7] = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(lsum1, 3), one_scalar, 32));
        }
        __asm__ __volatile__("" ::: "memory");
        vint32m1_t lsums = __riscv_vle32_v_i32m1(&lsums_s[0], 8);

        // Calculate the bsums: add adjacent 16-bit pairs into 8 int32 lanes.
        vint16m1_t bsums_0 = __riscv_vle16_v_i16m1(y[i].bsums, 16);
        const vuint32m1_t bsums_i32 = __riscv_vreinterpret_v_u16m1_u32m1(__riscv_vreinterpret_v_i16m1_u16m1(bsums_0));
        const vint16mf2_t bsums_i32_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(bsums_i32, 0, 8));
        const vint16mf2_t bsums_i32_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(bsums_i32, 16, 8));
        const vint32m1_t bsums = __riscv_vwadd_vv_i32m1(bsums_i32_0, bsums_i32_1, 8);

        // Accumulation: main term ls*lsum, correction term ls*delta*bsum.
        vint32m1_t sumi_v = __riscv_vmul_vv_i32m1(ls, lsums, 8);
        vint32m1_t sumi1_v = __riscv_vmul_vv_i32m1(__riscv_vmul_vv_i32m1(ls, delta, 8), bsums, 8);

        // Update sumf.
        int sumi = __riscv_vmv_x_s_i32m1_i32(__riscv_vredsum_vs_i32m1_i32m1(sumi_v, __riscv_vmv_v_x_i32m1(0, 1), 8));
        int sumi1 = __riscv_vmv_x_s_i32m1_i32(__riscv_vredsum_vs_i32m1_i32m1(sumi1_v, __riscv_vmv_v_x_i32m1(0, 1), 8));
        sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
    }

    *s = sumf;
}
|
||||
|
||||
// Dispatch the iq1_s x q8_K dot product: use the VLEN-256 specialized
// kernel when the runtime vector length matches, otherwise (or when RVV
// intrinsics are not compiled in) run the generic implementation.
void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
#if defined __riscv_v_intrinsic
    if (__riscv_vlenb() * 8 == 256) {
        ggml_vec_dot_iq1_s_q8_K_vl256(n, s, bs, vx, bx, vy, by, nrc);
    } else {
        ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
    }
#else
    ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
|
||||
|
||||
// Dot product of one iq1_m row with one q8_K row, tuned for VLEN == 256.
// The fp16 superblock scale is reassembled from nibbles spread over the
// four packed scale words. Per 128-weight chunk: grid indices are built
// from qs plus 3 high bits taken from qh nibbles, 8-byte entries are
// gathered from iq1s_grid, and two accumulators are kept — acc1 for the
// grid term and acc2 for the per-group +/-1 delta term (selected by qh
// bits 3/7) that is later weighted by IQ1M_DELTA.
static void ggml_vec_dot_iq1_m_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_iq1_m * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;  // number of superblocks

    iq1m_scale_t scale;
    float sumf = 0.0f;
    for (int i = 0; i < nb; ++i) {
        const int8_t * q8 = y[i].qs;
        const uint8_t * qs = x[i].qs;
        const uint8_t * qh = x[i].qh;
        const uint16_t * sc = (const uint16_t *)x[i].scales;

        // Reassemble the fp16 superblock scale from the top nibbles of
        // the four packed scale words.
        scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);

        // Accumulators: acc1 = grid term, acc2 = delta term.
        vint32m2_t acc1 = __riscv_vmv_v_x_i32m2(0, 16);
        vint32m2_t acc2 = __riscv_vmv_v_x_i32m2(0, 16);

        // We process 4 sub-blocks together.
        for (int ib = 0; ib < QK_K/128; ib++) {
            // Load qh for 4 sub-blocks and duplicate each byte into both
            // halves of a 16-bit lane so low/high nibbles can be shifted
            // independently below.
            const vuint8mf4_t qh_8 = __riscv_vle8_v_u8mf4(qh, 8);
            const vuint16mf2_t qh_16_lo = __riscv_vzext_vf2_u16mf2(qh_8, 8)<
            const vuint16mf2_t qh_16_hi = __riscv_vsll_vx_u16mf2(qh_16_lo, 8, 8);
            const vuint16m1_t qhb = __riscv_vzext_vf2_u16m1(
                __riscv_vreinterpret_v_u16mf2_u8mf2(__riscv_vor_vv_u16mf2(qh_16_lo, qh_16_hi, 8)), 16);
            qh += 8;

            // Prepare grid indices: ((qh nibble's low 3 bits) << 8) | qs,
            // alternating shifts 8/4 select low/high nibble per lane.
            const vuint16m1_t qsb = __riscv_vzext_vf2_u16m1(__riscv_vle8_v_u8mf2(&qs[0], 16), 16);
            const vuint16m1_t shift = __riscv_vreinterpret_v_u32m1_u16m1(__riscv_vmv_v_x_u32m1(0x00040008, 8));
            vuint16m1_t index = __riscv_vor_vv_u16m1(qsb, __riscv_vand_vx_u16m1(__riscv_vsll_vv_u16m1(qhb, shift, 16), 0x700, 16), 16);
            index = __riscv_vsll_vx_u16m1(index, 3, 16);  // byte offsets (8-byte entries)
            qs += 16;

            // Load the grid.
            const vint8m4_t iq1b = __riscv_vreinterpret_v_i64m4_i8m4(__riscv_vreinterpret_v_u64m4_i64m4(
                __riscv_vluxei16_v_u64m4(iq1s_grid, index, 16)));

            // Prepare the deltas: test qh bit 3 (low nibble) / bit 7
            // (high nibble); set lanes get -1 per byte, clear lanes +1.
            const vbool16_t mask = __riscv_vmsgtu_vx_u16m1_b16(
                __riscv_vand_vv_u16m1(qhb, __riscv_vreinterpret_v_u32m1_u16m1(__riscv_vmv_v_x_u32m1(0x00800008, 8)), 16), 0, 16);
            const vint64m4_t delta_pos = __riscv_vmv_v_x_i64m4(0x0101010101010101, 16);
            const vint64m4_t delta_neg = __riscv_vmv_v_x_i64m4(0xffffffffffffffff, 16);
            const vint8m4_t delta = __riscv_vreinterpret_v_i64m4_i8m4(
                __riscv_vmerge_vvm_i64m4(delta_pos, delta_neg, mask, 16));

            // Load q8 for sub-blocks.
            const vint8m4_t q8b = __riscv_vle8_v_i8m4(q8, 128);
            q8 += 128;

            // Calculate the lsums (widening int8 products).
            const vint16m8_t lsum1 = __riscv_vwmul_vv_i16m8(iq1b, q8b, 128);
            const vint16m8_t lsum2 = __riscv_vwmul_vv_i16m8(delta, q8b, 128);

            // Prepare the scales: 2*(3-bit field) + 1 for each half
            // sub-block, unpacked from the two 16-bit scale words.
            const int16_t ls_0_0 = 2*((sc[0] >> 0) & 0x7) + 1;
            const int16_t ls_0_1 = 2*((sc[0] >> 3) & 0x7) + 1;
            const int16_t ls_1_0 = 2*((sc[0] >> 6) & 0x7) + 1;
            const int16_t ls_1_1 = 2*((sc[0] >> 9) & 0x7) + 1;
            const int16_t ls_2_0 = 2*((sc[1] >> 0) & 0x7) + 1;
            const int16_t ls_2_1 = 2*((sc[1] >> 3) & 0x7) + 1;
            const int16_t ls_3_0 = 2*((sc[1] >> 6) & 0x7) + 1;
            const int16_t ls_3_1 = 2*((sc[1] >> 9) & 0x7) + 1;
            sc += 2;

            // Accumulate in acc0 and acc1 for each sub-block.
            acc1 = __riscv_vwmacc_vx_i32m2(acc1, ls_0_0, __riscv_vget_v_i16m8_i16m1(lsum1, 0), 16);
            acc1 = __riscv_vwmacc_vx_i32m2(acc1, ls_0_1, __riscv_vget_v_i16m8_i16m1(lsum1, 1), 16);
            acc2 = __riscv_vwmacc_vx_i32m2(acc2, ls_0_0, __riscv_vget_v_i16m8_i16m1(lsum2, 0), 16);
            acc2 = __riscv_vwmacc_vx_i32m2(acc2, ls_0_1, __riscv_vget_v_i16m8_i16m1(lsum2, 1), 16);
            //
            acc1 = __riscv_vwmacc_vx_i32m2(acc1, ls_1_0, __riscv_vget_v_i16m8_i16m1(lsum1, 2), 16);
            acc1 = __riscv_vwmacc_vx_i32m2(acc1, ls_1_1, __riscv_vget_v_i16m8_i16m1(lsum1, 3), 16);
            acc2 = __riscv_vwmacc_vx_i32m2(acc2, ls_1_0, __riscv_vget_v_i16m8_i16m1(lsum2, 2), 16);
            acc2 = __riscv_vwmacc_vx_i32m2(acc2, ls_1_1, __riscv_vget_v_i16m8_i16m1(lsum2, 3), 16);
            //
            acc1 = __riscv_vwmacc_vx_i32m2(acc1, ls_2_0, __riscv_vget_v_i16m8_i16m1(lsum1, 4), 16);
            acc1 = __riscv_vwmacc_vx_i32m2(acc1, ls_2_1, __riscv_vget_v_i16m8_i16m1(lsum1, 5), 16);
            acc2 = __riscv_vwmacc_vx_i32m2(acc2, ls_2_0, __riscv_vget_v_i16m8_i16m1(lsum2, 4), 16);
            acc2 = __riscv_vwmacc_vx_i32m2(acc2, ls_2_1, __riscv_vget_v_i16m8_i16m1(lsum2, 5), 16);
            //
            acc1 = __riscv_vwmacc_vx_i32m2(acc1, ls_3_0, __riscv_vget_v_i16m8_i16m1(lsum1, 6), 16);
            acc1 = __riscv_vwmacc_vx_i32m2(acc1, ls_3_1, __riscv_vget_v_i16m8_i16m1(lsum1, 7), 16);
            acc2 = __riscv_vwmacc_vx_i32m2(acc2, ls_3_0, __riscv_vget_v_i16m8_i16m1(lsum2, 6), 16);
            acc2 = __riscv_vwmacc_vx_i32m2(acc2, ls_3_1, __riscv_vget_v_i16m8_i16m1(lsum2, 7), 16);
        }

        // Reduce and accumulate in `sumf`.
        vint32m1_t one = __riscv_vmv_v_x_i32m1(0, 1);
        int sumi1 = __riscv_vmv_x_s_i32m1_i32(__riscv_vredsum_vs_i32m2_i32m1(acc1, one, 16));
        int sumi2 = __riscv_vmv_x_s_i32m1_i32(__riscv_vredsum_vs_i32m2_i32m1(acc2, one, 16));
        sumf += y[i].d * GGML_CPU_FP16_TO_FP32(scale.f16) * (sumi1 + IQ1M_DELTA * sumi2);
    }

    *s = sumf;
}
|
||||
|
||||
void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
#if defined __riscv_v_intrinsic
|
||||
switch (__riscv_vlenb() * 8) {
|
||||
case 256:
|
||||
ggml_vec_dot_iq1_m_q8_K_vl256(n, s, bs, vx, bx, vy, by, nrc);
|
||||
break;
|
||||
default:
|
||||
ggml_vec_dot_iq1_m_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
break;
|
||||
}
|
||||
#else
|
||||
ggml_vec_dot_iq1_m_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -1,333 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
typedef vector unsigned char vec_t;
|
||||
typedef __vector_quad acc_t;
|
||||
|
||||
template <typename TA>
|
||||
class tinyBLAS_Q0_PPC {
|
||||
public:
|
||||
tinyBLAS_Q0_PPC(int64_t k,
|
||||
const TA *A, int64_t lda,
|
||||
const block_q8_0 *B, int64_t ldb,
|
||||
float *C, int64_t ldc,
|
||||
int ith, int nth);
|
||||
|
||||
void matmul(int64_t m, int64_t n);
|
||||
void matmul_tiled_q0(int64_t m, int64_t n, int64_t mc, int64_t nc, int64_t kc) {
|
||||
vec_t A_pack[mc*kc*2];
|
||||
vec_t B_pack[nc*kc*2];
|
||||
int comparray[mc*kc];
|
||||
constexpr bool is_Ablock_q4 = std::is_same_v<TA, block_q4_0>;
|
||||
int64_t ytiles = m / mc;
|
||||
int64_t xtiles = n / nc;
|
||||
int64_t tiles = xtiles * ytiles;
|
||||
int64_t duty = (tiles + nth - 1) / nth;
|
||||
int64_t start = duty * ith;
|
||||
int64_t end = start + duty;
|
||||
if (end > tiles) {
|
||||
end = tiles;
|
||||
}
|
||||
for (int64_t job = start; job < end; ++job) {
|
||||
int64_t ii = (job / xtiles) * mc;
|
||||
int64_t jj = (job % xtiles) * nc;
|
||||
for (int64_t kk = 0; kk < k; kk += kc) {
|
||||
if constexpr(is_Ablock_q4) {
|
||||
packNormalInt4_large(A + ii*lda + kk, lda, mc, 4, (int8_t*)A_pack, comparray);
|
||||
} else {
|
||||
packNormal_large<int8_t, vector signed char>(A + ii*lda + kk, lda, mc, 8, (int8_t*)A_pack, false, comparray);
|
||||
}
|
||||
packNormal_large<uint8_t, vector unsigned char>(B + jj*ldb + kk, ldb, nc, 8, (uint8_t*)B_pack, true);
|
||||
KERNEL_Q0(ii, jj, mc, nc, kc, kk, A_pack, B_pack, comparray);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
inline void save_res(int ii, int jj, int idx, vector float* fin_res, int RM=4, int RN=4) {
|
||||
for (int I = 0; I < RM; I++) {
|
||||
for (int J = 0; J < RN; J++) {
|
||||
*((float*)(C+ii+((jj+J)*ldc)+I)) = *((float*)&fin_res[idx+I]+J);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void add_save_res(int ii, int jj, int idx, vector float* fin_res, int RM=4, int RN=4) {
|
||||
for (int I = 0; I < RM; I++) {
|
||||
for (int J = 0; J < RN; J++) {
|
||||
float * c_ptr = (float *)(C+ii+((jj+J)*ldc)+I);
|
||||
*c_ptr += *((float*)&fin_res[idx+I]+J);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename ArrayType>
|
||||
inline void compute(acc_t* ACC, int c_idx, int s_idx, ArrayType& comparray, vector float* vs, vector float* fin_res) {
|
||||
vector signed int vec_C[4];
|
||||
vector float CA[4] = {0};
|
||||
vector float res[4] = {0};
|
||||
__builtin_mma_disassemble_acc(vec_C, ACC);
|
||||
for (int i = 0; i < 4; i++) {
|
||||
CA[i] = vec_splats((float)(((double)comparray[c_idx+i]) * -128.0));
|
||||
res[i] = vec_add(vec_ctf(vec_C[i], 0), CA[i]);
|
||||
fin_res[s_idx+i] = vec_madd(res[i], vs[s_idx+i], fin_res[s_idx+i]);
|
||||
}
|
||||
}
|
||||
|
||||
inline void process_q4_elements(vector signed char (&c)[2], int* ca) {
|
||||
const vector signed char lowMask = vec_splats((signed char)0xF);
|
||||
const vector unsigned char v4 = vec_splats((unsigned char)0x4);
|
||||
const vector signed char v8 = vec_splats((signed char)0x8);
|
||||
vector signed int vsum = {0};
|
||||
vector signed int vsum2 = {0};
|
||||
c[0] = vec_and(c[1], lowMask);
|
||||
c[1] = vec_sr(c[1], v4);
|
||||
c[0] = vec_sub(c[0], v8);
|
||||
c[1] = vec_sub(c[1], v8);
|
||||
vsum = vec_sum4s(c[0], vsum);
|
||||
vsum2 = vec_sum4s(c[1], vsum2);
|
||||
vsum = vec_add(vsum, vsum2);
|
||||
*(ca) = vsum[0] + vsum[1] + vsum[2] + vsum[3];
|
||||
}
|
||||
|
||||
template <typename V1, typename V2>
|
||||
inline void vector_permute_store(V2 &s1, V2 &s2, V2 &s3, V2 &s4, V1 *vecOffset, bool flip) {
|
||||
vector unsigned char swiz1 = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
|
||||
vector unsigned char swiz2 = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
|
||||
vector unsigned char swiz3 = {0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27};
|
||||
vector unsigned char swiz4 = {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31};
|
||||
V2 t1, t2, t3, t4, t5, t6, t7, t8;
|
||||
vector unsigned char xor_vector;
|
||||
uint8_t flip_vec = 0x80;
|
||||
xor_vector = vec_splats(flip_vec);
|
||||
t1 = vec_perm(s1, s2, swiz1);
|
||||
t2 = vec_perm(s1, s2, swiz2);
|
||||
t3 = vec_perm(s3, s4, swiz1);
|
||||
t4 = vec_perm(s3, s4, swiz2);
|
||||
t5 = vec_perm(t1, t3, swiz3);
|
||||
t6 = vec_perm(t1, t3, swiz4);
|
||||
t7 = vec_perm(t2, t4, swiz3);
|
||||
t8 = vec_perm(t2, t4, swiz4);
|
||||
if (flip == true) {
|
||||
t5 = vec_xor(t5, xor_vector);
|
||||
t6 = vec_xor(t6, xor_vector);
|
||||
t7 = vec_xor(t7, xor_vector);
|
||||
t8 = vec_xor(t8, xor_vector);
|
||||
}
|
||||
vec_xst(t5, 0, vecOffset);
|
||||
vec_xst(t6, 0, vecOffset+16);
|
||||
vec_xst(t7, 0, vecOffset+32);
|
||||
vec_xst(t8, 0, vecOffset+48);
|
||||
}
|
||||
|
||||
template<int RM, int RN>
|
||||
inline void kernel(int64_t ii, int64_t jj) {
|
||||
if constexpr(RM == 4 && RN == 8) {
|
||||
KERNEL_4x8(ii,jj);
|
||||
} else if constexpr(RM == 8 && RN == 4) {
|
||||
KERNEL_8x4(ii,jj);
|
||||
} else if constexpr(RM == 8 && RN == 8) {
|
||||
KERNEL_8x8(ii,jj);
|
||||
} else {
|
||||
assert(false && "RN/RM values not supported");
|
||||
}
|
||||
}
|
||||
template<int size>
|
||||
void packNormalInt4(const TA* a, int64_t lda, int rows, int cols, int8_t* vec, std::array<int, size>& comparray);
|
||||
template<typename VA, typename VB>
|
||||
void packNormal(const block_q8_0* a, int64_t lda, int rows, int cols, VA* vec, bool flip);
|
||||
void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n);
|
||||
void KERNEL_4x8(int64_t ii, int64_t jj);
|
||||
void KERNEL_8x4(int64_t ii, int64_t jj);
|
||||
void KERNEL_8x8(int64_t ii, int64_t jj);
|
||||
void gemm_small(int64_t m0, int64_t m, int64_t n0, int64_t n, int RM, int RN);
|
||||
template <int RM, int RN>
|
||||
void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n);
|
||||
|
||||
void compute_scale(int64_t ii, int64_t jj, int blk, vector float* vs){
|
||||
for (int I = 0; I<8; I++) {
|
||||
float a_scale = unhalf((A+((ii+I)*lda)+blk)->d);
|
||||
for (int J = 0; J<4; J++) {
|
||||
*((float*)&vs[I]+J) = (a_scale * unhalf((B+((jj+J)*ldb)+blk)->d));
|
||||
*((float*)&vs[I+8]+J) = (a_scale * unhalf((B+((jj+J+4)*ldb)+blk)->d));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void process_q8_elements(const int8_t *qs, int *ca) {
|
||||
vector signed char c1 = vec_xl(0, qs);
|
||||
vector signed char c2 = vec_xl(16, qs);
|
||||
vector signed int vsum1 = {0};
|
||||
vector signed int vsum2 = {0};
|
||||
vsum1 = vec_sum4s(c1, vsum1);
|
||||
vsum2 = vec_sum4s(c2, vsum2);
|
||||
vector signed int vsum = vec_add(vsum1, vsum2);
|
||||
*ca = vsum[0] + vsum[1] + vsum[2] + vsum[3];
|
||||
}
|
||||
|
||||
template<typename VA, typename VB>
|
||||
void packNormal_large(const block_q8_0* a, int64_t lda, int rows, int cols, VA* vec, bool flip, int* comparray=nullptr) {
|
||||
int64_t i, j;
|
||||
block_q8_0 *aoffset = NULL;
|
||||
VA *vecOffset = NULL;
|
||||
block_q8_0* aoffsets[8];
|
||||
__vector_pair arr[8];
|
||||
VB c[8][2] = {0};
|
||||
VB c1[8] = {0}; VB c2[8] = {0};
|
||||
aoffset = const_cast<block_q8_0*>(a);
|
||||
vecOffset = vec;
|
||||
j = (rows >> 3);
|
||||
int index = 0;
|
||||
if (j > 0) {
|
||||
do {
|
||||
for (int it = 0; it < 8; it++)
|
||||
aoffsets[it] = aoffset + it*lda;
|
||||
aoffset += 8 * lda;
|
||||
for (int blk = 0; blk < kc; blk++) {
|
||||
for (int it = 0; it < 8; it++) {
|
||||
arr[it] = __builtin_vsx_lxvp(0, (__vector_pair*)(aoffsets[it]+blk)->qs);
|
||||
__builtin_vsx_disassemble_pair(c[it], &arr[it]);
|
||||
c1[it] = c[it][0];
|
||||
c2[it] = c[it][1];
|
||||
if (comparray){
|
||||
process_q8_elements((aoffsets[it]+ blk)->qs, &comparray[index + 8*blk + it]);
|
||||
}
|
||||
}
|
||||
vector_permute_store<VA, VB>(c1[0], c1[1], c1[2], c1[3], vecOffset, flip);
|
||||
vector_permute_store<VA, VB>(c2[0], c2[1], c2[2], c2[3], vecOffset+64, flip);
|
||||
vector_permute_store<VA, VB>(c1[4], c1[5], c1[6], c1[7], vecOffset+128, flip);
|
||||
vector_permute_store<VA, VB>(c2[4], c2[5], c2[6], c2[7], vecOffset+192, flip);
|
||||
vecOffset += 256;
|
||||
}
|
||||
j--;
|
||||
index += 8*kc;
|
||||
} while(j > 0);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void packNormalInt4_large(const TA* a, int64_t lda, int rows, int cols, int8_t* vec, int*comparray) {
|
||||
int64_t i, j;
|
||||
TA *aoffset = NULL;
|
||||
int8_t *vecOffset = NULL;
|
||||
TA *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL;
|
||||
TA *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL;
|
||||
vector signed char c1[2] = {0}, c2[2] = {0}, c3[2] = {0}, c4[2] = {0};
|
||||
vector signed char c5[2] = {0}, c6[2] = {0}, c7[2] = {0}, c8[2] = {0};
|
||||
aoffset = const_cast<TA*>(a);
|
||||
vecOffset = vec;
|
||||
int index = 0;
|
||||
j = (rows >> 3);
|
||||
if (j > 0) {
|
||||
do {
|
||||
aoffset1 = aoffset;
|
||||
aoffset2 = aoffset1 + lda;
|
||||
aoffset3 = aoffset2 + lda;
|
||||
aoffset4 = aoffset3 + lda;
|
||||
aoffset5 = aoffset4 + lda;
|
||||
aoffset6 = aoffset5 + lda;
|
||||
aoffset7 = aoffset6 + lda;
|
||||
aoffset8 = aoffset7 + lda;
|
||||
aoffset += 8 * lda;
|
||||
for (int blk = 0; blk < kc; blk++) {
|
||||
c1[1] = reinterpret_cast<vector signed char>(vec_xl(0, (aoffset1+blk)->qs));
|
||||
c2[1] = reinterpret_cast<vector signed char>(vec_xl(0, (aoffset2+blk)->qs));
|
||||
c3[1] = reinterpret_cast<vector signed char>(vec_xl(0, (aoffset3+blk)->qs));
|
||||
c4[1] = reinterpret_cast<vector signed char>(vec_xl(0, (aoffset4+blk)->qs));
|
||||
c5[1] = reinterpret_cast<vector signed char>(vec_xl(0, (aoffset5+blk)->qs));
|
||||
c6[1] = reinterpret_cast<vector signed char>(vec_xl(0, (aoffset6+blk)->qs));
|
||||
c7[1] = reinterpret_cast<vector signed char>(vec_xl(0, (aoffset7+blk)->qs));
|
||||
c8[1] = reinterpret_cast<vector signed char>(vec_xl(0, (aoffset8+blk)->qs));
|
||||
|
||||
process_q4_elements(c1, &comparray[index + 8*blk+0]);
|
||||
process_q4_elements(c2, &comparray[index + 8*blk+1]);
|
||||
process_q4_elements(c3, &comparray[index + 8*blk+2]);
|
||||
process_q4_elements(c4, &comparray[index + 8*blk+3]);
|
||||
process_q4_elements(c5, &comparray[index + 8*blk+4]);
|
||||
process_q4_elements(c6, &comparray[index + 8*blk+5]);
|
||||
process_q4_elements(c7, &comparray[index + 8*blk+6]);
|
||||
process_q4_elements(c8, &comparray[index + 8*blk+7]);
|
||||
vector_permute_store<int8_t, vector signed char>(c1[0], c2[0], c3[0], c4[0], vecOffset, false);
|
||||
vector_permute_store<int8_t, vector signed char>(c1[1], c2[1], c3[1], c4[1], vecOffset+64, false);
|
||||
vector_permute_store<int8_t, vector signed char>(c5[0], c6[0], c7[0], c8[0], vecOffset+128, false);
|
||||
vector_permute_store<int8_t, vector signed char>(c5[1], c6[1], c7[1], c8[1], vecOffset+192, false);
|
||||
vecOffset += 256;
|
||||
}
|
||||
j--;
|
||||
index += 8*kc;
|
||||
} while (j > 0);
|
||||
}
|
||||
}
|
||||
|
||||
void KERNEL_Q0(int64_t ii, int64_t jj, int64_t mc, int64_t nc, int64_t kc, int64_t l, vec_t *vec_A, vec_t *vec_B, int *comparray) {
|
||||
acc_t acc[8];
|
||||
for (int i = 0; i < mc ; i += 8) {
|
||||
for (int j = 0; j < nc; j += 8) {
|
||||
vector float fin_res[16] = {0};
|
||||
vector float vs[16] = {0};
|
||||
for (int64_t kk = 0; kk < kc; kk+=2) {
|
||||
for (int x = 0; x < 8; x++) {
|
||||
__builtin_mma_xxsetaccz(&acc[x]);
|
||||
}
|
||||
int A_block_idx = (i/8)*(16*kc) + kk*16;
|
||||
int B_block_idx = (j/8)*(16*kc)+ kk*16;
|
||||
vec_t *A_block = &vec_A[A_block_idx];
|
||||
vec_t *B_block = &vec_B[B_block_idx];
|
||||
for (int x = 0; x < 8; x++) {
|
||||
__builtin_mma_xvi8ger4pp(&acc[0], A_block[x], B_block[x]);
|
||||
__builtin_mma_xvi8ger4pp(&acc[1], A_block[x + 8], B_block[x]);
|
||||
__builtin_mma_xvi8ger4pp(&acc[2], A_block[x], B_block[x+8]);
|
||||
__builtin_mma_xvi8ger4pp(&acc[3], A_block[x+8], B_block[x+8]);
|
||||
}
|
||||
compute_scale(ii+i, jj+j, l+kk, vs);
|
||||
int c_index = (i/8)*(8*kc)+ kk*8;
|
||||
int* c_block = &comparray[c_index];
|
||||
compute(&acc[0], 0, 0, c_block, vs, fin_res);
|
||||
compute(&acc[1], 4, 4, c_block, vs, fin_res);
|
||||
compute(&acc[2], 0, 8, c_block, vs, fin_res);
|
||||
compute(&acc[3], 4, 12, c_block, vs, fin_res);
|
||||
|
||||
A_block_idx = (i/8)*(16*kc) + (kk+1)*16;
|
||||
B_block_idx = (j/8)*(16*kc)+ (kk+1)*16;
|
||||
A_block = &vec_A[A_block_idx];
|
||||
B_block = &vec_B[B_block_idx];
|
||||
for (int x = 0; x < 8; x++) {
|
||||
__builtin_mma_xvi8ger4pp(&acc[4], A_block[x], B_block[x]);
|
||||
__builtin_mma_xvi8ger4pp(&acc[5], A_block[x + 8], B_block[x]);
|
||||
__builtin_mma_xvi8ger4pp(&acc[6], A_block[x], B_block[x+8]);
|
||||
__builtin_mma_xvi8ger4pp(&acc[7], A_block[x+8], B_block[x+8]);
|
||||
}
|
||||
compute_scale(ii+i, jj+j, l+kk+1, vs);
|
||||
c_index = (i/8)*(8*kc)+ (kk+1)*8;
|
||||
c_block = &comparray[c_index];
|
||||
compute(&acc[4], 0, 0, c_block, vs, fin_res);
|
||||
compute(&acc[5], 4, 4, c_block, vs, fin_res);
|
||||
compute(&acc[6], 0, 8, c_block, vs, fin_res);
|
||||
compute(&acc[7], 4, 12, c_block, vs, fin_res);
|
||||
|
||||
}
|
||||
if (l == 0) {
|
||||
save_res(ii+i, jj+j, 0, fin_res);
|
||||
save_res(ii+i+4, jj+j, 4, fin_res);
|
||||
save_res(ii+i, jj+j+4, 8, fin_res);
|
||||
save_res(ii+i+4, jj+j+4, 12, fin_res);
|
||||
} else {
|
||||
add_save_res(ii+i, jj+j, 0, fin_res);
|
||||
add_save_res(ii+i+4, jj+j, 4, fin_res);
|
||||
add_save_res(ii+i, jj+j+4, 8, fin_res);
|
||||
add_save_res(ii+i+4, jj+j+4, 12, fin_res);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const TA *const A;
|
||||
const block_q8_0 *const B;
|
||||
float *C;
|
||||
const int64_t k;
|
||||
int64_t kc;
|
||||
const int64_t lda;
|
||||
const int64_t ldb;
|
||||
const int64_t ldc;
|
||||
const int ith;
|
||||
const int nth;
|
||||
};
|
||||
@@ -121,7 +121,8 @@ inline float32x4_t mul(float32x4_t x, float32x4_t y) { return vec_mul(x, y); }
|
||||
#endif
|
||||
|
||||
#if defined(__MMA__)
|
||||
#include "sgemm-ppc.h"
|
||||
typedef vector unsigned char vec_t;
|
||||
typedef __vector_quad acc_t;
|
||||
#endif
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// VECTORIZED FUSED MULTIPLY ADD
|
||||
@@ -2153,7 +2154,7 @@ class tinyBLAS_HP16_PPC {
|
||||
packNormal((B+(jj*ldb)+l), ldb, 8, 4, (uint8_t*)vec_B);
|
||||
for (int x = 0; x < 4; x++) {
|
||||
mma_instr<TA>::outer_product(&acc_0, vec_A[x], vec_B[x]);
|
||||
mma_instr<TA>::outer_product(&acc_1, vec_A[x], vec_B[x+4]);
|
||||
mma_instr<TA>::outer_product(&acc_1, vec_A[x+4], vec_B[x]);
|
||||
}
|
||||
}
|
||||
SAVE_ACC(&acc_0, ii, jj);
|
||||
@@ -2301,43 +2302,299 @@ class tinyBLAS_HP16_PPC {
|
||||
const int nth;
|
||||
};
|
||||
|
||||
template <typename TA>
|
||||
tinyBLAS_Q0_PPC<TA>::tinyBLAS_Q0_PPC(int64_t k,
|
||||
const TA *A, int64_t lda,
|
||||
const block_q8_0 *B, int64_t ldb,
|
||||
float *C, int64_t ldc,
|
||||
int ith, int nth)
|
||||
template <typename TA>
|
||||
class tinyBLAS_Q0_PPC {
|
||||
public:
|
||||
tinyBLAS_Q0_PPC(int64_t k,
|
||||
const TA * A, int64_t lda,
|
||||
const block_q8_0 * B, int64_t ldb,
|
||||
float * C, int64_t ldc,
|
||||
int ith, int nth)
|
||||
: A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
|
||||
kc = 64;
|
||||
}
|
||||
|
||||
template<typename TA>
|
||||
void tinyBLAS_Q0_PPC<TA>::matmul(int64_t m, int64_t n) {
|
||||
int mc = 64; int nc = 64;
|
||||
if (n % 8 == 0 && n < nc) {
|
||||
nc = n;
|
||||
mc = 32 ;
|
||||
kc = 32;
|
||||
void matmul(int64_t m, int64_t n) {
|
||||
const int64_t mc = 64;
|
||||
const int64_t kc = 64;
|
||||
int64_t nc = 64;
|
||||
int64_t n_aligned = 0;
|
||||
if (n % 64 == 0) {
|
||||
n_aligned = n;
|
||||
} else if (n == 4) {
|
||||
n_aligned = 4;
|
||||
} else if (n < 64) {
|
||||
n_aligned = (n / 8) * 8;
|
||||
} else {
|
||||
n_aligned = (n / 64) * 64;
|
||||
}
|
||||
const bool is_aligned = ((m & (mc - 1)) == 0) & ((n & (nc - 1)) == 0) & ((k & (kc - 1)) == 0);
|
||||
if (is_aligned) {
|
||||
this->matmul_tiled_q0(m, n, mc, nc, kc);
|
||||
|
||||
if (n_aligned > 0) {
|
||||
if (n_aligned % 64 == 0) nc = 64;
|
||||
else if (n_aligned == n) nc = n;
|
||||
else if (n_aligned % 32 == 0) nc = 32;
|
||||
else if (n_aligned % 24 == 0) nc = 24;
|
||||
else if (n_aligned % 16 == 0) nc = 16;
|
||||
else nc = 8;
|
||||
}
|
||||
bool can_use_tiled = n_aligned > 0 && (m % mc == 0) && (k % kc == 0);
|
||||
if (can_use_tiled) {
|
||||
matmul_tiled(m, n_aligned, mc, nc, kc);
|
||||
if (n > n_aligned) {
|
||||
mnpack(0, m, n_aligned, n);
|
||||
}
|
||||
} else {
|
||||
mnpack(0, m, 0, n);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename TA>
|
||||
template<int size>
|
||||
void tinyBLAS_Q0_PPC<TA>::packNormalInt4(const TA* a, int64_t lda, int rows, int cols, int8_t* vec, std::array<int, size>& comparray) {
|
||||
private:
|
||||
inline void save_res(int ii, int jj, int idx, vector float * fin_res, int RM = 4, int RN = 4) {
|
||||
for (int I = 0; I < RM; I++) {
|
||||
for (int J = 0; J < RN; J++) {
|
||||
*((float *)(C + ii + ((jj + J) * ldc) + I)) = *((float *)&fin_res[idx + I] + J);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void save_acc(acc_t * ACC, int64_t ii, int64_t jj) {
|
||||
vec_t vec_C[4];
|
||||
__builtin_mma_disassemble_acc(vec_C, ACC);
|
||||
for (int I = 0; I < 4; I++) {
|
||||
for (int J = 0; J < 4; J++) {
|
||||
*((float *)(C + ii + ((jj + J) * ldc) + I)) = *((float *)&vec_C[I] + J);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void add_save_acc(acc_t * ACC, int64_t ii, int64_t jj) {
|
||||
vec_t vec_C[4];
|
||||
__builtin_mma_disassemble_acc(vec_C, ACC);
|
||||
for (int I = 0; I < 4; I++) {
|
||||
for (int J = 0; J < 4; J++) {
|
||||
float * c_ptr = (float *)(C + ii+ ((jj + J) * ldc) + I);
|
||||
*c_ptr += *((float *)&vec_C[I] + J);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename ArrayType>
|
||||
inline void compute(acc_t * ACC, int c_idx, int s_idx, ArrayType & comparray, vector float * vs, vector float * fin_res) {
|
||||
vector signed int vec_C[4];
|
||||
vector float CA[4] = {0};
|
||||
vector float res[4] = {0};
|
||||
__builtin_mma_disassemble_acc(vec_C, ACC);
|
||||
for (int i = 0; i < 4; i++) {
|
||||
CA[i] = vec_splats((float)(((double)comparray[c_idx + i]) * -128.0));
|
||||
res[i] = vec_add(vec_ctf(vec_C[i], 0), CA[i]);
|
||||
fin_res[s_idx + i] = vec_madd(res[i], vs[s_idx + i], fin_res[s_idx + i]);
|
||||
}
|
||||
}
|
||||
|
||||
inline void process_q4_elements(vector signed char (&c)[2], int * ca) {
|
||||
const vector signed char lowMask = vec_splats((signed char)0xF);
|
||||
const vector unsigned char v4 = vec_splats((unsigned char)0x4);
|
||||
const vector signed char v8 = vec_splats((signed char)0x8);
|
||||
vector signed int vsum = {0};
|
||||
vector signed int vsum2 = {0};
|
||||
c[0] = vec_and(c[1], lowMask);
|
||||
c[1] = vec_sr(c[1], v4);
|
||||
c[0] = vec_sub(c[0], v8);
|
||||
c[1] = vec_sub(c[1], v8);
|
||||
vsum = vec_sum4s(c[0], vsum);
|
||||
vsum2 = vec_sum4s(c[1], vsum2);
|
||||
vsum = vec_add(vsum, vsum2);
|
||||
*(ca) = vsum[0] + vsum[1] + vsum[2] + vsum[3];
|
||||
}
|
||||
|
||||
template <typename V1, typename V2>
|
||||
inline void vector_permute_store(V2 & s1, V2 & s2, V2 & s3, V2 & s4, V1 * vecOffset, bool flip) {
|
||||
vector unsigned char swiz1 = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
|
||||
vector unsigned char swiz2 = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
|
||||
vector unsigned char swiz3 = {0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27};
|
||||
vector unsigned char swiz4 = {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31};
|
||||
V2 t1, t2, t3, t4, t5, t6, t7, t8;
|
||||
vector unsigned char xor_vector;
|
||||
uint8_t flip_vec = 0x80;
|
||||
xor_vector = vec_splats(flip_vec);
|
||||
t1 = vec_perm(s1, s2, swiz1);
|
||||
t2 = vec_perm(s1, s2, swiz2);
|
||||
t3 = vec_perm(s3, s4, swiz1);
|
||||
t4 = vec_perm(s3, s4, swiz2);
|
||||
t5 = vec_perm(t1, t3, swiz3);
|
||||
t6 = vec_perm(t1, t3, swiz4);
|
||||
t7 = vec_perm(t2, t4, swiz3);
|
||||
t8 = vec_perm(t2, t4, swiz4);
|
||||
if (flip == true) {
|
||||
t5 = vec_xor(t5, xor_vector);
|
||||
t6 = vec_xor(t6, xor_vector);
|
||||
t7 = vec_xor(t7, xor_vector);
|
||||
t8 = vec_xor(t8, xor_vector);
|
||||
}
|
||||
vec_xst(t5, 0, vecOffset);
|
||||
vec_xst(t6, 0, vecOffset + 16);
|
||||
vec_xst(t7, 0, vecOffset + 32);
|
||||
vec_xst(t8, 0, vecOffset + 48);
|
||||
}
|
||||
|
||||
inline void unpack_q4_to_q8(vector signed char packed, vector signed char & lo, vector signed char & hi) {
|
||||
const vector signed char lowMask = vec_splats((signed char)0x0F);
|
||||
const vector signed char v8 = vec_splats((signed char)0x08);
|
||||
const vector unsigned char v4 = vec_splats((unsigned char)4);
|
||||
lo = vec_and(packed, lowMask);
|
||||
hi = vec_sr(packed, v4);
|
||||
lo = vec_sub(lo, v8);
|
||||
hi = vec_sub(hi, v8);
|
||||
}
|
||||
|
||||
inline void vector_permute_store_fp16(vec_t * c, unsigned char * vecOffset) {
|
||||
vec_t t[8], s[8];
|
||||
vec_t swiz1 = {0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23};
|
||||
vec_t swiz2 = {8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31};
|
||||
vec_t swiz3 = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
|
||||
vec_t swiz4 = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
|
||||
for (int i = 0; i < 4; i += 2) {
|
||||
t[i + 0] = vec_perm(c[i + 0], c[i + 1], swiz1);
|
||||
t[i + 1] = vec_perm(c[i + 0], c[i + 1], swiz2);
|
||||
}
|
||||
for (int i = 4; i < 8; i += 2) {
|
||||
t[i + 0] = vec_perm(c[i + 0], c[i + 1], swiz1);
|
||||
t[i + 1] = vec_perm(c[i + 0], c[i + 1], swiz2);
|
||||
}
|
||||
s[0] = vec_perm(t[0], t[2], swiz3);
|
||||
s[1] = vec_perm(t[0], t[2], swiz4);
|
||||
s[2] = vec_perm(t[1], t[3], swiz3);
|
||||
s[3] = vec_perm(t[1], t[3], swiz4);
|
||||
s[4] = vec_perm(t[4], t[6], swiz3);
|
||||
s[5] = vec_perm(t[4], t[6], swiz4);
|
||||
s[6] = vec_perm(t[5], t[7], swiz3);
|
||||
s[7] = vec_perm(t[5], t[7], swiz4);
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
vec_xst(s[i], 0, (vec_t *)(vecOffset + i * 16));
|
||||
}
|
||||
}
|
||||
|
||||
static inline void convert_and_scale_q8(vector signed char raw, vector float v_scale, vector unsigned short & out_hi, vector unsigned short & out_lo) {
|
||||
vector signed short i16_hi = vec_unpackh(raw);
|
||||
vector signed short i16_lo = vec_unpackl(raw);
|
||||
|
||||
vector float f_hi_h = vec_ctf(vec_unpackh(i16_hi), 0);
|
||||
vector float f_hi_l = vec_ctf(vec_unpackl(i16_hi), 0);
|
||||
vector float f_lo_h = vec_ctf(vec_unpackh(i16_lo), 0);
|
||||
vector float f_lo_l = vec_ctf(vec_unpackl(i16_lo), 0);
|
||||
out_hi = vec_pack_to_short_fp32(vec_mul(f_hi_h, v_scale), vec_mul(f_hi_l, v_scale));
|
||||
out_lo = vec_pack_to_short_fp32(vec_mul(f_lo_h, v_scale), vec_mul(f_lo_l, v_scale));
|
||||
}
|
||||
|
||||
void packNormal_q4_fp16(const block_q4_0 * a, int64_t lda, int rows, int blocks, unsigned char * vec) {
|
||||
unsigned char * vecOffset = vec;
|
||||
for (int i = 0; i < rows; i += 8) {
|
||||
const block_q4_0 * rows_base[8];
|
||||
for (int r = 0; r < 8; r++) {
|
||||
rows_base[r] = a + (i + r) * lda;
|
||||
}
|
||||
for (int blk = 0; blk < blocks; blk++) {
|
||||
vector unsigned short hp_res[8][4];
|
||||
for (int r = 0; r < 8; r++) {
|
||||
const block_q4_0 * current_blk = rows_base[r] + blk;
|
||||
vector float v_scale = vec_extract_fp32_from_shorth(vec_splats(current_blk->d));
|
||||
vector signed char v_qs = reinterpret_cast<vector signed char>(vec_xl(0, current_blk->qs));
|
||||
vector signed char c1, c2;
|
||||
unpack_q4_to_q8(v_qs, c1, c2);
|
||||
convert_and_scale_q8(c1, v_scale, hp_res[r][0], hp_res[r][1]);
|
||||
convert_and_scale_q8(c2, v_scale, hp_res[r][2], hp_res[r][3]);
|
||||
}
|
||||
for (int c = 0; c < 4; c++) {
|
||||
vector unsigned char c_arr[8];
|
||||
for (int r = 0; r < 8; r++) {
|
||||
c_arr[r] = (vector unsigned char)hp_res[r][c];
|
||||
}
|
||||
vector_permute_store_fp16((vec_t *)c_arr, vecOffset);
|
||||
vecOffset += 128;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <int chunk_size>
|
||||
static inline void pack_q8_block(const block_q8_0 * a, int64_t lda, int rows, int blocks, unsigned char * vec) {
|
||||
unsigned char * vecOffset = vec;
|
||||
const vec_t swiz1 = {0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23};
|
||||
const vec_t swiz2 = {8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31};
|
||||
const vec_t swiz3 = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
|
||||
const vec_t swiz4 = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
|
||||
|
||||
for (int i = 0; i < rows; i += chunk_size) {
|
||||
const block_q8_0 * rows_base[chunk_size];
|
||||
for (int r = 0; r < chunk_size; r++) {
|
||||
rows_base[r] = a + (i + r) * lda;
|
||||
}
|
||||
for (int blk = 0; blk < blocks; blk++) {
|
||||
vector unsigned short hp_res[chunk_size][4];
|
||||
for (int r = 0; r < chunk_size; r++) {
|
||||
const block_q8_0 * b = rows_base[r] + blk;
|
||||
vector float v_scale = vec_extract_fp32_from_shorth(vec_splats(b->d));
|
||||
vector signed char c[2];
|
||||
__vector_pair pair = __builtin_vsx_lxvp(0, (__vector_pair *)b->qs);
|
||||
__builtin_vsx_disassemble_pair(c, & pair);
|
||||
convert_and_scale_q8(c[0], v_scale, hp_res[r][0], hp_res[r][1]);
|
||||
convert_and_scale_q8(c[1], v_scale, hp_res[r][2], hp_res[r][3]);
|
||||
}
|
||||
for (int col = 0; col < 4; col++) {
|
||||
if constexpr (chunk_size == 8) {
|
||||
vec_t t[8];
|
||||
t[0] = vec_perm((vec_t)hp_res[0][col], (vec_t)hp_res[1][col], swiz1);
|
||||
t[1] = vec_perm((vec_t)hp_res[0][col], (vec_t)hp_res[1][col], swiz2);
|
||||
t[2] = vec_perm((vec_t)hp_res[2][col], (vec_t)hp_res[3][col], swiz1);
|
||||
t[3] = vec_perm((vec_t)hp_res[2][col], (vec_t)hp_res[3][col], swiz2);
|
||||
t[4] = vec_perm((vec_t)hp_res[4][col], (vec_t)hp_res[5][col], swiz1);
|
||||
t[5] = vec_perm((vec_t)hp_res[4][col], (vec_t)hp_res[5][col], swiz2);
|
||||
t[6] = vec_perm((vec_t)hp_res[6][col], (vec_t)hp_res[7][col], swiz1);
|
||||
t[7] = vec_perm((vec_t)hp_res[6][col], (vec_t)hp_res[7][col], swiz2);
|
||||
|
||||
vec_xst(vec_perm(t[0], t[2], swiz3), 0, (vec_t *)(vecOffset + 0));
|
||||
vec_xst(vec_perm(t[0], t[2], swiz4), 0, (vec_t *)(vecOffset + 16));
|
||||
vec_xst(vec_perm(t[1], t[3], swiz3), 0, (vec_t *)(vecOffset + 32));
|
||||
vec_xst(vec_perm(t[1], t[3], swiz4), 0, (vec_t *)(vecOffset + 48));
|
||||
vec_xst(vec_perm(t[4], t[6], swiz3), 0, (vec_t *)(vecOffset + 64));
|
||||
vec_xst(vec_perm(t[4], t[6], swiz4), 0, (vec_t *)(vecOffset + 80));
|
||||
vec_xst(vec_perm(t[5], t[7], swiz3), 0, (vec_t *)(vecOffset + 96));
|
||||
vec_xst(vec_perm(t[5], t[7], swiz4), 0, (vec_t *)(vecOffset + 112));
|
||||
vecOffset += 128;
|
||||
} else {
|
||||
vec_t t0 = vec_perm((vec_t)hp_res[0][col], (vec_t)hp_res[1][col], swiz1);
|
||||
vec_t t1 = vec_perm((vec_t)hp_res[0][col], (vec_t)hp_res[1][col], swiz2);
|
||||
vec_t t2 = vec_perm((vec_t)hp_res[2][col], (vec_t)hp_res[3][col], swiz1);
|
||||
vec_t t3 = vec_perm((vec_t)hp_res[2][col], (vec_t)hp_res[3][col], swiz2);
|
||||
|
||||
vec_xst(vec_perm(t0, t2, swiz3), 0, (vec_t *)(vecOffset + 0));
|
||||
vec_xst(vec_perm(t0, t2, swiz4), 0, (vec_t *)(vecOffset + 16));
|
||||
vec_xst(vec_perm(t1, t3, swiz3), 0, (vec_t *)(vecOffset + 32));
|
||||
vec_xst(vec_perm(t1, t3, swiz4), 0, (vec_t *)(vecOffset + 48));
|
||||
vecOffset += 64;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void packNormal_q8_fp16(const block_q8_0 * a, int64_t lda, int rows, int blocks, unsigned char * vec) {
|
||||
if (rows == 4) {
|
||||
pack_q8_block<4>(a, lda, rows, blocks, vec);
|
||||
} else {
|
||||
pack_q8_block<8>(a, lda, rows, blocks, vec);
|
||||
}
|
||||
}
|
||||
|
||||
template<int size>
|
||||
void packNormalInt4(const TA * a, int64_t lda, int rows, int cols, int8_t * vec, std::array<int, size> & comparray) {
|
||||
int64_t i, j;
|
||||
TA *aoffset = NULL;
|
||||
int8_t *vecOffset = NULL;
|
||||
TA *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL;
|
||||
TA *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL;
|
||||
TA * aoffset = NULL;
|
||||
int8_t * vecOffset = NULL;
|
||||
TA * aoffset1 = NULL, * aoffset2 = NULL, * aoffset3 = NULL, * aoffset4 = NULL;
|
||||
TA * aoffset5 = NULL, * aoffset6 = NULL, * aoffset7 = NULL, * aoffset8 = NULL;
|
||||
vector signed char c1[2] = {0}, c2[2] = {0}, c3[2] = {0}, c4[2] = {0};
|
||||
vector signed char c5[2] = {0}, c6[2] = {0}, c7[2] = {0}, c8[2] = {0};
|
||||
aoffset = const_cast<TA*>(a);
|
||||
aoffset = const_cast<TA *>(a);
|
||||
vecOffset = vec;
|
||||
j = (rows >> 3);
|
||||
if (j > 0) {
|
||||
@@ -2363,18 +2620,18 @@ class tinyBLAS_HP16_PPC {
|
||||
c7[1] = reinterpret_cast<vector signed char>(vec_xl(0, aoffset7->qs));
|
||||
c8[1] = reinterpret_cast<vector signed char>(vec_xl(0, aoffset8->qs));
|
||||
|
||||
process_q4_elements(c1, &comparray[0]);
|
||||
process_q4_elements(c2, &comparray[1]);
|
||||
process_q4_elements(c3, &comparray[2]);
|
||||
process_q4_elements(c4, &comparray[3]);
|
||||
process_q4_elements(c5, &comparray[4]);
|
||||
process_q4_elements(c6, &comparray[5]);
|
||||
process_q4_elements(c7, &comparray[6]);
|
||||
process_q4_elements(c8, &comparray[7]);
|
||||
process_q4_elements(c1, & comparray[0]);
|
||||
process_q4_elements(c2, & comparray[1]);
|
||||
process_q4_elements(c3, & comparray[2]);
|
||||
process_q4_elements(c4, & comparray[3]);
|
||||
process_q4_elements(c5, & comparray[4]);
|
||||
process_q4_elements(c6, & comparray[5]);
|
||||
process_q4_elements(c7, & comparray[6]);
|
||||
process_q4_elements(c8, & comparray[7]);
|
||||
vector_permute_store<int8_t, vector signed char>(c1[0], c2[0], c3[0], c4[0], vecOffset, false);
|
||||
vector_permute_store<int8_t, vector signed char>(c1[1], c2[1], c3[1], c4[1], vecOffset+64, false);
|
||||
vector_permute_store<int8_t, vector signed char>(c5[0], c6[0], c7[0], c8[0], vecOffset+128, false);
|
||||
vector_permute_store<int8_t, vector signed char>(c5[1], c6[1], c7[1], c8[1], vecOffset+192, false);
|
||||
vector_permute_store<int8_t, vector signed char>(c1[1], c2[1], c3[1], c4[1], vecOffset + 64, false);
|
||||
vector_permute_store<int8_t, vector signed char>(c5[0], c6[0], c7[0], c8[0], vecOffset + 128, false);
|
||||
vector_permute_store<int8_t, vector signed char>(c5[1], c6[1], c7[1], c8[1], vecOffset + 192, false);
|
||||
aoffset1 += lda;
|
||||
aoffset2 += lda;
|
||||
aoffset3 += lda;
|
||||
@@ -2405,12 +2662,12 @@ class tinyBLAS_HP16_PPC {
|
||||
c3[1] = reinterpret_cast<vector signed char>(vec_xl(0, aoffset3->qs));
|
||||
c4[1] = reinterpret_cast<vector signed char>(vec_xl(0, aoffset4->qs));
|
||||
|
||||
process_q4_elements(c1, &comparray[0]);
|
||||
process_q4_elements(c2, &comparray[1]);
|
||||
process_q4_elements(c3, &comparray[2]);
|
||||
process_q4_elements(c4, &comparray[3]);
|
||||
process_q4_elements(c1, & comparray[0]);
|
||||
process_q4_elements(c2, & comparray[1]);
|
||||
process_q4_elements(c3, & comparray[2]);
|
||||
process_q4_elements(c4, & comparray[3]);
|
||||
vector_permute_store<int8_t, vector signed char>(c1[0], c2[0], c3[0], c4[0], vecOffset, false);
|
||||
vector_permute_store<int8_t, vector signed char>(c1[1], c2[1], c3[1], c4[1], vecOffset+64, false);
|
||||
vector_permute_store<int8_t, vector signed char>(c1[1], c2[1], c3[1], c4[1], vecOffset + 64, false);
|
||||
aoffset1 += lda;
|
||||
aoffset2 += lda;
|
||||
aoffset3 += lda;
|
||||
@@ -2434,12 +2691,12 @@ class tinyBLAS_HP16_PPC {
|
||||
case 1: c1[1] = reinterpret_cast<vector signed char>(vec_xl(0, aoffset1->qs));
|
||||
break;
|
||||
}
|
||||
process_q4_elements(c1, &comparray[0]);
|
||||
process_q4_elements(c2, &comparray[1]);
|
||||
process_q4_elements(c3, &comparray[2]);
|
||||
process_q4_elements(c4, &comparray[3]);
|
||||
process_q4_elements(c1, & comparray[0]);
|
||||
process_q4_elements(c2, & comparray[1]);
|
||||
process_q4_elements(c3, & comparray[2]);
|
||||
process_q4_elements(c4, & comparray[3]);
|
||||
vector_permute_store<int8_t, vector signed char>(c1[0], c2[0], c3[0], c4[0], vecOffset, false);
|
||||
vector_permute_store<int8_t, vector signed char>(c1[1], c2[1], c3[1], c4[1], vecOffset+64, false);
|
||||
vector_permute_store<int8_t, vector signed char>(c1[1], c2[1], c3[1], c4[1], vecOffset + 64, false);
|
||||
aoffset1 += lda;
|
||||
aoffset2 += lda;
|
||||
aoffset3 += lda;
|
||||
@@ -2450,39 +2707,38 @@ class tinyBLAS_HP16_PPC {
|
||||
}
|
||||
}
|
||||
|
||||
template<typename TA>
|
||||
template<typename VA, typename VB>
|
||||
void tinyBLAS_Q0_PPC<TA>::packNormal(const block_q8_0* a, int64_t lda, int rows, int cols, VA* vec, bool flip) {
|
||||
void packNormal(const block_q8_0 * a, int64_t lda, int rows, int cols, VA * vec, bool flip) {
|
||||
int64_t i, j;
|
||||
block_q8_0 *aoffset = NULL;
|
||||
VA *vecOffset = NULL;
|
||||
block_q8_0* aoffsets[8];
|
||||
block_q8_0 * aoffset = NULL;
|
||||
VA * vecOffset = NULL;
|
||||
block_q8_0 * aoffsets[8];
|
||||
__vector_pair arr[8];
|
||||
VB c[8][2] = {0};
|
||||
VB c1[8] = {0}; VB c2[8] = {0};
|
||||
aoffset = const_cast<block_q8_0*>(a);
|
||||
aoffset = const_cast<block_q8_0 *>(a);
|
||||
vecOffset = vec;
|
||||
j = (rows >> 3);
|
||||
if (j > 0) {
|
||||
do {
|
||||
aoffsets[0] = aoffset;
|
||||
for (int it = 1; it < 8; it++)
|
||||
aoffsets[it] = aoffsets[it-1] + lda;
|
||||
aoffsets[it] = aoffsets[it - 1] + lda;
|
||||
aoffset += 8 * lda;
|
||||
|
||||
i = (cols >> 3);
|
||||
if (i > 0) {
|
||||
do {
|
||||
for (int it = 0; it < 8; it++) {
|
||||
arr[it] = __builtin_vsx_lxvp(0, (__vector_pair*)aoffsets[it]->qs);
|
||||
__builtin_vsx_disassemble_pair(c[it], &arr[it]);
|
||||
arr[it] = __builtin_vsx_lxvp(0, (__vector_pair *)aoffsets[it]->qs);
|
||||
__builtin_vsx_disassemble_pair(c[it], & arr[it]);
|
||||
c1[it] = c[it][0];
|
||||
c2[it] = c[it][1];
|
||||
}
|
||||
vector_permute_store<VA, VB>(c1[0], c1[1], c1[2], c1[3], vecOffset, flip);
|
||||
vector_permute_store<VA, VB>(c2[0], c2[1], c2[2], c2[3], vecOffset+64, flip);
|
||||
vector_permute_store<VA, VB>(c1[4], c1[5], c1[6], c1[7], vecOffset+128, flip);
|
||||
vector_permute_store<VA, VB>(c2[4], c2[5], c2[6], c2[7], vecOffset+192, flip);
|
||||
vector_permute_store<VA, VB>(c2[0], c2[1], c2[2], c2[3], vecOffset + 64, flip);
|
||||
vector_permute_store<VA, VB>(c1[4], c1[5], c1[6], c1[7], vecOffset + 128, flip);
|
||||
vector_permute_store<VA, VB>(c2[4], c2[5], c2[6], c2[7], vecOffset + 192, flip);
|
||||
for (int it = 0; it < 8; it++)
|
||||
aoffsets[it] += lda;
|
||||
vecOffset += 256;
|
||||
@@ -2501,13 +2757,13 @@ class tinyBLAS_HP16_PPC {
|
||||
if (i > 0) {
|
||||
do {
|
||||
for (int it = 0; it < 4; it++) {
|
||||
arr[it] = __builtin_vsx_lxvp(0, (__vector_pair*)aoffsets[it]->qs);
|
||||
__builtin_vsx_disassemble_pair(c[it], &arr[it]);
|
||||
arr[it] = __builtin_vsx_lxvp(0, (__vector_pair *)aoffsets[it]->qs);
|
||||
__builtin_vsx_disassemble_pair(c[it], & arr[it]);
|
||||
c1[it] = c[it][0];
|
||||
c2[it] = c[it][1];
|
||||
}
|
||||
vector_permute_store<VA, VB>(c1[0], c1[1], c1[2], c1[3], vecOffset, flip);
|
||||
vector_permute_store<VA, VB>(c2[0], c2[1], c2[2], c2[3], vecOffset+64, flip);
|
||||
vector_permute_store<VA, VB>(c2[0], c2[1], c2[2], c2[3], vecOffset + 64, flip);
|
||||
for (int it = 0; it < 4; it++) {
|
||||
aoffsets[it] += lda;
|
||||
}
|
||||
@@ -2520,24 +2776,24 @@ class tinyBLAS_HP16_PPC {
|
||||
if (rows & 3) {
|
||||
aoffsets[0] = aoffset;
|
||||
for (int it = 1; it < 3; it++ )
|
||||
aoffsets[it] = aoffsets[it-1] + lda;
|
||||
aoffsets[it] = aoffsets[it - 1] + lda;
|
||||
i = (cols >> 3);
|
||||
if (i > 0) {
|
||||
do {
|
||||
switch(rows) {
|
||||
case 3: arr[2] = __builtin_vsx_lxvp(0, (__vector_pair*)aoffsets[2]->qs);
|
||||
__builtin_vsx_disassemble_pair(c[2], &arr[2]);
|
||||
case 3: arr[2] = __builtin_vsx_lxvp(0, (__vector_pair *)aoffsets[2]->qs);
|
||||
__builtin_vsx_disassemble_pair(c[2], & arr[2]);
|
||||
c1[2] = c[2][0]; c2[2] = c[2][1];
|
||||
case 2: arr[1] = __builtin_vsx_lxvp(0, (__vector_pair*)aoffsets[1]->qs);
|
||||
__builtin_vsx_disassemble_pair(c[1], &arr[1]);
|
||||
case 2: arr[1] = __builtin_vsx_lxvp(0, (__vector_pair *)aoffsets[1]->qs);
|
||||
__builtin_vsx_disassemble_pair(c[1], & arr[1]);
|
||||
c1[1] = c[1][0]; c2[1] = c[1][1];
|
||||
case 1: arr[0] = __builtin_vsx_lxvp(0, (__vector_pair*)aoffsets[0]->qs);
|
||||
__builtin_vsx_disassemble_pair(c[0], &arr[0]);
|
||||
case 1: arr[0] = __builtin_vsx_lxvp(0, (__vector_pair *)aoffsets[0]->qs);
|
||||
__builtin_vsx_disassemble_pair(c[0], & arr[0]);
|
||||
c1[0] = c[0][0]; c2[0] = c[0][1];
|
||||
break;
|
||||
}
|
||||
vector_permute_store<VA, VB>(c1[0], c1[1], c1[2], c1[3], vecOffset, flip);
|
||||
vector_permute_store<VA, VB>(c2[0], c2[1], c2[2], c2[3], vecOffset+64, flip);
|
||||
vector_permute_store<VA, VB>(c2[0], c2[1], c2[2], c2[3], vecOffset + 64, flip);
|
||||
for (int it = 0; it < 3; it++)
|
||||
aoffsets[it] += lda;
|
||||
vecOffset += 128;
|
||||
@@ -2547,8 +2803,7 @@ class tinyBLAS_HP16_PPC {
|
||||
}
|
||||
}
|
||||
|
||||
template<typename TA>
|
||||
void tinyBLAS_Q0_PPC<TA>::mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
|
||||
void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
|
||||
int m_rem = MIN(m - m0, 16);
|
||||
int n_rem = MIN(n - n0, 16);
|
||||
|
||||
@@ -2585,8 +2840,7 @@ class tinyBLAS_HP16_PPC {
|
||||
}
|
||||
|
||||
|
||||
template<typename TA>
|
||||
void tinyBLAS_Q0_PPC<TA>::KERNEL_4x8(int64_t ii, int64_t jj) {
|
||||
void KERNEL_4x8(int64_t ii, int64_t jj) {
|
||||
vec_t vec_A[8], vec_B[16] = {0};
|
||||
acc_t acc_0, acc_1;
|
||||
std::array<int, 4> comparray {};
|
||||
@@ -2594,26 +2848,26 @@ class tinyBLAS_HP16_PPC {
|
||||
vector float vs[8] = {0};
|
||||
bool isAblock_q4 = std::is_same_v<TA, block_q4_0>;
|
||||
for (int l = 0; l < k; l++) {
|
||||
__builtin_mma_xxsetaccz(&acc_0);
|
||||
__builtin_mma_xxsetaccz(&acc_1);
|
||||
__builtin_mma_xxsetaccz(& acc_0);
|
||||
__builtin_mma_xxsetaccz(& acc_1);
|
||||
if (std::is_same_v<TA, block_q4_0>) {
|
||||
packNormalInt4<4>((A+(ii*lda)+l), lda, 4, 4, (int8_t*)vec_A, comparray);
|
||||
packNormalInt4<4>((A + (ii * lda) + l), lda, 4, 4, (int8_t *)vec_A, comparray);
|
||||
} else {
|
||||
packNormal<int8_t, vector signed char>((const block_q8_0*)(A+(ii*lda)+l), lda, 4, 8, (int8_t*)vec_A, false);
|
||||
packNormal<int8_t, vector signed char>((const block_q8_0 *)(A + (ii * lda) + l), lda, 4, 8, (int8_t *)vec_A, false);
|
||||
}
|
||||
packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 8, 8, (uint8_t*)vec_B, true);
|
||||
packNormal<uint8_t, vector unsigned char>((B + (jj * ldb) + l), ldb, 8, 8, (uint8_t *)vec_B, true);
|
||||
for(int x = 0; x < 8; x++) {
|
||||
__builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
|
||||
__builtin_mma_xvi8ger4pp(&acc_1, vec_A[x], vec_B[x+8]);
|
||||
__builtin_mma_xvi8ger4pp(& acc_0, vec_A[x], vec_B[x]);
|
||||
__builtin_mma_xvi8ger4pp(& acc_1, vec_A[x], vec_B[x+8]);
|
||||
}
|
||||
for (int I = 0; I<4; I++) {
|
||||
for (int J = 0; J<4; J++) {
|
||||
*((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J)*ldb)+l)->d));
|
||||
*((float*)&vs[I+4]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J+4)*ldb)+l)->d));
|
||||
*((float *)& vs[I] + J) = (unhalf((A + ((ii + I) * lda) + l)->d) * unhalf((B + ((jj + J) * ldb) + l)->d));
|
||||
*((float *)& vs[I + 4] + J) = (unhalf((A +((ii + I) * lda) + l)->d) * unhalf((B + ((jj + J + 4) * ldb) + l)->d));
|
||||
}
|
||||
}
|
||||
if (!isAblock_q4) {
|
||||
auto aoffset = A+(ii*lda)+l;
|
||||
auto aoffset = A + (ii * lda) + l;
|
||||
for (int i = 0; i < 4; i++) {
|
||||
comparray[i] = 0;
|
||||
int ca = 0;
|
||||
@@ -2624,15 +2878,14 @@ class tinyBLAS_HP16_PPC {
|
||||
aoffset += lda;
|
||||
}
|
||||
}
|
||||
compute(&acc_0, 0, 0, comparray, vs, fin_res);
|
||||
compute(&acc_1, 0, 4, comparray, vs, fin_res);
|
||||
compute(& acc_0, 0, 0, comparray, vs, fin_res);
|
||||
compute(& acc_1, 0, 4, comparray, vs, fin_res);
|
||||
}
|
||||
save_res(ii, jj, 0, fin_res);
|
||||
save_res(ii, jj+4, 4, fin_res);
|
||||
save_res(ii, jj + 4, 4, fin_res);
|
||||
}
|
||||
|
||||
template<typename TA>
|
||||
void tinyBLAS_Q0_PPC<TA>::KERNEL_8x4(int64_t ii, int64_t jj) {
|
||||
void KERNEL_8x4(int64_t ii, int64_t jj) {
|
||||
vec_t vec_A[16], vec_B[8] = {0};
|
||||
acc_t acc_0, acc_1;
|
||||
std::array<int, 8> comparray {};
|
||||
@@ -2640,25 +2893,25 @@ class tinyBLAS_HP16_PPC {
|
||||
vector float vs[8] = {0};
|
||||
bool isAblock_q4 = std::is_same_v<TA, block_q4_0>;
|
||||
for (int l = 0; l < k; l++) {
|
||||
__builtin_mma_xxsetaccz(&acc_0);
|
||||
__builtin_mma_xxsetaccz(&acc_1);
|
||||
__builtin_mma_xxsetaccz(& acc_0);
|
||||
__builtin_mma_xxsetaccz(& acc_1);
|
||||
if (std::is_same_v<TA, block_q4_0>) {
|
||||
packNormalInt4<8>((A+(ii*lda)+l), lda, 8, 4, (int8_t*)vec_A, comparray);
|
||||
packNormalInt4<8>((A + (ii * lda) + l), lda, 8, 4, (int8_t *)vec_A, comparray);
|
||||
} else {
|
||||
packNormal<int8_t, vector signed char>((const block_q8_0*)(A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A, false);
|
||||
packNormal<int8_t, vector signed char>((const block_q8_0 *)(A + (ii * lda) + l), lda, 8, 8, (int8_t *)vec_A, false);
|
||||
}
|
||||
packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 4, 8, (uint8_t*)vec_B, true);
|
||||
packNormal<uint8_t, vector unsigned char>((B + (jj * ldb) + l), ldb, 4, 8, (uint8_t *)vec_B, true);
|
||||
for(int x = 0; x < 8; x++) {
|
||||
__builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
|
||||
__builtin_mma_xvi8ger4pp(&acc_1, vec_A[x+8], vec_B[x]);
|
||||
__builtin_mma_xvi8ger4pp(& acc_0, vec_A[x], vec_B[x]);
|
||||
__builtin_mma_xvi8ger4pp(& acc_1, vec_A[x + 8], vec_B[x]);
|
||||
}
|
||||
for (int I = 0; I<8; I++) {
|
||||
for (int J = 0; J<4; J++) {
|
||||
*((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J)*ldb)+l)->d));
|
||||
for (int I = 0; I < 8; I++) {
|
||||
for (int J = 0; J < 4; J++) {
|
||||
*((float *)&vs[I] + J) = (unhalf((A + ((ii + I) * lda) + l)->d) * unhalf((B + ((jj + J) * ldb) + l)->d));
|
||||
}
|
||||
}
|
||||
if (!isAblock_q4) {
|
||||
auto aoffset = A+(ii*lda)+l;
|
||||
auto aoffset = A + (ii * lda) + l;
|
||||
for (int i = 0; i < 8; i++) {
|
||||
comparray[i] = 0;
|
||||
int ca = 0;
|
||||
@@ -2669,15 +2922,14 @@ class tinyBLAS_HP16_PPC {
|
||||
aoffset += lda;
|
||||
}
|
||||
}
|
||||
compute(&acc_0, 0, 0, comparray, vs, fin_res);
|
||||
compute(&acc_1, 4, 4, comparray, vs, fin_res);
|
||||
compute(& acc_0, 0, 0, comparray, vs, fin_res);
|
||||
compute(& acc_1, 4, 4, comparray, vs, fin_res);
|
||||
}
|
||||
save_res(ii, jj, 0, fin_res);
|
||||
save_res(ii+4, jj, 4, fin_res);
|
||||
save_res(ii + 4, jj, 4, fin_res);
|
||||
}
|
||||
|
||||
template<typename TA>
|
||||
void tinyBLAS_Q0_PPC<TA>::KERNEL_8x8(int64_t ii, int64_t jj) {
|
||||
void KERNEL_8x8(int64_t ii, int64_t jj) {
|
||||
vec_t vec_A[16], vec_B[16] = {0};
|
||||
acc_t acc_0, acc_1, acc_2, acc_3;
|
||||
acc_t acc_4, acc_5, acc_6, acc_7;
|
||||
@@ -2686,30 +2938,30 @@ class tinyBLAS_HP16_PPC {
|
||||
vector float vs[16] = {0};
|
||||
bool isAblock_q4 = std::is_same_v<TA, block_q4_0>;
|
||||
for (int l = 0; l < k; l++) {
|
||||
__builtin_mma_xxsetaccz(&acc_0);
|
||||
__builtin_mma_xxsetaccz(&acc_1);
|
||||
__builtin_mma_xxsetaccz(&acc_2);
|
||||
__builtin_mma_xxsetaccz(&acc_3);
|
||||
__builtin_mma_xxsetaccz(& acc_0);
|
||||
__builtin_mma_xxsetaccz(& acc_1);
|
||||
__builtin_mma_xxsetaccz(& acc_2);
|
||||
__builtin_mma_xxsetaccz(& acc_3);
|
||||
if (std::is_same_v<TA, block_q4_0>) {
|
||||
packNormalInt4<8>((A+(ii*lda)+l), lda, 8, 4, (int8_t*)vec_A, comparray);
|
||||
packNormalInt4<8>((A + (ii * lda) + l), lda, 8, 4, (int8_t *)vec_A, comparray);
|
||||
} else {
|
||||
packNormal<int8_t, vector signed char>((const block_q8_0*)(A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A, false);
|
||||
packNormal<int8_t, vector signed char>((const block_q8_0 *)(A + (ii * lda) + l), lda, 8, 8, (int8_t *)vec_A, false);
|
||||
}
|
||||
packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 8, 8, (uint8_t*)vec_B, true);
|
||||
packNormal<uint8_t, vector unsigned char>((B + (jj * ldb) + l), ldb, 8, 8, (uint8_t *)vec_B, true);
|
||||
for(int x = 0; x < 8; x++) {
|
||||
__builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
|
||||
__builtin_mma_xvi8ger4pp(&acc_1, vec_A[x+8], vec_B[x]);
|
||||
__builtin_mma_xvi8ger4pp(&acc_2, vec_A[x], vec_B[x+8]);
|
||||
__builtin_mma_xvi8ger4pp(&acc_3, vec_A[x+8], vec_B[x+8]);
|
||||
__builtin_mma_xvi8ger4pp(& acc_0, vec_A[x], vec_B[x]);
|
||||
__builtin_mma_xvi8ger4pp(& acc_1, vec_A[x + 8], vec_B[x]);
|
||||
__builtin_mma_xvi8ger4pp(& acc_2, vec_A[x], vec_B[x + 8]);
|
||||
__builtin_mma_xvi8ger4pp(& acc_3, vec_A[x + 8], vec_B[x + 8]);
|
||||
}
|
||||
for (int I = 0; I<8; I++) {
|
||||
for (int J = 0; J<4; J++) {
|
||||
*((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J)*ldb)+l)->d));
|
||||
*((float*)&vs[I+8]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J+4)*ldb)+l)->d));
|
||||
for (int I = 0; I < 8 ; I++) {
|
||||
for (int J = 0; J < 4; J++) {
|
||||
*((float *)& vs[I] + J) = (unhalf((A + ((ii + I) * lda) + l)->d) * unhalf((B + ((jj + J) * ldb) + l)->d));
|
||||
*((float *)& vs[I + 8] + J) = (unhalf((A + ((ii + I) * lda) + l)->d) * unhalf((B + ((jj + J + 4) * ldb) + l)->d));
|
||||
}
|
||||
}
|
||||
if (!isAblock_q4) {
|
||||
auto aoffset = A+(ii*lda)+l;
|
||||
auto aoffset = A + (ii * lda) + l;
|
||||
for (int i = 0; i < 8; i++) {
|
||||
comparray[i] = 0;
|
||||
int ca = 0;
|
||||
@@ -2720,19 +2972,99 @@ class tinyBLAS_HP16_PPC {
|
||||
aoffset += lda;
|
||||
}
|
||||
}
|
||||
compute(&acc_0, 0, 0, comparray, vs, fin_res);
|
||||
compute(&acc_1, 4, 4, comparray, vs, fin_res);
|
||||
compute(&acc_2, 0, 8, comparray, vs, fin_res);
|
||||
compute(&acc_3, 4, 12, comparray, vs, fin_res);
|
||||
compute(& acc_0, 0, 0, comparray, vs, fin_res);
|
||||
compute(& acc_1, 4, 4, comparray, vs, fin_res);
|
||||
compute(& acc_2, 0, 8, comparray, vs, fin_res);
|
||||
compute(& acc_3, 4, 12, comparray, vs, fin_res);
|
||||
}
|
||||
save_res(ii, jj, 0, fin_res);
|
||||
save_res(ii+4, jj, 4, fin_res);
|
||||
save_res(ii, jj+4, 8, fin_res);
|
||||
save_res(ii+4, jj+4, 12, fin_res);
|
||||
save_res(ii + 4, jj, 4, fin_res);
|
||||
save_res(ii, jj + 4, 8, fin_res);
|
||||
save_res(ii + 4, jj + 4, 12, fin_res);
|
||||
}
|
||||
|
||||
template<typename TA>
|
||||
void tinyBLAS_Q0_PPC<TA>::gemm_small(int64_t m0, int64_t m, int64_t n0, int64_t n, int RM, int RN) {
|
||||
void KERNEL_Q0(int64_t ii, int64_t jj, int64_t mc, int64_t nc, int64_t kc, int64_t l, vec_t * vec_A, vec_t * vec_B) {
|
||||
acc_t acc[8];
|
||||
for (int i = 0; i < mc ; i += 16) {
|
||||
for (int j = 0; j < nc; j += 8) {
|
||||
int A0_base = (i / 16) * (2 * 32 * kc);
|
||||
int B0_base = (j / 8) * (32 * kc);
|
||||
for (int x = 0; x < 8; x++) {
|
||||
__builtin_mma_xxsetaccz(&acc[x]);
|
||||
}
|
||||
for (int64_t kk = 0; kk < kc; kk++) {
|
||||
int A0_block_idx = A0_base + kk * 32;
|
||||
int B0_block_idx = B0_base + kk * 32;
|
||||
int A1_block_idx = A0_block_idx + 32 * kc;
|
||||
int B1_block_idx = B0_block_idx + 32 * kc;
|
||||
vec_t * A0_block = & vec_A[A0_block_idx];
|
||||
vec_t * B0_block = & vec_B[B0_block_idx];
|
||||
vec_t * A1_block = & vec_A[A1_block_idx];
|
||||
for (int it = 0; it < 4; it++) {
|
||||
for (int x = 0; x < 4; x++) {
|
||||
__builtin_mma_xvf16ger2pp(& acc[0], A0_block[8 * it + x], B0_block[8 * it + x]);
|
||||
__builtin_mma_xvf16ger2pp(& acc[1], A0_block[8 * it + x], B0_block[8 * it + x + 4]);
|
||||
__builtin_mma_xvf16ger2pp(& acc[2], A0_block[8 * it + x + 4], B0_block[8 * it + x]);
|
||||
__builtin_mma_xvf16ger2pp(& acc[3], A0_block[8 * it + x + 4], B0_block[8 * it + x + 4]);
|
||||
__builtin_mma_xvf16ger2pp(& acc[4], A1_block[8 * it + x], B0_block[8 * it + x]);
|
||||
__builtin_mma_xvf16ger2pp(& acc[5], A1_block[8 * it + x], B0_block[8 * it+ x + 4]);
|
||||
__builtin_mma_xvf16ger2pp(& acc[6], A1_block[8 * it + x + 4], B0_block[8 * it + x]);
|
||||
__builtin_mma_xvf16ger2pp(& acc[7], A1_block[8 * it + x + 4], B0_block[8 * it + x + 4]);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (l == 0) {
|
||||
save_acc(& acc[0], ii + i, jj + j);
|
||||
save_acc(& acc[1], ii + i, jj + j + 4);
|
||||
save_acc(& acc[2], ii + i + 4, jj + j);
|
||||
save_acc(& acc[3], ii + i + 4, jj + j + 4);
|
||||
save_acc(& acc[4], ii + i + 8, jj + j);
|
||||
save_acc(& acc[5], ii + i + 8, jj + j + 4);
|
||||
save_acc(& acc[6], ii + i + 12, jj + j);
|
||||
save_acc(& acc[7], ii + i + 12, jj + j + 4);
|
||||
} else {
|
||||
add_save_acc(& acc[0], ii + i, jj + j);
|
||||
add_save_acc(& acc[1], ii + i, jj + j + 4);
|
||||
add_save_acc(& acc[2], ii + i + 4, jj + j);
|
||||
add_save_acc(& acc[3], ii + i + 4, jj + j + 4);
|
||||
add_save_acc(& acc[4], ii + i + 8, jj + j);
|
||||
add_save_acc(& acc[5], ii + i + 8, jj + j + 4);
|
||||
add_save_acc(& acc[6], ii + i + 12, jj + j);
|
||||
add_save_acc(& acc[7], ii + i + 12, jj + j + 4);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void matmul_tiled(int64_t m, int64_t n, int64_t mc, int64_t nc, int64_t kc) {
|
||||
vec_t A_pack[mc * kc * 4];
|
||||
vec_t B_pack[nc * kc * 4];
|
||||
constexpr bool is_Ablock_q4 = std::is_same_v<TA, block_q4_0>;
|
||||
int64_t ytiles = m / mc;
|
||||
int64_t xtiles = n / nc;
|
||||
int64_t tiles = xtiles * ytiles;
|
||||
int64_t duty = (tiles + nth - 1) / nth;
|
||||
int64_t start = duty * ith;
|
||||
int64_t end = start + duty;
|
||||
if (end > tiles) {
|
||||
end = tiles;
|
||||
}
|
||||
for (int64_t job = start; job < end; ++job) {
|
||||
int64_t ii = (job / xtiles) * mc;
|
||||
int64_t jj = (job % xtiles) * nc;
|
||||
for (int64_t kk = 0; kk < k; kk += kc) {
|
||||
if constexpr(is_Ablock_q4) {
|
||||
packNormal_q4_fp16(A + ii * lda + kk, lda, mc, kc, (uint8_t *)A_pack);
|
||||
} else {
|
||||
packNormal_q8_fp16(A + ii * lda + kk, lda, mc, kc, (uint8_t *)A_pack);
|
||||
}
|
||||
packNormal_q8_fp16(B + jj * ldb + kk, ldb, nc, kc, (uint8_t *)B_pack);
|
||||
KERNEL_Q0(ii, jj, mc, nc, kc, kk, A_pack, B_pack);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void gemm_small(int64_t m0, int64_t m, int64_t n0, int64_t n, int RM, int RN) {
|
||||
int64_t ytiles = (m - m0) / RM;
|
||||
int64_t xtiles = (n - n0) / RN;
|
||||
int64_t tiles = xtiles * ytiles;
|
||||
@@ -2754,32 +3086,32 @@ class tinyBLAS_HP16_PPC {
|
||||
vector float fin_res[4] = {0};
|
||||
vector float vs[4] = {0};
|
||||
vector float CA[4] = {0};
|
||||
__builtin_prefetch((A+(ii*lda)+0)->qs, 0, 1); // prefetch first value
|
||||
__builtin_prefetch((B+(jj*ldb)+0)->qs, 0, 1); // prefetch first value
|
||||
__builtin_prefetch((A + (ii * lda) + 0)->qs, 0, 1); // prefetch first value
|
||||
__builtin_prefetch((B + (jj * ldb) + 0)->qs, 0, 1); // prefetch first value
|
||||
for (int l = 0; l < k; l++) {
|
||||
__builtin_prefetch((A+(ii*lda)+(l+1))->qs, 0, 1); // prefetch one loop ahead
|
||||
__builtin_prefetch((B+(jj*ldb)+(l+1))->qs, 0, 1); // prefetch one loop ahead
|
||||
__builtin_mma_xxsetaccz(&acc_0);
|
||||
__builtin_prefetch((A + (ii * lda) + (l + 1))->qs, 0, 1); // prefetch one loop ahead
|
||||
__builtin_prefetch((B + (jj * ldb) + (l + 1))->qs, 0, 1); // prefetch one loop ahead
|
||||
__builtin_mma_xxsetaccz(& acc_0);
|
||||
if (isAblock_q4) {
|
||||
packNormalInt4<4>((A+(ii*lda)+l), lda, RM, 4, (int8_t*)vec_A, comparray);
|
||||
packNormalInt4<4>((A + (ii * lda) + l), lda, RM, 4, (int8_t *)vec_A, comparray);
|
||||
} else {
|
||||
packNormal<int8_t, vector signed char>((const block_q8_0*)(A+(ii*lda)+l), lda, RM, 8, (int8_t*)vec_A, false);
|
||||
packNormal<int8_t, vector signed char>((const block_q8_0 *)(A + (ii * lda) + l), lda, RM, 8, (int8_t *)vec_A, false);
|
||||
}
|
||||
packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, RN, 8, (uint8_t*)vec_B, true);
|
||||
for(int x = 0; x < 8; x+=4) {
|
||||
__builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
|
||||
__builtin_mma_xvi8ger4pp(&acc_0, vec_A[x+1], vec_B[x+1]);
|
||||
__builtin_mma_xvi8ger4pp(&acc_0, vec_A[x+2], vec_B[x+2]);
|
||||
__builtin_mma_xvi8ger4pp(&acc_0, vec_A[x+3], vec_B[x+3]);
|
||||
packNormal<uint8_t, vector unsigned char>((B + (jj * ldb) + l), ldb, RN, 8, (uint8_t *)vec_B, true);
|
||||
for (int x = 0; x < 8; x += 4) {
|
||||
__builtin_mma_xvi8ger4pp(& acc_0, vec_A[x], vec_B[x]);
|
||||
__builtin_mma_xvi8ger4pp(& acc_0, vec_A[x + 1], vec_B[x + 1]);
|
||||
__builtin_mma_xvi8ger4pp(& acc_0, vec_A[x + 2], vec_B[x + 2]);
|
||||
__builtin_mma_xvi8ger4pp(& acc_0, vec_A[x + 3], vec_B[x + 3]);
|
||||
}
|
||||
for (int I = 0; I<RM; I++) {
|
||||
for (int J = 0; J<RN; J++) {
|
||||
*((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J)*ldb)+l)->d));
|
||||
for (int I = 0; I < RM; I++) {
|
||||
for (int J = 0; J < RN; J++) {
|
||||
*((float*)&vs[I] + J) = (unhalf((A + ((ii + I) * lda) + l)->d) * unhalf((B + ((jj + J) * ldb) + l)->d));
|
||||
}
|
||||
}
|
||||
__builtin_mma_disassemble_acc(vec_C, &acc_0);
|
||||
__builtin_mma_disassemble_acc(vec_C, & acc_0);
|
||||
if (!isAblock_q4) {
|
||||
auto aoffset = A+(ii*lda)+l;
|
||||
auto aoffset = A + (ii * lda) + l;
|
||||
for (int i = 0; i < RM; i++) {
|
||||
comparray[i] = 0;
|
||||
int ca = 0;
|
||||
@@ -2800,9 +3132,21 @@ class tinyBLAS_HP16_PPC {
|
||||
}
|
||||
}
|
||||
|
||||
template<typename TA>
|
||||
template<int RM, int RN>
|
||||
inline void kernel(int64_t ii, int64_t jj) {
|
||||
if constexpr(RM == 4 && RN == 8) {
|
||||
KERNEL_4x8(ii,jj);
|
||||
} else if constexpr(RM == 8 && RN == 4) {
|
||||
KERNEL_8x4(ii,jj);
|
||||
} else if constexpr(RM == 8 && RN == 8) {
|
||||
KERNEL_8x8(ii,jj);
|
||||
} else {
|
||||
assert(false && "RN/RM values not supported");
|
||||
}
|
||||
}
|
||||
|
||||
template <int RM, int RN>
|
||||
NOINLINE void tinyBLAS_Q0_PPC<TA>::gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
|
||||
NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
|
||||
int64_t ytiles = (m - m0) / RM;
|
||||
int64_t xtiles = (n - n0) / RN;
|
||||
int64_t tiles = xtiles * ytiles;
|
||||
@@ -2814,12 +3158,20 @@ class tinyBLAS_HP16_PPC {
|
||||
for (int64_t job = start; job < end; ++job) {
|
||||
int64_t ii = m0 + job / xtiles * RM;
|
||||
int64_t jj = n0 + job % xtiles * RN;
|
||||
this->kernel<RM, RN>(ii, jj);
|
||||
kernel<RM, RN>(ii, jj);
|
||||
}
|
||||
}
|
||||
|
||||
template class tinyBLAS_Q0_PPC<block_q4_0>;
|
||||
template class tinyBLAS_Q0_PPC<block_q8_0>;
|
||||
const TA * const A;
|
||||
const block_q8_0 * const B;
|
||||
float * C;
|
||||
const int64_t k;
|
||||
int64_t kc;
|
||||
const int64_t lda;
|
||||
const int64_t ldb;
|
||||
const int64_t ldc;
|
||||
const int ith;
|
||||
const int nth;
|
||||
};
|
||||
|
||||
class tinyBLAS_PPC {
|
||||
public:
|
||||
|
||||
@@ -1186,8 +1186,10 @@ static void launch_fattn_tile_switch_ncols2(ggml_backend_cuda_context & ctx, ggm
|
||||
GGML_ASSERT(Q->ne[2] % K->ne[2] == 0);
|
||||
const int gqa_ratio = Q->ne[2] / K->ne[2];
|
||||
|
||||
// On NVIDIA (Pascal and older) the GQA optimizations seem to be detrimental in some cases.
|
||||
// However, for DKQ == 576, DV == 512 only the kernel variant with GQA optimizations is implemented.
|
||||
const bool nvidia = GGML_CUDA_CC_IS_NVIDIA(ggml_cuda_info().devices[ggml_cuda_get_device()].cc);
|
||||
const int gqa_limit = nvidia && gqa_ratio <= 4 ? 16 : INT_MAX;
|
||||
const int gqa_limit = nvidia && gqa_ratio <= 4 && DV <= 256 ? 16 : INT_MAX;
|
||||
const bool use_gqa_opt = mask && max_bias == 0.0f && Q->ne[1] <= gqa_limit && K->ne[1] % FATTN_KQ_STRIDE == 0;
|
||||
|
||||
if constexpr (DV == 512) {
|
||||
|
||||
@@ -57,6 +57,8 @@ layout (push_constant) uniform parameter
|
||||
uint nbi1;
|
||||
uint ne11;
|
||||
#else
|
||||
uint base_work_group_z;
|
||||
uint num_batches;
|
||||
uint k_split;
|
||||
uint ne02;
|
||||
uint ne12;
|
||||
@@ -108,7 +110,7 @@ void main() {
|
||||
const uint ic = gl_WorkGroupID.y;
|
||||
|
||||
#ifdef MUL_MAT_ID
|
||||
const uint expert_idx = gl_GlobalInvocationID.z;
|
||||
const uint expert_idx = gl_WorkGroupID.z;
|
||||
if (ic * BN >= data_expert_count[expert_idx]) {
|
||||
return;
|
||||
}
|
||||
@@ -118,7 +120,7 @@ void main() {
|
||||
#endif
|
||||
|
||||
#ifndef MUL_MAT_ID
|
||||
const uint batch_idx = gl_GlobalInvocationID.z;
|
||||
const uint batch_idx = gl_WorkGroupID.z + p.base_work_group_z;
|
||||
|
||||
const uint i13 = batch_idx / p.ne12;
|
||||
const uint i12 = batch_idx % p.ne12;
|
||||
@@ -276,7 +278,7 @@ void main() {
|
||||
const uint dc = ic * BN + warp_c * WN;
|
||||
|
||||
#ifndef MUL_MAT_ID
|
||||
const uint offsets = batch_idx * p.batch_stride_d + ik * p.batch_stride_d * gl_NumWorkGroups.z;
|
||||
const uint offsets = batch_idx * p.batch_stride_d + ik * p.batch_stride_d * p.num_batches;
|
||||
#endif
|
||||
|
||||
[[unroll]] for (uint wsic = 0; wsic < WNITER; wsic++) {
|
||||
|
||||
@@ -2008,6 +2008,14 @@ static std::optional<webgpu_command> ggml_webgpu_encode_node(webgpu_context ctx,
|
||||
return ggml_webgpu_unary_op(ctx, src0, node);
|
||||
case GGML_OP_LOG:
|
||||
return ggml_webgpu_unary_op(ctx, src0, node);
|
||||
case GGML_OP_SQR:
|
||||
return ggml_webgpu_unary_op(ctx, src0, node);
|
||||
case GGML_OP_SQRT:
|
||||
return ggml_webgpu_unary_op(ctx, src0, node);
|
||||
case GGML_OP_SIN:
|
||||
return ggml_webgpu_unary_op(ctx, src0, node);
|
||||
case GGML_OP_COS:
|
||||
return ggml_webgpu_unary_op(ctx, src0, node);
|
||||
case GGML_OP_PAD:
|
||||
return ggml_webgpu_pad(ctx, src0, node);
|
||||
case GGML_OP_ARGMAX:
|
||||
@@ -2967,6 +2975,18 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
|
||||
case GGML_OP_LOG:
|
||||
supports_op = (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && (src0->type == op->type);
|
||||
break;
|
||||
case GGML_OP_SQR:
|
||||
supports_op = (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && (src0->type == op->type);
|
||||
break;
|
||||
case GGML_OP_SQRT:
|
||||
supports_op = (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && (src0->type == op->type);
|
||||
break;
|
||||
case GGML_OP_SIN:
|
||||
supports_op = (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && (src0->type == op->type);
|
||||
break;
|
||||
case GGML_OP_COS:
|
||||
supports_op = (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && (src0->type == op->type);
|
||||
break;
|
||||
case GGML_OP_PAD:
|
||||
supports_op = op->type == GGML_TYPE_F32 && src0->type == GGML_TYPE_F32;
|
||||
break;
|
||||
|
||||
@@ -170,6 +170,20 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
|
||||
#ifdef TRUNC
|
||||
let res = trunc(src[params.offset_src + src_idx]);
|
||||
#endif
|
||||
#ifdef SQR
|
||||
let res = src[params.offset_src + src_idx] * src[params.offset_src + src_idx];
|
||||
#endif
|
||||
#ifdef SQRT
|
||||
let res = sqrt(src[params.offset_src + src_idx]);
|
||||
#endif
|
||||
#ifdef SIN
|
||||
let res_f32 = sin(f32(src[params.offset_src + src_idx]));
|
||||
let res = TYPE(res_f32);
|
||||
#endif
|
||||
#ifdef COS
|
||||
let res_f32 = cos(f32(src[params.offset_src + src_idx]));
|
||||
let res = TYPE(res_f32);
|
||||
#endif
|
||||
|
||||
#ifdef INPLACE
|
||||
src[params.offset_src + src_idx] = res;
|
||||
|
||||
@@ -435,6 +435,7 @@ class MODEL_ARCH(IntEnum):
|
||||
T5 = auto()
|
||||
T5ENCODER = auto()
|
||||
JAIS = auto()
|
||||
JAIS2 = auto()
|
||||
NEMOTRON = auto()
|
||||
NEMOTRON_H = auto()
|
||||
NEMOTRON_H_MOE = auto()
|
||||
@@ -472,6 +473,7 @@ class MODEL_ARCH(IntEnum):
|
||||
RND1 = auto()
|
||||
PANGU_EMBED = auto()
|
||||
MISTRAL3 = auto()
|
||||
PADDLEOCR = auto()
|
||||
MIMO2 = auto()
|
||||
STEP35 = auto()
|
||||
LLAMA_EMBED = auto()
|
||||
@@ -652,6 +654,7 @@ class MODEL_TENSOR(IntEnum):
|
||||
ENC_OUTPUT_NORM = auto()
|
||||
CLS = auto() # classifier
|
||||
CLS_OUT = auto() # classifier output projection
|
||||
CLS_NORM = auto()
|
||||
CONV1D = auto()
|
||||
CONVNEXT_DW = auto()
|
||||
CONVNEXT_NORM = auto()
|
||||
@@ -873,6 +876,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.T5: "t5",
|
||||
MODEL_ARCH.T5ENCODER: "t5encoder",
|
||||
MODEL_ARCH.JAIS: "jais",
|
||||
MODEL_ARCH.JAIS2: "jais2",
|
||||
MODEL_ARCH.NEMOTRON: "nemotron",
|
||||
MODEL_ARCH.NEMOTRON_H: "nemotron_h",
|
||||
MODEL_ARCH.NEMOTRON_H_MOE: "nemotron_h_moe",
|
||||
@@ -911,6 +915,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.RND1: "rnd1",
|
||||
MODEL_ARCH.PANGU_EMBED: "pangu-embedded",
|
||||
MODEL_ARCH.MISTRAL3: "mistral3",
|
||||
MODEL_ARCH.PADDLEOCR: "paddleocr",
|
||||
MODEL_ARCH.MIMO2: "mimo2",
|
||||
MODEL_ARCH.STEP35: "step35",
|
||||
MODEL_ARCH.LLAMA_EMBED: "llama-embed",
|
||||
@@ -1088,6 +1093,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||
MODEL_TENSOR.ENC_OUTPUT_NORM: "enc.output_norm",
|
||||
MODEL_TENSOR.CLS: "cls",
|
||||
MODEL_TENSOR.CLS_OUT: "cls.output",
|
||||
MODEL_TENSOR.CLS_NORM: "cls.norm",
|
||||
MODEL_TENSOR.CONV1D: "conv1d",
|
||||
MODEL_TENSOR.CONVNEXT_DW: "convnext.{bid}.dw",
|
||||
MODEL_TENSOR.CONVNEXT_NORM: "convnext.{bid}.norm",
|
||||
@@ -1507,6 +1513,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.CLS,
|
||||
MODEL_TENSOR.CLS_OUT,
|
||||
MODEL_TENSOR.CLS_NORM,
|
||||
],
|
||||
MODEL_ARCH.NOMIC_BERT: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
@@ -2814,6 +2821,19 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.FFN_GATE,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
MODEL_ARCH.JAIS2: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
MODEL_ARCH.NEMOTRON: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
@@ -3168,6 +3188,20 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
MODEL_ARCH.PADDLEOCR: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_GATE,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
MODEL_ARCH.FALCON_H1: [
|
||||
# Token embedding
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
@@ -3829,6 +3863,7 @@ class VisionProjectorType:
|
||||
VOXTRAL = "voxtral"
|
||||
LFM2 = "lfm2"
|
||||
KIMIVL = "kimivl"
|
||||
PADDLEOCR = "paddleocr"
|
||||
KIMIK25 = "kimik25"
|
||||
LIGHTONOCR = "lightonocr"
|
||||
COGVLM = "cogvlm"
|
||||
|
||||
@@ -1240,6 +1240,10 @@ class TensorNameMap:
|
||||
MODEL_TENSOR.CLS_OUT: (
|
||||
"classifier.out_proj", # roberta
|
||||
),
|
||||
|
||||
MODEL_TENSOR.CLS_NORM: (
|
||||
"head.norm", # modern-bert
|
||||
),
|
||||
#############################################################################
|
||||
|
||||
MODEL_TENSOR.CONVNEXT_DW: (
|
||||
@@ -1321,6 +1325,7 @@ class TensorNameMap:
|
||||
"multi_modal_projector.linear_{bid}",
|
||||
"mm_projector.proj.linear_{bid}", # Kimi-K2.5
|
||||
"visual.merger.mlp.{bid}", # qwen2vl
|
||||
"mlp_AR.linear_{bid}", # PaddleOCR-VL
|
||||
"merger.mlp.{bid}",
|
||||
),
|
||||
|
||||
@@ -1570,6 +1575,7 @@ class TensorNameMap:
|
||||
"mm_projector.pre_norm", # Kimi-K2.5
|
||||
"pre_mm_projector_norm",
|
||||
"model.vision.linear_proj.norm1", # cogvlm
|
||||
"mlp_AR.pre_norm", # PaddleOCR-VL
|
||||
"merger.ln_q",
|
||||
),
|
||||
|
||||
@@ -1595,6 +1601,7 @@ class TensorNameMap:
|
||||
|
||||
MODEL_TENSOR.V_RESMPL_ATTN_OUT: (
|
||||
"resampler.attn.out_proj",
|
||||
"model.vision_model.head.attention.out_proj",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_RESMPL_KV: (
|
||||
|
||||
@@ -389,6 +389,7 @@ extern "C" {
|
||||
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
|
||||
bool pure; // quantize all tensors to the default type
|
||||
bool keep_split; // quantize to the same number of shards
|
||||
bool dry_run; // calculate and show the final quantization size without performing quantization
|
||||
void * imatrix; // pointer to importance matrix data
|
||||
void * kv_overrides; // pointer to vector containing overrides
|
||||
void * tensor_types; // pointer to vector containing tensor types
|
||||
|
||||
80
models/templates/stepfun-ai-Step-3.5-Flash.jinja
Normal file
80
models/templates/stepfun-ai-Step-3.5-Flash.jinja
Normal file
@@ -0,0 +1,80 @@
|
||||
{% macro render_content(content) %}{% if content is none %}{{- '' }}{% elif content is string %}{{- content }}{% elif content is mapping %}{{- content['value'] if 'value' in content else content['text'] }}{% elif content is iterable %}{% for item in content %}{% if item.type == 'text' %}{{- item['value'] if 'value' in item else item['text'] }}{% elif item.type == 'image' %}<im_patch>{% endif %}{% endfor %}{% endif %}{% endmacro %}
|
||||
{{bos_token}}{%- if tools %}
|
||||
{{- '<|im_start|>system\n' }}
|
||||
{%- if messages[0].role == 'system' %}
|
||||
{{- render_content(messages[0].content) + '\n\n' }}
|
||||
{%- endif %}
|
||||
{{- "# Tools\n\nYou have access to the following functions in JSONSchema format:\n\n<tools>" }}
|
||||
{%- for tool in tools %}
|
||||
{{- "\n" }}
|
||||
{{- tool | tojson(ensure_ascii=False) }}
|
||||
{%- endfor %}
|
||||
{{- "\n</tools>\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n<tool_call>\n<function=example_function_name>\n<parameter=example_parameter_1>\nvalue_1\n</parameter>\n<parameter=example_parameter_2>\nThis is the value for the second parameter\nthat can span\nmultiple lines\n</parameter>\n</function>\n</tool_call>\n\n<IMPORTANT>\nReminder:\n- Function calls MUST follow the specified format: an inner <function=...>\n...\n</function> block must be nested within <tool_call>\n...\n</tool_call> XML tags\n- Required parameters MUST be specified\n</IMPORTANT><|im_end|>\n" }}
|
||||
{%- else %}
|
||||
{%- if messages[0].role == 'system' %}
|
||||
{{- '<|im_start|>system\n' + render_content(messages[0].content) + '<|im_end|>\n' }}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
|
||||
{%- for message in messages[::-1] %}
|
||||
{%- set index = (messages|length - 1) - loop.index0 %}
|
||||
{%- if ns.multi_step_tool and message.role == "user" and render_content(message.content) is string and not(render_content(message.content).startswith('<tool_response>') and render_content(message.content).endswith('</tool_response>')) %}
|
||||
{%- set ns.multi_step_tool = false %}
|
||||
{%- set ns.last_query_index = index %}
|
||||
{%- endif %}
|
||||
{%- endfor %}
|
||||
{%- for message in messages %}
|
||||
{%- set content = render_content(message.content) %}
|
||||
{%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
|
||||
{%- set role_name = 'observation' if (message.role == "system" and not loop.first and message.name == 'observation') else message.role %}
|
||||
{{- '<|im_start|>' + role_name + '\n' + content + '<|im_end|>' + '\n' }}
|
||||
{%- elif message.role == "assistant" %}
|
||||
{%- if message.reasoning_content is string %}
|
||||
{%- set reasoning_content = render_content(message.reasoning_content) %}
|
||||
{%- else %}
|
||||
{%- if '</think>' in content %}
|
||||
{%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
|
||||
{%- set content = content.split('</think>')[-1].lstrip('\n') %}
|
||||
{%- else %}
|
||||
{%- set reasoning_content = '' %}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
{%- if loop.index0 > ns.last_query_index %}
|
||||
{{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content + '\n</think>\n' + content }}
|
||||
{%- else %}
|
||||
{{- '<|im_start|>' + message.role + '\n' + content }}
|
||||
{%- endif %}
|
||||
{%- if message.tool_calls %}
|
||||
{%- for tool_call in message.tool_calls %}
|
||||
{%- if tool_call.function is defined %}
|
||||
{%- set tool_call = tool_call.function %}
|
||||
{%- endif %}
|
||||
{{- '<tool_call>\n<function=' + tool_call.name + '>\n' }}
|
||||
{%- if tool_call.arguments is defined %}
|
||||
{%- set arguments = tool_call.arguments %}
|
||||
{%- for args_name, args_value in arguments|items %}
|
||||
{{- '<parameter=' + args_name + '>\n' }}
|
||||
{%- set args_value = args_value | tojson(ensure_ascii=False) | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}
|
||||
{{- args_value }}
|
||||
{{- '\n</parameter>\n' }}
|
||||
{%- endfor %}
|
||||
{%- endif %}
|
||||
{{- '</function>\n</tool_call>' }}
|
||||
{%- endfor %}
|
||||
{%- endif %}
|
||||
{{- '<|im_end|>\n' }}
|
||||
{%- elif message.role == "tool" %}
|
||||
{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
|
||||
{{- '<|im_start|>tool_response\n' }}
|
||||
{%- endif %}
|
||||
{{- '<tool_response>' }}
|
||||
{{- content }}
|
||||
{{- '</tool_response>' }}
|
||||
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
|
||||
{{- '<|im_end|>\n' }}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
{%- endfor %}
|
||||
{%- if add_generation_prompt %}
|
||||
{{- '<|im_start|>assistant\n<think>\n' }}
|
||||
{%- endif %}
|
||||
@@ -84,6 +84,7 @@ add_library(llama
|
||||
models/hunyuan-moe.cpp
|
||||
models/internlm2.cpp
|
||||
models/jais.cpp
|
||||
models/jais2.cpp
|
||||
models/jamba.cpp
|
||||
models/kimi-linear.cpp
|
||||
models/lfm2.cpp
|
||||
@@ -109,6 +110,7 @@ add_library(llama
|
||||
models/openai-moe-iswa.cpp
|
||||
models/openelm.cpp
|
||||
models/orion.cpp
|
||||
models/paddleocr.cpp
|
||||
models/pangu-embedded.cpp
|
||||
models/phi2.cpp
|
||||
models/phi3.cpp
|
||||
|
||||
@@ -79,6 +79,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_T5, "t5" },
|
||||
{ LLM_ARCH_T5ENCODER, "t5encoder" },
|
||||
{ LLM_ARCH_JAIS, "jais" },
|
||||
{ LLM_ARCH_JAIS2, "jais2" },
|
||||
{ LLM_ARCH_NEMOTRON, "nemotron" },
|
||||
{ LLM_ARCH_NEMOTRON_H, "nemotron_h" },
|
||||
{ LLM_ARCH_NEMOTRON_H_MOE, "nemotron_h_moe" },
|
||||
@@ -120,6 +121,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_RND1, "rnd1" },
|
||||
{ LLM_ARCH_PANGU_EMBED, "pangu-embedded" },
|
||||
{ LLM_ARCH_MISTRAL3, "mistral3" },
|
||||
{ LLM_ARCH_PADDLEOCR, "paddleocr" },
|
||||
{ LLM_ARCH_MIMO2, "mimo2" },
|
||||
{ LLM_ARCH_STEP35, "step35" },
|
||||
{ LLM_ARCH_LLAMA_EMBED, "llama-embed" },
|
||||
@@ -367,6 +369,7 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
|
||||
{ LLM_TENSOR_TOKEN_TYPES, "token_types" },
|
||||
{ LLM_TENSOR_CLS, "cls" },
|
||||
{ LLM_TENSOR_CLS_OUT, "cls.output" },
|
||||
{ LLM_TENSOR_CLS_NORM, "cls.norm" },
|
||||
{ LLM_TENSOR_ENC_OUTPUT_NORM, "enc.output_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
|
||||
{ LLM_TENSOR_SSM_A_NOSCAN, "blk.%d.ssm_a" },
|
||||
@@ -737,6 +740,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
|
||||
case LLM_ARCH_INTERNLM2:
|
||||
case LLM_ARCH_GRANITE:
|
||||
case LLM_ARCH_ERNIE4_5:
|
||||
case LLM_ARCH_PADDLEOCR:
|
||||
case LLM_ARCH_SMOLLM3:
|
||||
case LLM_ARCH_DREAM:
|
||||
case LLM_ARCH_LLADA:
|
||||
@@ -828,6 +832,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
|
||||
LLM_TENSOR_FFN_NORM,
|
||||
LLM_TENSOR_CLS,
|
||||
LLM_TENSOR_CLS_OUT,
|
||||
LLM_TENSOR_CLS_NORM,
|
||||
};
|
||||
case LLM_ARCH_JINA_BERT_V2:
|
||||
return {
|
||||
@@ -1789,6 +1794,20 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
|
||||
LLM_TENSOR_FFN_GATE,
|
||||
LLM_TENSOR_FFN_DOWN,
|
||||
};
|
||||
case LLM_ARCH_JAIS2:
|
||||
return {
|
||||
LLM_TENSOR_TOKEN_EMBD,
|
||||
LLM_TENSOR_OUTPUT_NORM,
|
||||
LLM_TENSOR_OUTPUT,
|
||||
LLM_TENSOR_ATTN_NORM,
|
||||
LLM_TENSOR_ATTN_Q,
|
||||
LLM_TENSOR_ATTN_K,
|
||||
LLM_TENSOR_ATTN_V,
|
||||
LLM_TENSOR_ATTN_OUT,
|
||||
LLM_TENSOR_FFN_NORM,
|
||||
LLM_TENSOR_FFN_UP,
|
||||
LLM_TENSOR_FFN_DOWN,
|
||||
};
|
||||
case LLM_ARCH_NEMOTRON_H:
|
||||
return {
|
||||
LLM_TENSOR_TOKEN_EMBD,
|
||||
@@ -2518,6 +2537,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
||||
{LLM_TENSOR_OUTPUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_CLS, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_CLS_OUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_CLS_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
|
||||
{LLM_TENSOR_DENSE_2_OUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, // Dense layer output
|
||||
{LLM_TENSOR_DENSE_3_OUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, // Dense layer output
|
||||
{LLM_TENSOR_OUTPUT_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
|
||||
|
||||
@@ -83,6 +83,7 @@ enum llm_arch {
|
||||
LLM_ARCH_T5,
|
||||
LLM_ARCH_T5ENCODER,
|
||||
LLM_ARCH_JAIS,
|
||||
LLM_ARCH_JAIS2,
|
||||
LLM_ARCH_NEMOTRON,
|
||||
LLM_ARCH_NEMOTRON_H,
|
||||
LLM_ARCH_NEMOTRON_H_MOE,
|
||||
@@ -124,6 +125,7 @@ enum llm_arch {
|
||||
LLM_ARCH_RND1,
|
||||
LLM_ARCH_PANGU_EMBED,
|
||||
LLM_ARCH_MISTRAL3,
|
||||
LLM_ARCH_PADDLEOCR,
|
||||
LLM_ARCH_MIMO2,
|
||||
LLM_ARCH_STEP35,
|
||||
LLM_ARCH_LLAMA_EMBED,
|
||||
@@ -497,6 +499,7 @@ enum llm_tensor {
|
||||
LLM_TENSOR_ENC_OUTPUT_NORM,
|
||||
LLM_TENSOR_CLS,
|
||||
LLM_TENSOR_CLS_OUT,
|
||||
LLM_TENSOR_CLS_NORM,
|
||||
LLM_TENSOR_CONV1D,
|
||||
LLM_TENSOR_CONVNEXT_DW,
|
||||
LLM_TENSOR_CONVNEXT_NORM,
|
||||
|
||||
@@ -712,8 +712,6 @@ int64_t llama_context::output_resolve_row(int32_t i) const {
|
||||
}
|
||||
|
||||
float * llama_context::get_logits_ith(int32_t i) {
|
||||
int64_t j = -1;
|
||||
|
||||
output_reorder();
|
||||
|
||||
try {
|
||||
@@ -721,26 +719,7 @@ float * llama_context::get_logits_ith(int32_t i) {
|
||||
throw std::runtime_error("no logits");
|
||||
}
|
||||
|
||||
// TODO: use output_resolve_row()
|
||||
if (i < 0) {
|
||||
j = n_outputs + i;
|
||||
if (j < 0) {
|
||||
throw std::runtime_error(format("negative index out of range [0, %d)", n_outputs));
|
||||
}
|
||||
} else if ((size_t) i >= output_ids.size()) {
|
||||
throw std::runtime_error(format("out of range [0, %zu)", output_ids.size()));
|
||||
} else {
|
||||
j = output_ids[i];
|
||||
}
|
||||
|
||||
if (j < 0) {
|
||||
throw std::runtime_error(format("batch.logits[%d] != true", i));
|
||||
}
|
||||
if (j >= n_outputs) {
|
||||
// This should not happen
|
||||
throw std::runtime_error(format("corrupt output buffer (j=%" PRId64 ", n_outputs=%d)", j, n_outputs));
|
||||
}
|
||||
|
||||
const int64_t j = output_resolve_row(i);
|
||||
return logits.data + j*model.vocab.n_tokens();
|
||||
} catch (const std::exception & err) {
|
||||
LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
|
||||
@@ -763,8 +742,6 @@ llama_token * llama_context::get_sampled_tokens() const{
|
||||
}
|
||||
|
||||
float * llama_context::get_embeddings_ith(int32_t i) {
|
||||
int64_t j = -1;
|
||||
|
||||
output_reorder();
|
||||
|
||||
try {
|
||||
@@ -772,26 +749,7 @@ float * llama_context::get_embeddings_ith(int32_t i) {
|
||||
throw std::runtime_error("no embeddings");
|
||||
}
|
||||
|
||||
// TODO: use output_resolve_row()
|
||||
if (i < 0) {
|
||||
j = n_outputs + i;
|
||||
if (j < 0) {
|
||||
throw std::runtime_error(format("negative index out of range [0, %d)", n_outputs));
|
||||
}
|
||||
} else if ((size_t) i >= output_ids.size()) {
|
||||
throw std::runtime_error(format("out of range [0, %zu)", output_ids.size()));
|
||||
} else {
|
||||
j = output_ids[i];
|
||||
}
|
||||
|
||||
if (j < 0) {
|
||||
throw std::runtime_error(format("batch.logits[%d] != true", i));
|
||||
}
|
||||
if (j >= n_outputs) {
|
||||
// This should not happen
|
||||
throw std::runtime_error(format("corrupt output buffer (j=%" PRId64 ", n_outputs=%d)", j, n_outputs));
|
||||
}
|
||||
|
||||
const int64_t j = output_resolve_row(i);
|
||||
const uint32_t n_embd_out = model.hparams.n_embd_out();
|
||||
return embd.data + j*n_embd_out;
|
||||
} catch (const std::exception & err) {
|
||||
@@ -2761,6 +2719,7 @@ void llama_context::opt_init(struct llama_model * model, struct llama_opt_params
|
||||
llama_set_param(model->cls_b, param_filter, param_filter_ud);
|
||||
llama_set_param(model->cls_out, param_filter, param_filter_ud);
|
||||
llama_set_param(model->cls_out_b, param_filter, param_filter_ud);
|
||||
llama_set_param(model->cls_norm, param_filter, param_filter_ud);
|
||||
|
||||
for (struct llama_layer & layer : model->layers) {
|
||||
for (size_t i = 0; i < sizeof(layer)/sizeof(struct ggml_tensor *); ++i) {
|
||||
|
||||
@@ -185,7 +185,10 @@ bool llm_graph_input_out_ids::can_reuse(const llm_graph_params & params) {
|
||||
}
|
||||
|
||||
void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) {
|
||||
if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
|
||||
if (cparams.embeddings &&
|
||||
(cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN ||
|
||||
cparams.pooling_type == LLAMA_POOLING_TYPE_RANK )) {
|
||||
|
||||
const int64_t n_tokens = ubatch->n_tokens;
|
||||
const int64_t n_seq_tokens = ubatch->n_seq_tokens;
|
||||
const int64_t n_seqs_unq = ubatch->n_seqs_unq;
|
||||
@@ -1125,8 +1128,8 @@ ggml_tensor * llm_graph_context::build_ffn(
|
||||
|
||||
if (down) {
|
||||
cur = build_lora_mm(down, cur);
|
||||
if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) {
|
||||
// GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators
|
||||
if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_JAIS2) {
|
||||
// GLM4, GLM4_MOE, and JAIS2 seem to have numerical issues with half-precision accumulators
|
||||
ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
|
||||
}
|
||||
}
|
||||
@@ -1721,7 +1724,8 @@ ggml_tensor * llm_graph_context::build_attn_mha(
|
||||
|
||||
ggml_tensor * cur;
|
||||
|
||||
if (cparams.flash_attn && kq_b == nullptr) {
|
||||
const bool use_flash_attn = cparams.flash_attn && kq_b == nullptr;
|
||||
if (use_flash_attn) {
|
||||
GGML_ASSERT(kq_b == nullptr && "Flash attention does not support KQ bias yet");
|
||||
|
||||
if (v_trans) {
|
||||
@@ -1981,8 +1985,8 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||
|
||||
if (wo) {
|
||||
cur = build_lora_mm(wo, cur);
|
||||
if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) {
|
||||
// GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators
|
||||
if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_JAIS2) {
|
||||
// GLM4, GLM4_MOE, and JAIS2 seem to have numerical issues with half-precision accumulators
|
||||
ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
|
||||
}
|
||||
}
|
||||
@@ -2414,8 +2418,9 @@ llm_graph_input_mem_hybrid_iswa * llm_graph_context::build_inp_mem_hybrid_iswa()
|
||||
|
||||
void llm_graph_context::build_dense_out(
|
||||
ggml_tensor * dense_2,
|
||||
ggml_tensor * dense_2_b,
|
||||
ggml_tensor * dense_3) const {
|
||||
if (!cparams.embeddings || !(dense_2 || dense_3)) {
|
||||
if (!cparams.embeddings || !(dense_2 || dense_2_b || dense_3)) {
|
||||
return;
|
||||
}
|
||||
ggml_tensor * cur = res->t_embd_pooled != nullptr ? res->t_embd_pooled : res->t_embd;
|
||||
@@ -2424,6 +2429,9 @@ void llm_graph_context::build_dense_out(
|
||||
if (dense_2) {
|
||||
cur = ggml_mul_mat(ctx0, dense_2, cur);
|
||||
}
|
||||
if (dense_2_b) {
|
||||
cur = ggml_add(ctx0, cur, dense_2_b);
|
||||
}
|
||||
if (dense_3) {
|
||||
cur = ggml_mul_mat(ctx0, dense_3, cur);
|
||||
}
|
||||
@@ -2437,7 +2445,8 @@ void llm_graph_context::build_pooling(
|
||||
ggml_tensor * cls,
|
||||
ggml_tensor * cls_b,
|
||||
ggml_tensor * cls_out,
|
||||
ggml_tensor * cls_out_b) const {
|
||||
ggml_tensor * cls_out_b,
|
||||
ggml_tensor * cls_norm) const {
|
||||
if (!cparams.embeddings) {
|
||||
return;
|
||||
}
|
||||
@@ -2476,8 +2485,15 @@ void llm_graph_context::build_pooling(
|
||||
} break;
|
||||
case LLAMA_POOLING_TYPE_RANK:
|
||||
{
|
||||
ggml_tensor * inp_cls = build_inp_cls();
|
||||
cur = ggml_get_rows(ctx0, inp, inp_cls);
|
||||
if (arch == LLM_ARCH_MODERN_BERT) {
|
||||
// modern bert gte reranker builds mean first then applies prediction head and classifier
|
||||
// https://github.com/huggingface/transformers/blob/main/src/transformers/models/modernbert/modular_modernbert.py#L1404-1411
|
||||
ggml_tensor * inp_mean = build_inp_mean();
|
||||
cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, inp)), inp_mean);
|
||||
} else {
|
||||
ggml_tensor * inp_cls = build_inp_cls();
|
||||
cur = ggml_get_rows(ctx0, inp, inp_cls);
|
||||
}
|
||||
|
||||
// classification head
|
||||
// https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
|
||||
@@ -2486,7 +2502,15 @@ void llm_graph_context::build_pooling(
|
||||
if (cls_b) {
|
||||
cur = ggml_add(ctx0, cur, cls_b);
|
||||
}
|
||||
cur = ggml_tanh(ctx0, cur);
|
||||
if (arch == LLM_ARCH_MODERN_BERT) {
|
||||
cur = ggml_gelu(ctx0, cur);
|
||||
} else {
|
||||
cur = ggml_tanh(ctx0, cur);
|
||||
}
|
||||
if (cls_norm) {
|
||||
// head norm
|
||||
cur = build_norm(cur, cls_norm, NULL, LLM_NORM, -1);
|
||||
}
|
||||
}
|
||||
|
||||
// some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
|
||||
|
||||
@@ -1000,7 +1000,8 @@ struct llm_graph_context {
|
||||
ggml_tensor * cls,
|
||||
ggml_tensor * cls_b,
|
||||
ggml_tensor * cls_out,
|
||||
ggml_tensor * cls_out_b) const;
|
||||
ggml_tensor * cls_out_b,
|
||||
ggml_tensor * cls_norm) const;
|
||||
|
||||
//
|
||||
// sampling (backend sampling)
|
||||
@@ -1014,6 +1015,7 @@ struct llm_graph_context {
|
||||
|
||||
void build_dense_out(
|
||||
ggml_tensor * dense_2,
|
||||
ggml_tensor * dense_2_b,
|
||||
ggml_tensor * dense_3) const;
|
||||
};
|
||||
|
||||
|
||||
@@ -109,9 +109,9 @@ std::string llama_format_tensor_shape(const std::vector<int64_t> & ne) {
|
||||
|
||||
std::string llama_format_tensor_shape(const struct ggml_tensor * t) {
|
||||
char buf[256];
|
||||
snprintf(buf, sizeof(buf), "%5" PRId64, t->ne[0]);
|
||||
snprintf(buf, sizeof(buf), "%6" PRId64, t->ne[0]);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5" PRId64, t->ne[i]);
|
||||
snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %6" PRId64, t->ne[i]);
|
||||
}
|
||||
return buf;
|
||||
}
|
||||
|
||||
@@ -271,6 +271,7 @@ void llama_model_saver::add_tensors_from_model() {
|
||||
add_tensor(model.cls_b);
|
||||
add_tensor(model.cls_out);
|
||||
add_tensor(model.cls_out_b);
|
||||
add_tensor(model.cls_norm);
|
||||
|
||||
for (const struct llama_layer & layer : model.layers) {
|
||||
for (size_t i = 0; i < sizeof(layer)/sizeof(struct ggml_tensor *); ++i) {
|
||||
|
||||
@@ -908,7 +908,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
|
||||
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
|
||||
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
|
||||
hparams.set_swa_pattern(swa_period);
|
||||
hparams.set_swa_pattern(swa_period, true);
|
||||
} else {
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
|
||||
}
|
||||
@@ -1937,6 +1937,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_JAIS2:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 32: type = LLM_TYPE_8B; break;
|
||||
case 68: type = LLM_TYPE_70B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_NEMOTRON:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
@@ -2234,7 +2244,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
} break;
|
||||
case LLM_ARCH_ERNIE4_5:
|
||||
case LLM_ARCH_ERNIE4_5_MOE:
|
||||
case LLM_ARCH_PADDLEOCR:
|
||||
{
|
||||
// paddleocr need mrope_section
|
||||
ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
|
||||
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
if (arch == LLM_ARCH_ERNIE4_5_MOE) {
|
||||
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
||||
@@ -2348,6 +2362,12 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
case 10752: type = LLM_TYPE_2_6B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
if (const auto is_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); is_swa && hparams.n_swa > 0) {
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
||||
for (uint32_t il = 0; il < hparams.n_layer; ++il) {
|
||||
hparams.swa_layers[il] = !hparams.recurrent_layer_arr[il];
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_LFM2MOE:
|
||||
{
|
||||
@@ -3513,9 +3533,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
}
|
||||
|
||||
cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
|
||||
cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
|
||||
cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
|
||||
cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
|
||||
cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
|
||||
cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
|
||||
cls_norm = create_tensor(tn(LLM_TENSOR_CLS_NORM, "weight"), {n_embd}, TENSOR_NOT_REQUIRED);
|
||||
|
||||
} break;
|
||||
case LLM_ARCH_NEO_BERT:
|
||||
@@ -5368,6 +5389,45 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_JAIS2:
|
||||
{
|
||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
|
||||
// output
|
||||
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
||||
output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
|
||||
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
||||
if (!output) {
|
||||
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
||||
}
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
auto & layer = layers[i];
|
||||
|
||||
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
||||
layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
|
||||
|
||||
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
||||
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
||||
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
||||
|
||||
// attention biases - all have shape n_embd (output dimension of projections)
|
||||
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
|
||||
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd}, 0);
|
||||
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd}, 0);
|
||||
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
|
||||
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
|
||||
|
||||
// Jais-2 uses simple MLP (no gate) with biases
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
|
||||
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_CHATGLM:
|
||||
{
|
||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
@@ -6575,6 +6635,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
} break;
|
||||
case LLM_ARCH_ERNIE4_5:
|
||||
case LLM_ARCH_ERNIE4_5_MOE:
|
||||
case LLM_ARCH_PADDLEOCR:
|
||||
{
|
||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
|
||||
@@ -6895,7 +6956,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
}
|
||||
|
||||
// for LFM2-ColBert-350M
|
||||
dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.n_embd_out()}, TENSOR_NOT_REQUIRED);
|
||||
dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.n_embd_out()}, TENSOR_NOT_REQUIRED);
|
||||
dense_2_out_layers_b = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "bias"), {hparams.n_embd_out() }, TENSOR_NOT_REQUIRED);
|
||||
} break;
|
||||
case LLM_ARCH_SMALLTHINKER:
|
||||
{
|
||||
@@ -8553,6 +8615,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
||||
{
|
||||
llm = std::make_unique<llm_build_jais>(*this, params);
|
||||
} break;
|
||||
case LLM_ARCH_JAIS2:
|
||||
{
|
||||
llm = std::make_unique<llm_build_jais2>(*this, params);
|
||||
} break;
|
||||
case LLM_ARCH_NEMOTRON:
|
||||
{
|
||||
llm = std::make_unique<llm_build_nemotron>(*this, params);
|
||||
@@ -8648,6 +8714,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
||||
{
|
||||
llm = std::make_unique<llm_build_ernie4_5_moe>(*this, params);
|
||||
} break;
|
||||
case LLM_ARCH_PADDLEOCR:
|
||||
{
|
||||
llm = std::make_unique<llm_build_paddleocr>(*this, params);
|
||||
} break;
|
||||
case LLM_ARCH_HUNYUAN_MOE:
|
||||
{
|
||||
llm = std::make_unique<llm_build_hunyuan_moe>(*this, params);
|
||||
@@ -8671,7 +8741,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
||||
case LLM_ARCH_LFM2:
|
||||
case LLM_ARCH_LFM2MOE:
|
||||
{
|
||||
llm = std::make_unique<llm_build_lfm2>(*this, params);
|
||||
if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
|
||||
llm = std::make_unique<llm_build_lfm2<true>>(*this, params);
|
||||
} else {
|
||||
llm = std::make_unique<llm_build_lfm2<false>>(*this, params);
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_SMALLTHINKER:
|
||||
{
|
||||
@@ -8734,7 +8808,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
||||
}
|
||||
|
||||
// add on pooling layer
|
||||
llm->build_pooling(cls, cls_b, cls_out, cls_out_b);
|
||||
llm->build_pooling(cls, cls_b, cls_out, cls_out_b, cls_norm);
|
||||
|
||||
// add backend sampling layers (if any)
|
||||
llm->build_sampling();
|
||||
@@ -8743,7 +8817,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
||||
// there will be two additional dense projection layers
|
||||
// dense linear projections are applied after pooling
|
||||
// TODO: move reranking logic here and generalize
|
||||
llm->build_dense_out(dense_2_out_layers, dense_3_out_layers);
|
||||
llm->build_dense_out(dense_2_out_layers, dense_2_out_layers_b, dense_3_out_layers);
|
||||
|
||||
llm->res->set_outputs();
|
||||
|
||||
@@ -8961,6 +9035,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
case LLM_ARCH_BAILINGMOE2:
|
||||
case LLM_ARCH_DOTS1:
|
||||
case LLM_ARCH_HUNYUAN_MOE:
|
||||
case LLM_ARCH_JAIS2:
|
||||
case LLM_ARCH_OPENAI_MOE:
|
||||
case LLM_ARCH_HUNYUAN_DENSE:
|
||||
case LLM_ARCH_LFM2:
|
||||
@@ -8979,6 +9054,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
return LLAMA_ROPE_TYPE_NEOX;
|
||||
|
||||
case LLM_ARCH_QWEN2VL:
|
||||
case LLM_ARCH_PADDLEOCR:
|
||||
return LLAMA_ROPE_TYPE_MROPE;
|
||||
case LLM_ARCH_QWEN3VL:
|
||||
case LLM_ARCH_QWEN3VLMOE:
|
||||
|
||||
@@ -475,6 +475,7 @@ struct llama_model {
|
||||
struct ggml_tensor * cls_b = nullptr;
|
||||
struct ggml_tensor * cls_out = nullptr;
|
||||
struct ggml_tensor * cls_out_b = nullptr;
|
||||
struct ggml_tensor * cls_norm = nullptr;
|
||||
|
||||
struct ggml_tensor * conv1d = nullptr;
|
||||
struct ggml_tensor * conv1d_b = nullptr;
|
||||
@@ -491,8 +492,9 @@ struct llama_model {
|
||||
//Dense linear projections for SentenceTransformers models like embeddinggemma
|
||||
// For Sentence Transformers models structure see
|
||||
// https://sbert.net/docs/sentence_transformer/usage/custom_models.html#structure-of-sentence-transformer-models
|
||||
struct ggml_tensor * dense_2_out_layers = nullptr;
|
||||
struct ggml_tensor * dense_3_out_layers = nullptr;
|
||||
struct ggml_tensor * dense_2_out_layers = nullptr;
|
||||
struct ggml_tensor * dense_2_out_layers_b = nullptr;
|
||||
struct ggml_tensor * dense_3_out_layers = nullptr;
|
||||
|
||||
// gguf metadata
|
||||
std::unordered_map<std::string, std::string> gguf_kv;
|
||||
|
||||
@@ -479,6 +479,17 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float *
|
||||
return new_size;
|
||||
}
|
||||
|
||||
static bool tensor_type_requires_imatrix(const ggml_tensor * t, const ggml_type dst_type, const llama_ftype ftype) {
|
||||
return (
|
||||
dst_type == GGML_TYPE_IQ2_XXS || dst_type == GGML_TYPE_IQ2_XS ||
|
||||
dst_type == GGML_TYPE_IQ3_XXS || dst_type == GGML_TYPE_IQ1_S ||
|
||||
dst_type == GGML_TYPE_IQ2_S || dst_type == GGML_TYPE_IQ1_M ||
|
||||
( // Q2_K_S is the worst k-quant type - only allow it without imatrix for token embeddings
|
||||
dst_type == GGML_TYPE_Q2_K && ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(t->name, "token_embd.weight") != 0
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
|
||||
ggml_type default_type;
|
||||
llama_ftype ftype = params->ftype;
|
||||
@@ -735,24 +746,36 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
||||
};
|
||||
|
||||
const auto tn = LLM_TN(model.arch);
|
||||
new_ofstream(0);
|
||||
|
||||
// no output file for --dry-run
|
||||
if (!params->dry_run) {
|
||||
new_ofstream(0);
|
||||
}
|
||||
|
||||
// flag for `--dry-run`, to let the user know if imatrix will be required for a real
|
||||
// quantization, as a courtesy
|
||||
bool will_require_imatrix = false;
|
||||
|
||||
for (const auto * it : tensors) {
|
||||
const auto & weight = *it;
|
||||
ggml_tensor * tensor = weight.tensor;
|
||||
if (weight.idx != cur_split && params->keep_split) {
|
||||
if (!params->dry_run && (weight.idx != cur_split && params->keep_split)) {
|
||||
close_ofstream();
|
||||
new_ofstream(weight.idx);
|
||||
}
|
||||
|
||||
const std::string name = ggml_get_name(tensor);
|
||||
const size_t tensor_size = ggml_nbytes(tensor);
|
||||
|
||||
if (!ml.use_mmap) {
|
||||
if (read_data.size() < ggml_nbytes(tensor)) {
|
||||
read_data.resize(ggml_nbytes(tensor));
|
||||
if (!params->dry_run) {
|
||||
if (!ml.use_mmap) {
|
||||
if (read_data.size() < tensor_size) {
|
||||
read_data.resize(tensor_size);
|
||||
}
|
||||
tensor->data = read_data.data();
|
||||
}
|
||||
tensor->data = read_data.data();
|
||||
ml.load_data_for(tensor);
|
||||
}
|
||||
ml.load_data_for(tensor);
|
||||
|
||||
LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
|
||||
++idx, ml.n_tensors,
|
||||
@@ -900,129 +923,155 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
||||
quantize = tensor->type != new_type;
|
||||
}
|
||||
|
||||
if (!quantize) {
|
||||
new_type = tensor->type;
|
||||
new_data = tensor->data;
|
||||
new_size = ggml_nbytes(tensor);
|
||||
LLAMA_LOG_INFO("size = %8.3f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0);
|
||||
} else {
|
||||
const int64_t nelements = ggml_nelements(tensor);
|
||||
|
||||
const float * imatrix = nullptr;
|
||||
if (imatrix_data) {
|
||||
auto it = imatrix_data->find(remap_imatrix(tensor->name, mapped));
|
||||
if (it == imatrix_data->end()) {
|
||||
LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
|
||||
} else {
|
||||
if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) {
|
||||
imatrix = it->second.data();
|
||||
} else {
|
||||
LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__,
|
||||
int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name);
|
||||
|
||||
// this can happen when quantizing an old mixtral model with split tensors with a new incompatible imatrix
|
||||
// this is a significant error and it may be good idea to abort the process if this happens,
|
||||
// since many people will miss the error and not realize that most of the model is being quantized without an imatrix
|
||||
// tok_embd should be ignored in this case, since it always causes this warning
|
||||
if (name != tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
|
||||
throw std::runtime_error(format("imatrix size %d is different from tensor size %d for %s",
|
||||
int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name));
|
||||
}
|
||||
}
|
||||
// we have now decided on the target type for this tensor
|
||||
if (params->dry_run) {
|
||||
// the --dry-run option calculates the final quantization size without quantizting
|
||||
if (quantize) {
|
||||
new_size = ggml_nrows(tensor) * ggml_row_size(new_type, tensor->ne[0]);
|
||||
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB (%s)\n",
|
||||
tensor_size/1024.0/1024.0,
|
||||
new_size/1024.0/1024.0,
|
||||
ggml_type_name(new_type));
|
||||
if (!will_require_imatrix && tensor_type_requires_imatrix(tensor, new_type, params->ftype)) {
|
||||
will_require_imatrix = true;
|
||||
}
|
||||
}
|
||||
if ((new_type == GGML_TYPE_IQ2_XXS ||
|
||||
new_type == GGML_TYPE_IQ2_XS ||
|
||||
new_type == GGML_TYPE_IQ2_S ||
|
||||
new_type == GGML_TYPE_IQ1_S ||
|
||||
(new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight")) ||
|
||||
(new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
|
||||
LLAMA_LOG_ERROR("\n\n============================================================\n");
|
||||
LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
|
||||
LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n");
|
||||
LLAMA_LOG_ERROR("============================================================\n\n");
|
||||
throw std::runtime_error(format("Missing importance matrix for tensor %s in a very low-bit quantization", tensor->name));
|
||||
}
|
||||
|
||||
float * f32_data;
|
||||
|
||||
if (tensor->type == GGML_TYPE_F32) {
|
||||
f32_data = (float *) tensor->data;
|
||||
} else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
|
||||
throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
|
||||
} else {
|
||||
llama_tensor_dequantize_impl(tensor, f32_conv_buf, workers, nelements, nthread);
|
||||
f32_data = (float *) f32_conv_buf.data();
|
||||
new_size = tensor_size;
|
||||
LLAMA_LOG_INFO("size = %8.3f MiB\n", new_size/1024.0/1024.0);
|
||||
}
|
||||
total_size_org += tensor_size;
|
||||
total_size_new += new_size;
|
||||
continue;
|
||||
} else {
|
||||
// no --dry-run, perform quantization
|
||||
if (!quantize) {
|
||||
new_type = tensor->type;
|
||||
new_data = tensor->data;
|
||||
new_size = tensor_size;
|
||||
LLAMA_LOG_INFO("size = %8.3f MiB\n", tensor_size/1024.0/1024.0);
|
||||
} else {
|
||||
const int64_t nelements = ggml_nelements(tensor);
|
||||
|
||||
LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
|
||||
fflush(stdout);
|
||||
const float * imatrix = nullptr;
|
||||
if (imatrix_data) {
|
||||
auto it = imatrix_data->find(remap_imatrix(tensor->name, mapped));
|
||||
if (it == imatrix_data->end()) {
|
||||
LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
|
||||
} else {
|
||||
if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) {
|
||||
imatrix = it->second.data();
|
||||
} else {
|
||||
LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__,
|
||||
int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name);
|
||||
|
||||
if (work.size() < (size_t)nelements * 4) {
|
||||
work.resize(nelements * 4); // upper bound on size
|
||||
}
|
||||
new_data = work.data();
|
||||
|
||||
const int64_t n_per_row = tensor->ne[0];
|
||||
const int64_t nrows = tensor->ne[1];
|
||||
|
||||
static const int64_t min_chunk_size = 32 * 512;
|
||||
const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row));
|
||||
|
||||
const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
|
||||
const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
|
||||
const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;
|
||||
|
||||
// quantize each expert separately since they have different importance matrices
|
||||
new_size = 0;
|
||||
for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
|
||||
const float * f32_data_03 = f32_data + i03 * nelements_matrix;
|
||||
void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
|
||||
const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
|
||||
|
||||
new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
|
||||
|
||||
// TODO: temporary sanity check that the F16 -> MXFP4 is lossless
|
||||
#if 0
|
||||
if (new_type == GGML_TYPE_MXFP4) {
|
||||
auto * x = f32_data_03;
|
||||
|
||||
//LLAMA_LOG_INFO("nrows = %d, n_per_row = %d\n", nrows, n_per_row);
|
||||
std::vector<float> deq(nrows*n_per_row);
|
||||
const ggml_type_traits * qtype = ggml_get_type_traits(new_type);
|
||||
qtype->to_float(new_data_03, deq.data(), deq.size());
|
||||
|
||||
double err = 0.0f;
|
||||
for (int i = 0; i < (int) deq.size(); ++i) {
|
||||
err += fabsf(deq[i] - x[i]);
|
||||
//if (fabsf(deq[i] - x[i]) > 0.00001 && i < 256) {
|
||||
if (deq[i] != x[i]) {
|
||||
LLAMA_LOG_INFO("deq[%d] = %f, x[%d] = %f\n", i, deq[i], i, x[i]);
|
||||
// this can happen when quantizing an old mixtral model with split tensors with a new incompatible imatrix
|
||||
// this is a significant error and it may be good idea to abort the process if this happens,
|
||||
// since many people will miss the error and not realize that most of the model is being quantized without an imatrix
|
||||
// tok_embd should be ignored in this case, since it always causes this warning
|
||||
if (name != tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
|
||||
throw std::runtime_error(format("imatrix size %d is different from tensor size %d for %s",
|
||||
int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name));
|
||||
}
|
||||
}
|
||||
}
|
||||
//LLAMA_LOG_INFO("err = %f\n", err);
|
||||
GGML_ASSERT(err == 0.00000);
|
||||
}
|
||||
if (!imatrix && tensor_type_requires_imatrix(tensor, new_type, params->ftype)) {
|
||||
LLAMA_LOG_ERROR("\n\n============================================================\n");
|
||||
LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
|
||||
LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n");
|
||||
LLAMA_LOG_ERROR("============================================================\n\n");
|
||||
throw std::runtime_error(format("Missing importance matrix for tensor %s in a very low-bit quantization", tensor->name));
|
||||
}
|
||||
|
||||
float * f32_data;
|
||||
|
||||
if (tensor->type == GGML_TYPE_F32) {
|
||||
f32_data = (float *) tensor->data;
|
||||
} else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
|
||||
throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
|
||||
} else {
|
||||
llama_tensor_dequantize_impl(tensor, f32_conv_buf, workers, nelements, nthread);
|
||||
f32_data = (float *) f32_conv_buf.data();
|
||||
}
|
||||
|
||||
LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
|
||||
fflush(stdout);
|
||||
|
||||
if (work.size() < (size_t)nelements * 4) {
|
||||
work.resize(nelements * 4); // upper bound on size
|
||||
}
|
||||
new_data = work.data();
|
||||
|
||||
const int64_t n_per_row = tensor->ne[0];
|
||||
const int64_t nrows = tensor->ne[1];
|
||||
|
||||
static const int64_t min_chunk_size = 32 * 512;
|
||||
const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row));
|
||||
|
||||
const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
|
||||
const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
|
||||
const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;
|
||||
|
||||
// quantize each expert separately since they have different importance matrices
|
||||
new_size = 0;
|
||||
for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
|
||||
const float * f32_data_03 = f32_data + i03 * nelements_matrix;
|
||||
void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
|
||||
const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
|
||||
|
||||
new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
|
||||
|
||||
// TODO: temporary sanity check that the F16 -> MXFP4 is lossless
|
||||
#if 0
|
||||
if (new_type == GGML_TYPE_MXFP4) {
|
||||
auto * x = f32_data_03;
|
||||
|
||||
//LLAMA_LOG_INFO("nrows = %d, n_per_row = %d\n", nrows, n_per_row);
|
||||
std::vector<float> deq(nrows*n_per_row);
|
||||
const ggml_type_traits * qtype = ggml_get_type_traits(new_type);
|
||||
qtype->to_float(new_data_03, deq.data(), deq.size());
|
||||
|
||||
double err = 0.0f;
|
||||
for (int i = 0; i < (int) deq.size(); ++i) {
|
||||
err += fabsf(deq[i] - x[i]);
|
||||
//if (fabsf(deq[i] - x[i]) > 0.00001 && i < 256) {
|
||||
if (deq[i] != x[i]) {
|
||||
LLAMA_LOG_INFO("deq[%d] = %f, x[%d] = %f\n", i, deq[i], i, x[i]);
|
||||
}
|
||||
}
|
||||
//LLAMA_LOG_INFO("err = %f\n", err);
|
||||
GGML_ASSERT(err == 0.00000);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", tensor_size/1024.0/1024.0, new_size/1024.0/1024.0);
|
||||
}
|
||||
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
|
||||
}
|
||||
total_size_org += ggml_nbytes(tensor);
|
||||
total_size_new += new_size;
|
||||
total_size_org += tensor_size;
|
||||
total_size_new += new_size;
|
||||
|
||||
// update the gguf meta data as we go
|
||||
gguf_set_tensor_type(ctx_outs[cur_split].get(), name.c_str(), new_type);
|
||||
GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), name.c_str())) == new_size);
|
||||
gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data);
|
||||
// update the gguf meta data as we go
|
||||
gguf_set_tensor_type(ctx_outs[cur_split].get(), name.c_str(), new_type);
|
||||
GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), name.c_str())) == new_size);
|
||||
gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data);
|
||||
|
||||
// write tensor data + padding
|
||||
fout.write((const char *) new_data, new_size);
|
||||
zeros(fout, GGML_PAD(new_size, align) - new_size);
|
||||
// write tensor data + padding
|
||||
fout.write((const char *) new_data, new_size);
|
||||
zeros(fout, GGML_PAD(new_size, align) - new_size);
|
||||
} // no --dry-run
|
||||
} // iterate over tensors
|
||||
|
||||
if (!params->dry_run) {
|
||||
close_ofstream();
|
||||
}
|
||||
close_ofstream();
|
||||
|
||||
LLAMA_LOG_INFO("%s: model size = %8.2f MiB\n", __func__, total_size_org/1024.0/1024.0);
|
||||
LLAMA_LOG_INFO("%s: quant size = %8.2f MiB\n", __func__, total_size_new/1024.0/1024.0);
|
||||
LLAMA_LOG_INFO("%s: model size = %8.2f MiB (%.2f BPW)\n", __func__, total_size_org/1024.0/1024.0, total_size_org*8.0/ml.n_elements);
|
||||
LLAMA_LOG_INFO("%s: quant size = %8.2f MiB (%.2f BPW)\n", __func__, total_size_new/1024.0/1024.0, total_size_new*8.0/ml.n_elements);
|
||||
|
||||
if (!params->imatrix && params->dry_run && will_require_imatrix) {
|
||||
LLAMA_LOG_WARN("%s: WARNING: dry run completed successfully, but actually completing this quantization will require an imatrix!\n",
|
||||
__func__
|
||||
);
|
||||
}
|
||||
|
||||
if (qs.n_fallback > 0) {
|
||||
LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n",
|
||||
@@ -1045,6 +1094,7 @@ llama_model_quantize_params llama_model_quantize_default_params() {
|
||||
/*.only_copy =*/ false,
|
||||
/*.pure =*/ false,
|
||||
/*.keep_split =*/ false,
|
||||
/*.dry_run =*/ false,
|
||||
/*.imatrix =*/ nullptr,
|
||||
/*.kv_overrides =*/ nullptr,
|
||||
/*.tensor_type =*/ nullptr,
|
||||
|
||||
@@ -289,6 +289,15 @@ struct llm_tokenizer_bpe : llm_tokenizer {
|
||||
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
||||
};
|
||||
break;
|
||||
case LLAMA_VOCAB_PRE_TYPE_JAIS2:
|
||||
regex_exprs = {
|
||||
// original regex from tokenizer.json
|
||||
//"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s{512}(?!\\S)|\\s{256}(?!\\S)|\\s{128}(?!\\S)|\\s{64}(?!\\S)|\\s{32}(?!\\S)|\\s{16}(?!\\S)|\\s{8}(?!\\S)|\\s{4}(?!\\S)|\\s{1,2}(?!\\S)|\\s{1}",
|
||||
|
||||
// adapted: same as llama3 but with cascading whitespace pattern
|
||||
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s{512}(?!\\S)|\\s{256}(?!\\S)|\\s{128}(?!\\S)|\\s{64}(?!\\S)|\\s{32}(?!\\S)|\\s{16}(?!\\S)|\\s{8}(?!\\S)|\\s{4}(?!\\S)|\\s{1,2}(?!\\S)|\\s{1}",
|
||||
};
|
||||
break;
|
||||
case LLAMA_VOCAB_PRE_TYPE_DBRX:
|
||||
case LLAMA_VOCAB_PRE_TYPE_SMAUG:
|
||||
regex_exprs = {
|
||||
@@ -1921,8 +1930,11 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
tokenizer_pre == "jina-v2-de" ||
|
||||
tokenizer_pre == "a.x-4.0" ||
|
||||
tokenizer_pre == "mellum" ||
|
||||
tokenizer_pre == "modern-bert" ) {
|
||||
tokenizer_pre == "modern-bert") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
|
||||
} else if (
|
||||
tokenizer_pre == "jais-2") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_JAIS2;
|
||||
} else if (
|
||||
tokenizer_pre == "jina-v1-en" ||
|
||||
tokenizer_pre == "jina-v2-code" ||
|
||||
@@ -2458,6 +2470,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
|| t.first == "<|calls|>" // solar-open
|
||||
|| t.first == "<end_of_turn>"
|
||||
|| t.first == "<|endoftext|>"
|
||||
|| t.first == "</s>" // paddleocr
|
||||
|| t.first == "<|eom_id|>"
|
||||
|| t.first == "<EOT>"
|
||||
|| t.first == "_<EOT>"
|
||||
|
||||
@@ -57,6 +57,7 @@ enum llama_vocab_pre_type {
|
||||
LLAMA_VOCAB_PRE_TYPE_QWEN35 = 46,
|
||||
LLAMA_VOCAB_PRE_TYPE_TINY_AYA = 47,
|
||||
LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM = 48,
|
||||
LLAMA_VOCAB_PRE_TYPE_JAIS2 = 49,
|
||||
};
|
||||
|
||||
struct LLM_KV;
|
||||
|
||||
@@ -27,6 +27,7 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
|
||||
|
||||
const int64_t S_v = v->ne[0];
|
||||
const int64_t H_v = v->ne[1];
|
||||
const bool kda = (g->ne[0] == S_k && g->ne[1] == H_k);
|
||||
|
||||
GGML_ASSERT(S_k == S_v);
|
||||
GGML_ASSERT(H_v % H_k == 0);
|
||||
@@ -35,9 +36,10 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
|
||||
GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs);
|
||||
GGML_ASSERT(v->ne[0] == S_v && v->ne[1] == H_v && v->ne[2] == n_tokens && v->ne[3] == n_seqs);
|
||||
|
||||
GGML_ASSERT(g->ne[0] == H_v && g->ne[1] == n_tokens && g->ne[2] == n_seqs);
|
||||
GGML_ASSERT(b->ne[0] == H_v && b->ne[2] == n_tokens && b->ne[3] == n_seqs);
|
||||
GGML_ASSERT(s->ne[0] == S_v && s->ne[1] == S_v && s->ne[2] == H_v && s->ne[3] == n_seqs);
|
||||
GGML_ASSERT(g->ne[0] == 1 || g->ne[0] == S_v);
|
||||
GGML_ASSERT( g->ne[1] == H_v && g->ne[2] == n_tokens && g->ne[3] == n_seqs);
|
||||
GGML_ASSERT(b->ne[0] == 1 && b->ne[1] == H_v && b->ne[2] == n_tokens && b->ne[3] == n_seqs);
|
||||
GGML_ASSERT(s->ne[0] == S_v && s->ne[1] == S_v && s->ne[2] == H_v && s->ne[3] == n_seqs);
|
||||
|
||||
const float scale = 1.0f / sqrtf(S_k);
|
||||
|
||||
@@ -52,8 +54,8 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
|
||||
q = ggml_permute(ctx0, q, 0, 2, 1, 3); // [S_k, n_tokens, H_k, n_seqs]
|
||||
k = ggml_permute(ctx0, k, 0, 2, 1, 3); // [S_k, n_tokens, H_k, n_seqs]
|
||||
v = ggml_permute(ctx0, v, 0, 2, 1, 3); // [S_v, n_tokens, H_v, n_seqs]
|
||||
g = ggml_permute(ctx0, g, 2, 1, 3, 0); // [ 1, n_tokens, H_v, n_seqs]
|
||||
b = ggml_permute(ctx0, b, 2, 0, 1, 3); // [ 1, n_tokens, H_v, n_seqs]
|
||||
g = ggml_permute(ctx0, g, 0, 2, 1, 3); // [g_0, n_tokens, H_v, n_seqs]
|
||||
b = ggml_permute(ctx0, b, 0, 2, 1, 3); // [ 1, n_tokens, H_v, n_seqs]
|
||||
|
||||
const int CS = CHUNK_SIZE;
|
||||
|
||||
@@ -78,33 +80,76 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
|
||||
v = ggml_reshape_4d(ctx0, v, S_v, CS, n_chunks, H_v * n_seqs);
|
||||
v_b = ggml_reshape_4d(ctx0, v_b, S_v, CS, n_chunks, H_v * n_seqs);
|
||||
|
||||
g = ggml_reshape_4d(ctx0, g, CS, 1, n_chunks, H_v * n_seqs);
|
||||
b = ggml_reshape_4d(ctx0, b, 1, CS, n_chunks, H_v * n_seqs);
|
||||
g = ggml_reshape_4d(ctx0, g, g->ne[0], CS, n_chunks, H_v * n_seqs);
|
||||
b = ggml_reshape_4d(ctx0, b, 1, CS, n_chunks, H_v * n_seqs);
|
||||
|
||||
// [CS, 1, n_chunks, H_v * n_seqs]
|
||||
ggml_tensor * g_cs = ggml_cumsum(ctx0, g);
|
||||
// [CS, g_0, n_chunks, H_v * n_seqs]
|
||||
// TODO: extend ggml_cumsum with axis parameter to avoid transpose
|
||||
ggml_tensor * g_cs = ggml_cumsum(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, g)));
|
||||
cb(g_cs, "g_cs", il);
|
||||
|
||||
ggml_tensor * g_cs_i = g_cs;
|
||||
ggml_tensor * g_cs_j = ggml_reshape_4d(ctx0, g_cs, 1, CS, n_chunks, H_v * n_seqs);
|
||||
ggml_tensor * kb = nullptr;
|
||||
ggml_tensor * kq = nullptr;
|
||||
if (kda) {
|
||||
const int64_t CHB = n_chunks * H_k * n_seqs;
|
||||
|
||||
g_cs_j = ggml_repeat_4d(ctx0, g_cs_j, CS, CS, n_chunks, H_v * n_seqs);
|
||||
ggml_tensor * g_cs_i = ggml_reshape_4d(ctx0, g_cs, CS, 1, S_k, CHB); // [chunk_size, 1, S_k, CHB]
|
||||
ggml_tensor * g_cs_j = ggml_reshape_4d(ctx0, g_cs, 1, CS, S_k, CHB); // [1, chunk_size, S_k, CHB]
|
||||
|
||||
// [CS, CS, n_chunks, H_v * n_seqs]
|
||||
ggml_tensor * decay_mask;
|
||||
decay_mask = ggml_sub(ctx0, g_cs_j, g_cs_i);
|
||||
decay_mask = ggml_tri(ctx0, decay_mask, GGML_TRI_TYPE_LOWER_DIAG);
|
||||
decay_mask = ggml_exp(ctx0, decay_mask);
|
||||
cb(decay_mask, "decay_mask", il);
|
||||
g_cs_j = ggml_repeat_4d(ctx0, g_cs_j, CS, CS, S_k, CHB); // [1, chunk_size, S_k, CHB] -> [chunk_size, chunk_size, S_k, CHB]
|
||||
|
||||
// [CS, CS, n_chunks, H_k * n_seqs]
|
||||
ggml_tensor * kb;
|
||||
kb = ggml_mul_mat(ctx0, k, k_b);
|
||||
kb = ggml_mul (ctx0, kb, decay_mask);
|
||||
// decay_mask [chunk_size,chunk_size,S_k,CHB]
|
||||
ggml_tensor * decay_mask;
|
||||
decay_mask = ggml_sub(ctx0, g_cs_j, g_cs_i);
|
||||
decay_mask = ggml_tri(ctx0, decay_mask, GGML_TRI_TYPE_LOWER_DIAG);
|
||||
decay_mask = ggml_exp(ctx0, decay_mask);
|
||||
cb(decay_mask, "decay_mask", il);
|
||||
|
||||
// decay_mask [S_k,BT_j,BT_i,CHB] *Note* second and third chunk_sizes are switched
|
||||
decay_mask = ggml_cont_4d(ctx0, ggml_permute(ctx0, decay_mask, 2, 1, 0, 3), S_k, CS, CS, CHB);
|
||||
|
||||
ggml_tensor * k_b_i = ggml_reshape_4d(ctx0, k_b, S_k, CS, 1, CHB);
|
||||
ggml_tensor * k_j = ggml_reshape_4d(ctx0, k, S_k, 1, CS, CHB);
|
||||
ggml_tensor * q_i = ggml_reshape_4d(ctx0, q, S_k, CS, 1, CHB);
|
||||
|
||||
ggml_tensor * decay_k_b_i = ggml_mul(ctx0, decay_mask, k_b_i);
|
||||
ggml_tensor * decay_q_i = ggml_mul(ctx0, decay_mask, q_i);
|
||||
|
||||
// decay_k_b_i [S,BT,BT,CHB] @ k_j [S,1,BT,CHB] = Akk [BT,1,BT,CHB]
|
||||
kb = ggml_mul_mat(ctx0, decay_k_b_i, k_j);
|
||||
kq = ggml_mul_mat(ctx0, decay_q_i, k_j);
|
||||
|
||||
kb = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_4d(ctx0, kb, CS, CS, n_chunks, H_v * n_seqs)));
|
||||
kq = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_4d(ctx0, kq, CS, CS, n_chunks, H_v * n_seqs)));
|
||||
} else {
|
||||
ggml_tensor * g_cs_i = g_cs;
|
||||
ggml_tensor * g_cs_j = ggml_reshape_4d(ctx0, g_cs, 1, CS, n_chunks, H_v * n_seqs);
|
||||
|
||||
g_cs_j = ggml_repeat_4d(ctx0, g_cs_j, CS, CS, n_chunks, H_v * n_seqs);
|
||||
|
||||
// [CS, CS, n_chunks, H_v * n_seqs]
|
||||
ggml_tensor * decay_mask;
|
||||
decay_mask = ggml_sub(ctx0, g_cs_j, g_cs_i);
|
||||
decay_mask = ggml_tri(ctx0, decay_mask, GGML_TRI_TYPE_LOWER_DIAG);
|
||||
decay_mask = ggml_exp(ctx0, decay_mask);
|
||||
cb(decay_mask, "decay_mask", il);
|
||||
|
||||
// [CS, CS, n_chunks, H_k * n_seqs]
|
||||
kb = ggml_mul_mat(ctx0, k, k_b);
|
||||
kb = ggml_mul (ctx0, kb, decay_mask);
|
||||
|
||||
// [CS, CS, n_chunks, H_k * n_seqs]
|
||||
kq = ggml_mul_mat(ctx0, k, q);
|
||||
kq = ggml_mul(ctx0, kq, decay_mask);
|
||||
}
|
||||
|
||||
kq = ggml_tri(ctx0, kq, GGML_TRI_TYPE_LOWER_DIAG);
|
||||
cb(kq, "kq", il);
|
||||
|
||||
// [CS, CS, n_chunks, H_k * n_seqs]
|
||||
ggml_tensor * attn;
|
||||
attn = ggml_tri(ctx0, kb, GGML_TRI_TYPE_LOWER);
|
||||
cb(attn, "attn", il);
|
||||
|
||||
ggml_tensor * identity;
|
||||
identity = ggml_view_1d(ctx0, attn, CS, 0);
|
||||
@@ -115,6 +160,7 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
|
||||
cb(lhs, "dnet_add_ch_lhs", il);
|
||||
|
||||
attn = ggml_neg(ctx0, attn);
|
||||
cb(attn, "attn_pre_solve", il);
|
||||
|
||||
ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, attn, true, true, false);
|
||||
attn = ggml_add(ctx0, lin_solve, identity);
|
||||
@@ -123,7 +169,7 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
|
||||
// [S_v, CS, n_chunks, H_v * n_seqs]
|
||||
v = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_b)), attn);
|
||||
|
||||
// [CS, 1, n_chunks, H_v * n_seqs]
|
||||
// [CS, 1, n_chunks, H_v * n_seqs] KDA: [CS, S_k, n_chunks, H_v * n_seqs]
|
||||
ggml_tensor * g_exp = ggml_exp(ctx0, g_cs);
|
||||
|
||||
k_b = ggml_cont(ctx0, ggml_transpose(ctx0, k_b));
|
||||
@@ -136,16 +182,10 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
|
||||
ggml_tensor * k_cd = ggml_mul_mat(ctx0, kbg, attn);
|
||||
cb(k_cd, "k_cumdecay", il);
|
||||
|
||||
// [S_k, CS, n_chunks, H_k * n_seqs]
|
||||
ggml_tensor * g_exp_t = ggml_transpose(ctx0, g_exp);
|
||||
// [1, CS, n_chunks, H_k * n_seqs] KDA: [S_k, CS, n_chunks, H_k * n_seqs]
|
||||
ggml_tensor * g_exp_t = ggml_cont(ctx0, ggml_transpose(ctx0, g_exp));
|
||||
ggml_tensor * q_g_exp = ggml_mul(ctx0, q, g_exp_t);
|
||||
|
||||
// [CS, CS, n_chunks, H_k * n_seqs]
|
||||
ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
|
||||
kq = ggml_mul(ctx0, kq, decay_mask);
|
||||
kq = ggml_tri(ctx0, kq, GGML_TRI_TYPE_LOWER_DIAG);
|
||||
cb(kq, "kq", il);
|
||||
|
||||
// vectorized calculation of key_gdiff
|
||||
// improved from the chunked version:
|
||||
// g_last = torch.clamp(g_cum[:, :, -1], max=50.0).exp().unsqueeze(-1).unsqueeze(-1)
|
||||
@@ -156,8 +196,8 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
|
||||
|
||||
// get last element in g_cumsum along CS dimension (ne0)
|
||||
// example: [[x, y, z, ..., last], ...] -> [[last], ...]
|
||||
// [1, 1, n_chunks, H_v * n_seqs]
|
||||
ggml_tensor * g_last = ggml_view_4d(ctx0, g_cs, 1, 1, g_cs->ne[2], g_cs->ne[3],
|
||||
// [1, 1, n_chunks, H_v * n_seqs] KDA: [1, S_k, n_chunks, H_v * n_seqs]
|
||||
ggml_tensor * g_last = ggml_view_4d(ctx0, g_cs, 1, g_cs->ne[1], g_cs->ne[2], g_cs->ne[3],
|
||||
g_cs->nb[1],
|
||||
g_cs->nb[2],
|
||||
g_cs->nb[3],
|
||||
@@ -167,16 +207,15 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
|
||||
// TODO: remove this cont when CUDA supports non-cont unary ops
|
||||
g_last = ggml_cont(ctx0, g_last);
|
||||
|
||||
// [1, 1, n_chunks, H_v * n_seqs]
|
||||
ggml_tensor * g_last_exp = ggml_exp(ctx0, g_last);
|
||||
cb(g_last_exp, "g_last_exp", il);
|
||||
// [1, 1, n_chunks, H_v * n_seqs] KDA: [S_k, 1, n_chunks, H_v * n_seqs]
|
||||
ggml_tensor * g_last_exp_t = ggml_transpose(ctx0, ggml_exp(ctx0, g_last));
|
||||
cb(g_last_exp_t, "g_last_exp_t", il);
|
||||
|
||||
// [CS, 1, n_chunks, H_v * n_seqs]
|
||||
// [CS, 1, n_chunks, H_v * n_seqs] KDA: [CS, S_k, n_chunks, H_v * n_seqs]
|
||||
ggml_tensor * g_diff = ggml_neg(ctx0, ggml_sub(ctx0, g_cs, g_last));
|
||||
cb(g_diff, "g_diff", il);
|
||||
|
||||
ggml_tensor * g_diff_exp = ggml_exp(ctx0, g_diff);
|
||||
ggml_tensor * g_diff_exp_t = ggml_transpose(ctx0, g_diff_exp);
|
||||
ggml_tensor * g_diff_exp_t = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_exp(ctx0, g_diff)));
|
||||
|
||||
// [S_k, CS, n_chunks, H_v * n_seqs]
|
||||
ggml_tensor * kg = ggml_mul(ctx0, k, g_diff_exp_t);
|
||||
@@ -227,8 +266,9 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
|
||||
ggml_tensor * kgv = ggml_mul_mat(ctx0, ch_kg_t, v_t_new); // [S_k, S_v, 1, H_k * n_seqs]
|
||||
|
||||
// last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew
|
||||
ggml_tensor * ch_g_last_exp = get_slice_2d(ctx0, g_last_exp, chunk);
|
||||
s_t = ggml_mul(ctx0, s_t, ch_g_last_exp);
|
||||
ggml_tensor * ch_g_last_exp_t = get_slice_2d(ctx0, g_last_exp_t, chunk);
|
||||
|
||||
s_t = ggml_mul(ctx0, s_t, ch_g_last_exp_t);
|
||||
s_t = ggml_add(ctx0, s_t, kgv);
|
||||
cb(s_t, "dnet_add_ch_state", il);
|
||||
}
|
||||
@@ -241,9 +281,9 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
|
||||
ggml_row_size(v->type, S_v),
|
||||
ggml_row_size(v->type, S_v * CS * n_chunks),
|
||||
ggml_row_size(v->type, S_v * CS * n_chunks * H_v), 0);
|
||||
|
||||
o = ggml_permute (ctx0, o, 0, 2, 1, 3); // [S_v, H_v, n_tokens, n_seqs]
|
||||
s = ggml_transpose(ctx0, s_t); // [S_v, S_v, H_v, n_seqs]
|
||||
s = ggml_transpose(ctx0, s_t);
|
||||
cb(s, "output_state", il);
|
||||
|
||||
return {o, s};
|
||||
}
|
||||
@@ -273,9 +313,10 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
|
||||
GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs);
|
||||
GGML_ASSERT(v->ne[0] == S_v && v->ne[1] == H_v && v->ne[2] == n_tokens && v->ne[3] == n_seqs);
|
||||
|
||||
GGML_ASSERT(g->ne[0] == H_v && g->ne[1] == n_tokens && g->ne[2] == n_seqs);
|
||||
GGML_ASSERT(b->ne[0] == H_v && b->ne[2] == n_tokens && b->ne[3] == n_seqs);
|
||||
GGML_ASSERT(s->ne[0] == S_v && s->ne[1] == S_v && s->ne[2] == H_v && s->ne[3] == n_seqs);
|
||||
GGML_ASSERT(g->ne[0] == 1 || g->ne[0] == S_v);
|
||||
GGML_ASSERT( g->ne[1] == H_v && g->ne[2] == n_tokens && g->ne[3] == n_seqs);
|
||||
GGML_ASSERT(b->ne[0] == 1 && b->ne[1] == H_v && b->ne[2] == n_tokens && b->ne[3] == n_seqs);
|
||||
GGML_ASSERT(s->ne[0] == S_v && s->ne[1] == S_v && s->ne[2] == H_v && s->ne[3] == n_seqs);
|
||||
|
||||
const float scale = 1.0f / sqrtf(S_k);
|
||||
|
||||
@@ -291,8 +332,10 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
|
||||
cb(b, "b_in", il);
|
||||
cb(g, "g_in", il);
|
||||
|
||||
g = ggml_reshape_4d(ctx0, g, 1, 1, H_v, n_seqs);
|
||||
b = ggml_reshape_4d(ctx0, b, 1, 1, H_v, n_seqs);
|
||||
// GDA: [1, 1, H_v, n_seqs]
|
||||
// KDA: [1, S_k, H_v, n_seqs]
|
||||
g = ggml_reshape_4d(ctx0, g, 1, g->ne[0], H_v, n_seqs);
|
||||
b = ggml_reshape_4d(ctx0, b, 1, 1, H_v, n_seqs);
|
||||
|
||||
// [S_v, S_v, H_v, n_seqs]
|
||||
g = ggml_exp(ctx0, g);
|
||||
|
||||
123
src/models/jais2.cpp
Normal file
123
src/models/jais2.cpp
Normal file
@@ -0,0 +1,123 @@
|
||||
#include "models.h"
|
||||
|
||||
// JAIS-2 model graph builder
|
||||
// Uses: LayerNorm (not RMSNorm), relu2 activation, separate Q/K/V, RoPE embeddings
|
||||
llm_build_jais2::llm_build_jais2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
||||
inpL = build_inp_embd(model.tok_embd);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
// KV input for attention
|
||||
auto * inp_attn = build_attn_inp_kv();
|
||||
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
// Pre-attention LayerNorm
|
||||
cur = build_norm(inpL,
|
||||
model.layers[il].attn_norm,
|
||||
model.layers[il].attn_norm_b,
|
||||
LLM_NORM, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
|
||||
// Self-attention with separate Q, K, V projections
|
||||
{
|
||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||
cb(Qcur, "Qcur", il);
|
||||
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
||||
cb(Qcur, "Qcur_bias", il);
|
||||
|
||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
||||
cb(Kcur, "Kcur", il);
|
||||
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
||||
cb(Kcur, "Kcur_bias", il);
|
||||
|
||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
||||
cb(Vcur, "Vcur", il);
|
||||
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
||||
cb(Vcur, "Vcur_bias", il);
|
||||
|
||||
// Reshape for attention
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||
|
||||
// Apply RoPE
|
||||
Qcur = ggml_rope_ext(
|
||||
ctx0, Qcur, inp_pos, nullptr,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
|
||||
Kcur = ggml_rope_ext(
|
||||
ctx0, Kcur, inp_pos, nullptr,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
|
||||
cb(Qcur, "Qcur_rope", il);
|
||||
cb(Kcur, "Kcur_rope", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||
}
|
||||
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
||||
}
|
||||
|
||||
// Residual connection
|
||||
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
||||
cb(ffn_inp, "ffn_inp", il);
|
||||
|
||||
// Pre-FFN LayerNorm
|
||||
cur = build_norm(ffn_inp,
|
||||
model.layers[il].ffn_norm,
|
||||
model.layers[il].ffn_norm_b,
|
||||
LLM_NORM, il);
|
||||
cb(cur, "ffn_norm", il);
|
||||
|
||||
// FFN with relu2 activation (ReLU squared) - no gate projection
|
||||
// up -> relu2 -> down
|
||||
cur = build_ffn(cur,
|
||||
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
|
||||
NULL, NULL, NULL, // no gate
|
||||
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
||||
NULL,
|
||||
LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
|
||||
// Residual connection
|
||||
inpL = ggml_add(ctx0, cur, ffn_inp);
|
||||
inpL = build_cvec(inpL, il);
|
||||
cb(inpL, "l_out", il);
|
||||
}
|
||||
|
||||
// Final LayerNorm
|
||||
cur = build_norm(inpL,
|
||||
model.output_norm,
|
||||
model.output_norm_b,
|
||||
LLM_NORM, -1);
|
||||
cb(cur, "result_norm", -1);
|
||||
|
||||
res->t_embd = cur;
|
||||
|
||||
// Output projection
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cb(cur, "result_output", -1);
|
||||
|
||||
res->t_logits = cur;
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
}
|
||||
@@ -3,8 +3,6 @@
|
||||
|
||||
#include "llama-memory-recurrent.h"
|
||||
|
||||
#define CHUNK_SIZE 64
|
||||
|
||||
// Causal Conv1d function for Q,K,V
|
||||
// When qkv is 0, it is Q, 1 is K, 2 is V
|
||||
static ggml_tensor * causal_conv1d(ggml_cgraph * gf, ggml_context * ctx0, ggml_tensor * conv_states_all, ggml_tensor * conv_state_all, int64_t qkv, ggml_tensor * x, ggml_tensor * proj_w, ggml_tensor * conv_w, int64_t d_conv, int64_t head_dim, int64_t n_head, int64_t n_seq_tokens, int64_t n_seqs, int64_t n_tokens, int64_t kv_head) {
|
||||
@@ -67,7 +65,7 @@ static ggml_tensor * causal_conv1d(ggml_cgraph * gf, ggml_context * ctx0, ggml_t
|
||||
}
|
||||
|
||||
llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_build_mamba_base(params), model(model) {
|
||||
llm_build_delta_net_base(params), model(model) {
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
||||
@@ -86,17 +84,6 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
|
||||
// Output ids for selecting which tokens to output
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
|
||||
ggml_tensor * chunked_causal_mask =
|
||||
ggml_tri(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, CHUNK_SIZE, CHUNK_SIZE), 1.0f),
|
||||
GGML_TRI_TYPE_LOWER);
|
||||
|
||||
ggml_tensor * chunked_identity = ggml_diag(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, CHUNK_SIZE), 1.0f));
|
||||
ggml_tensor * chunked_diag_mask = ggml_add(ctx0, chunked_causal_mask, chunked_identity);
|
||||
|
||||
ggml_build_forward_expand(gf, chunked_causal_mask);
|
||||
ggml_build_forward_expand(gf, chunked_identity);
|
||||
ggml_build_forward_expand(gf, chunked_diag_mask);
|
||||
|
||||
// Kimi dimension constants
|
||||
const int64_t n_head = hparams.n_head();
|
||||
const int64_t head_dim = hparams.n_embd_head_kda;
|
||||
@@ -162,27 +149,35 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
|
||||
g1 = ggml_mul(ctx0, g1, A);
|
||||
cb(g1, "kda_g1", il);
|
||||
|
||||
g1 = ggml_reshape_4d(ctx0, g1, head_dim, n_head, n_seq_tokens, n_seqs);
|
||||
|
||||
// Compute beta (mixing coefficient)
|
||||
ggml_tensor * beta = ggml_mul_mat(ctx0, layer.ssm_beta, cur);
|
||||
beta = ggml_reshape_4d(ctx0, beta, n_head, 1, n_seq_tokens, n_seqs);
|
||||
beta = ggml_reshape_4d(ctx0, beta, 1, n_head, n_seq_tokens, n_seqs);
|
||||
cb(beta, "kda_beta", il);
|
||||
|
||||
beta = ggml_sigmoid(ctx0, beta);
|
||||
|
||||
// Reshape for KDA recurrence
|
||||
// {n_embd, n_tokens} -> {n_embd, n_seq_tokens, n_seqs}
|
||||
cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
|
||||
|
||||
g1 = ggml_reshape_4d(ctx0, g1, head_dim, n_head, n_seq_tokens, n_seqs);
|
||||
|
||||
// Get SSM state and compute KDA recurrence using ggml_kda_scan
|
||||
ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
|
||||
ggml_tensor * state = build_rs(inp_rs, ssm_states_all, hparams.n_embd_s(), n_seqs);
|
||||
state = ggml_reshape_4d(ctx0, state, head_dim, head_dim, n_head, n_seqs);
|
||||
// Choose between build_kda_chunking and build_kda_recurrent based on n_tokens
|
||||
std::pair<ggml_tensor *, ggml_tensor *> attn_out = n_seq_tokens == 1 ?
|
||||
build_kda_autoregressive(Qcur, Kcur, Vcur, g1, beta, state, il) :
|
||||
build_kda_chunking(Qcur, Kcur, Vcur, g1, beta, state, chunked_causal_mask, chunked_identity, chunked_diag_mask, il);
|
||||
|
||||
ggml_tensor * output = attn_out.first;
|
||||
const float eps_norm = hparams.f_norm_rms_eps;
|
||||
|
||||
Qcur = ggml_l2_norm(ctx0, Qcur, eps_norm);
|
||||
Kcur = ggml_l2_norm(ctx0, Kcur, eps_norm);
|
||||
|
||||
// Choose between build_delta_net_chunking and build_delta_net_recurrent based on n_tokens
|
||||
std::pair<ggml_tensor *, ggml_tensor *> attn_out = n_seq_tokens == 1 ?
|
||||
build_delta_net_autoregressive(Qcur, Kcur, Vcur, g1, beta, state, il) :
|
||||
build_delta_net_chunking(Qcur, Kcur, Vcur, g1, beta, state, il);
|
||||
|
||||
ggml_tensor * output = ggml_cont(ctx0, attn_out.first);
|
||||
ggml_tensor * new_state = attn_out.second;
|
||||
cb(output, "attn_output", il);
|
||||
cb(new_state, "new_state", il);
|
||||
@@ -393,385 +388,3 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
}
|
||||
|
||||
/*
|
||||
This is a ggml implementation of the naive_chunk_kda function of
|
||||
https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/kda/naive.py
|
||||
*/
|
||||
std::pair<ggml_tensor *, ggml_tensor *> llm_build_kimi_linear::build_kda_chunking(
|
||||
ggml_tensor * q,
|
||||
ggml_tensor * k,
|
||||
ggml_tensor * v,
|
||||
ggml_tensor * gk,
|
||||
ggml_tensor * beta,
|
||||
ggml_tensor * state,
|
||||
ggml_tensor * causal_mask,
|
||||
ggml_tensor * identity,
|
||||
ggml_tensor * diag_mask,
|
||||
int il) {
|
||||
GGML_ASSERT(ggml_is_contiguous(state));
|
||||
|
||||
const int64_t S_k = q->ne[0];
|
||||
const int64_t H_k = q->ne[1];
|
||||
const int64_t n_tokens = q->ne[2];
|
||||
const int64_t n_seqs = q->ne[3];
|
||||
|
||||
const int64_t S_v = v->ne[0];
|
||||
const int64_t H_v = v->ne[1];
|
||||
|
||||
GGML_ASSERT(v->ne[2] == n_tokens);
|
||||
GGML_ASSERT(k->ne[2] == n_tokens);
|
||||
GGML_ASSERT(gk->ne[0] == S_v && gk->ne[1] == H_v && gk->ne[2] == n_tokens && gk->ne[3] == n_seqs);
|
||||
GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs);
|
||||
GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v && state->ne[2] == H_v && state->ne[3] == n_seqs);
|
||||
|
||||
GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs);
|
||||
GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs);
|
||||
|
||||
GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case
|
||||
|
||||
// TODO: can this ever be false?
|
||||
const bool use_qk_l2norm = true;
|
||||
|
||||
if (use_qk_l2norm) {
|
||||
const float eps_norm = hparams.f_norm_rms_eps;
|
||||
|
||||
q = ggml_l2_norm(ctx0, q, eps_norm);
|
||||
k = ggml_l2_norm(ctx0, k, eps_norm);
|
||||
}
|
||||
|
||||
const float scale = 1.0f / sqrtf(S_v);
|
||||
|
||||
beta = ggml_sigmoid(ctx0, beta);
|
||||
|
||||
cb(q, "q_in", il);
|
||||
cb(k, "k_in", il);
|
||||
cb(v, "v_in", il);
|
||||
cb(beta, "beta_in", il);
|
||||
cb(gk, "gk_in", il);
|
||||
|
||||
q = ggml_cont_4d(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3), S_k, n_tokens, H_k, n_seqs);
|
||||
k = ggml_cont_4d(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3), S_k, n_tokens, H_k, n_seqs);
|
||||
v = ggml_cont_4d(ctx0, ggml_permute(ctx0, v, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
|
||||
gk = ggml_cont_4d(ctx0, ggml_permute(ctx0, gk, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
|
||||
|
||||
beta = ggml_cont(ctx0, ggml_permute(ctx0, beta, 2, 0, 1, 3));
|
||||
state = ggml_reshape_4d(ctx0, state, S_v, S_v, H_v, n_seqs);
|
||||
|
||||
cb(q, "q_perm", il);
|
||||
cb(k, "k_perm", il);
|
||||
cb(v, "v_perm", il);
|
||||
cb(beta, "beta_perm", il);
|
||||
cb(gk, "gk_perm", il);
|
||||
cb(state, "state_in", il);
|
||||
|
||||
GGML_ASSERT(q->ne[1] == n_tokens && q->ne[0] == S_k && q->ne[2] == H_k && q->ne[3] == n_seqs);
|
||||
GGML_ASSERT(k->ne[1] == n_tokens && k->ne[0] == S_k && k->ne[2] == H_k && k->ne[3] == n_seqs);
|
||||
GGML_ASSERT(v->ne[1] == n_tokens && v->ne[0] == S_v && v->ne[2] == H_k && v->ne[3] == n_seqs);
|
||||
GGML_ASSERT(beta->ne[1] == n_tokens && beta->ne[2] == H_k && beta->ne[0] == 1 && beta->ne[3] == n_seqs);
|
||||
|
||||
// Do padding
|
||||
const int64_t chunk_size = CHUNK_SIZE;
|
||||
|
||||
const int64_t pad = (chunk_size - n_tokens % chunk_size) % chunk_size;
|
||||
const int64_t n_chunks = (n_tokens + pad) / chunk_size;
|
||||
|
||||
q = ggml_pad(ctx0, q, 0, pad, 0, 0);
|
||||
k = ggml_pad(ctx0, k, 0, pad, 0, 0);
|
||||
v = ggml_pad(ctx0, v, 0, pad, 0, 0);
|
||||
gk = ggml_pad(ctx0, gk, 0, pad, 0, 0);
|
||||
beta = ggml_pad(ctx0, beta, 0, pad, 0, 0);
|
||||
|
||||
cb(q, "q_pad", il);
|
||||
cb(k, "k_pad", il);
|
||||
cb(v, "v_pad", il);
|
||||
cb(beta, "beta_pad", il);
|
||||
cb(gk, "gk_pad", il);
|
||||
|
||||
ggml_tensor * v_beta = ggml_mul(ctx0, v, beta);
|
||||
ggml_tensor * k_beta = ggml_mul(ctx0, k, beta);
|
||||
|
||||
cb(v_beta, "v_beta", il);
|
||||
cb(k_beta, "k_beta", il);
|
||||
|
||||
const int64_t HB = H_k * n_seqs;
|
||||
|
||||
q = ggml_cont_4d(ctx0, q, S_k, chunk_size, n_chunks, HB);
|
||||
k = ggml_cont_4d(ctx0, k, S_k, chunk_size, n_chunks, HB);
|
||||
k_beta = ggml_cont_4d(ctx0, k_beta, S_k, chunk_size, n_chunks, HB);
|
||||
v = ggml_cont_4d(ctx0, v, S_v, chunk_size, n_chunks, HB);
|
||||
v_beta = ggml_cont_4d(ctx0, v_beta, S_v, chunk_size, n_chunks, HB);
|
||||
|
||||
gk = ggml_cont_4d(ctx0, gk, S_k, chunk_size, n_chunks, HB);
|
||||
beta = ggml_cont_4d(ctx0, beta, 1, chunk_size, n_chunks, HB);
|
||||
|
||||
// switch for cumsum
|
||||
gk = ggml_cont_4d(ctx0, ggml_permute(ctx0, gk, 1, 0, 2, 3), chunk_size, S_k, n_chunks, HB);
|
||||
cb(gk, "gk", il);
|
||||
ggml_tensor * gk_cumsum = ggml_cumsum(ctx0, gk);
|
||||
cb(gk_cumsum, "gk_cumsum", il);
|
||||
|
||||
/*
|
||||
Compute Akk and Aqk loop together
|
||||
Akk loop:
|
||||
for i in range(BT):
|
||||
k_i = k[..., i, :] # k_i [B,H,NT,S]
|
||||
g_i = g[..., i:i+1, :] # g_i [B,H,NT,1,S]
|
||||
A[..., i] = torch.einsum('... c d, ... d -> ... c', k * (g - g_i).exp(), k_i)
|
||||
Aqk loop:
|
||||
for j in range(BT):
|
||||
k_j = k[:, :, i, j]
|
||||
g_j = g[:, :, i, j:j+1, :]
|
||||
A[..., j] = torch.einsum('... c d, ... d -> ... c', q_i * (g_i - g_j).exp(), k_j)
|
||||
*/
|
||||
const int64_t CHB = n_chunks * H_k * n_seqs;
|
||||
ggml_tensor * gkcs_i = ggml_reshape_4d(ctx0, gk_cumsum, chunk_size, 1, S_k, CHB); // [chunk_size, 1, S_k, CHB]
|
||||
ggml_tensor * gkcs_j = ggml_reshape_4d(ctx0, gkcs_i, 1, chunk_size, S_k, CHB); // [1, chunk_size, S_k, CHB]
|
||||
|
||||
ggml_tensor * gkcs_j_bc = ggml_repeat_4d(ctx0, gkcs_j, chunk_size, chunk_size, S_k, CHB); // [1, chunk_size, S_k, CHB] -> [chunk_size, chunk_size, S_k, CHB]
|
||||
// decay_mask [chunk_size,chunk_size,S_k,CHB]
|
||||
ggml_tensor * decay_mask = ggml_sub(ctx0, gkcs_j_bc, gkcs_i);
|
||||
cb(decay_mask, "decay_mask", il);
|
||||
|
||||
decay_mask = ggml_mul(ctx0, decay_mask, diag_mask);
|
||||
cb(decay_mask, "decay_masked", il);
|
||||
decay_mask = ggml_exp(ctx0, decay_mask);
|
||||
decay_mask = ggml_mul(ctx0, decay_mask, diag_mask);
|
||||
|
||||
// decay_mask [S_k,BT_j,BT_i,CHB] *Note* second and third chunk_sizes are switched
|
||||
decay_mask = ggml_cont_4d(ctx0, ggml_permute(ctx0, decay_mask, 2, 1, 0, 3), S_k, chunk_size, chunk_size, CHB);
|
||||
|
||||
ggml_tensor * k_i = ggml_reshape_4d(ctx0, k, S_k, chunk_size, 1, CHB);
|
||||
ggml_tensor * k_j = ggml_reshape_4d(ctx0, k, S_k, 1, chunk_size, CHB);
|
||||
ggml_tensor * q_i = ggml_reshape_4d(ctx0, q, S_k, chunk_size, 1, CHB);
|
||||
|
||||
ggml_tensor * decay_k_i = ggml_mul(ctx0, decay_mask, k_i);
|
||||
ggml_tensor * decay_q_i = ggml_mul(ctx0, decay_mask, q_i);
|
||||
|
||||
// decay_k_i [S.BT,BT,CHB] @ k_j [S,1,BT,CHB] = Akk [BT,1,BT,CHB]
|
||||
ggml_tensor * Akk = ggml_mul_mat(ctx0, decay_k_i, k_j);
|
||||
ggml_tensor * Aqk = ggml_mul_mat(ctx0, decay_q_i, k_j);
|
||||
Akk = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_4d(ctx0, Akk, chunk_size, chunk_size, n_chunks, HB)));
|
||||
Aqk = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_4d(ctx0, Aqk, chunk_size, chunk_size, n_chunks, HB)));
|
||||
cb(Akk, "Akk", il);
|
||||
cb(Aqk, "Aqk", il);
|
||||
|
||||
Akk = ggml_mul(ctx0, Akk, beta);
|
||||
Akk = ggml_neg(ctx0, ggml_mul(ctx0, Akk, causal_mask));
|
||||
cb(Akk, "attn_pre_solve", il);
|
||||
|
||||
Aqk = ggml_mul(ctx0, Aqk, diag_mask);
|
||||
Aqk = ggml_scale(ctx0, Aqk, scale); // scale q
|
||||
cb(Aqk, "Aqk_masked", il);
|
||||
|
||||
// for i in range(1, chunk_size):
|
||||
// row = attn[..., i, :i].clone()
|
||||
// sub = attn[..., :i, :i].clone()
|
||||
// attn[..., i, :i] = row + (row.unsqueeze(-1) * sub).sum(-2)
|
||||
// attn = attn + torch.eye(chunk_size, dtype=attn.dtype, device=attn.device)
|
||||
//
|
||||
// We reduce this to a linear triangular solve: AX = B, where B = attn, A = I - tril(A)
|
||||
ggml_tensor * attn_lower = ggml_mul(ctx0, Akk, causal_mask);
|
||||
ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, identity, attn_lower), attn_lower);
|
||||
|
||||
ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, Akk, true, true, false);
|
||||
Akk = ggml_mul(ctx0, lin_solve, causal_mask);
|
||||
Akk = ggml_add(ctx0, Akk, identity);
|
||||
|
||||
cb(Akk, "attn_solved", il);
|
||||
|
||||
// switch back for downstream
|
||||
gk_cumsum = ggml_cont_4d(ctx0, ggml_permute(ctx0, gk_cumsum, 1, 0, 2, 3), S_k, chunk_size, n_chunks, HB);
|
||||
ggml_tensor * gkexp = ggml_exp(ctx0, gk_cumsum);
|
||||
cb(gk_cumsum, "gk_cumsum", il);
|
||||
|
||||
// u = (A*beta[..., None, :]) @ v aka U_[t]
|
||||
ggml_tensor * vb = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_beta)), Akk);
|
||||
|
||||
ggml_tensor * kbeta_gkexp = ggml_mul(ctx0, k_beta, gkexp);
|
||||
cb(kbeta_gkexp, "kbeta_gkexp", il);
|
||||
|
||||
ggml_tensor * k_cumdecay = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, kbeta_gkexp)), Akk);
|
||||
cb(k_cumdecay, "k_cumdecay", il);
|
||||
|
||||
ggml_tensor * core_attn_out = nullptr;
|
||||
ggml_tensor * new_state = ggml_dup(ctx0, state);
|
||||
|
||||
cb(new_state, "new_state", il);
|
||||
|
||||
for (int64_t chunk = 0; chunk < n_chunks; chunk++) {
|
||||
// extract one chunk worth of data
|
||||
auto chunkify = [=](ggml_tensor * t) {
|
||||
return ggml_cont(ctx0, ggml_view_4d(ctx0, t, t->ne[0], chunk_size, 1, t->ne[3],
|
||||
t->nb[1], t->nb[2], t->nb[3], t->nb[2] * chunk));
|
||||
};
|
||||
auto chunkify_A = [=](ggml_tensor * t) {
|
||||
return ggml_cont(ctx0, ggml_view_4d(ctx0, t, chunk_size, chunk_size, 1, t->ne[3],
|
||||
t->nb[1], t->nb[2], t->nb[3], t->nb[2] * chunk));
|
||||
};
|
||||
|
||||
|
||||
// k [S,BT,NT,H*B] => k_chunk [S,BT,1,H*B]
|
||||
ggml_tensor * k_chunk = chunkify(k);
|
||||
ggml_tensor * q_chunk = chunkify(q);
|
||||
ggml_tensor * vb_chunk = chunkify(vb);
|
||||
|
||||
// gk_cumsum [S,BT,NT,H*B] => gk_cs_chunk [S,BT,1,H*B]
|
||||
ggml_tensor * gk_cs_chunk = chunkify(gk_cumsum);
|
||||
ggml_tensor * k_cumdecay_chunk = chunkify(k_cumdecay);
|
||||
ggml_tensor * gkexp_chunk = ggml_exp(ctx0, gk_cs_chunk);
|
||||
ggml_tensor * Aqk_chunk = chunkify_A(Aqk);
|
||||
|
||||
ggml_tensor * state_t = ggml_cont_4d(ctx0, ggml_permute(ctx0, new_state, 1, 0, 2, 3), S_v, S_v, 1, H_v * n_seqs);
|
||||
|
||||
// new_state [S,S,1,H*B] k_cumdecay_chunk [S,BT,1,H*B]
|
||||
// v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state or W_[t] @ S_[t]
|
||||
ggml_tensor * v_prime = ggml_mul_mat(ctx0, state_t, k_cumdecay_chunk);
|
||||
|
||||
// v_new = v_i - v_prime or U_[t] - W_[t]*S_[t]
|
||||
ggml_tensor * v_new = ggml_sub(ctx0, ggml_repeat(ctx0, vb_chunk, v_prime), v_prime);
|
||||
ggml_tensor * v_new_t = ggml_cont(ctx0, ggml_transpose(ctx0, v_new));
|
||||
|
||||
// q_chunk [S,BT,1,H*B] gkexp_chunk [S,BT,1,H*B]
|
||||
// attn_inter = (q_i * g[:, :, i, :, None].exp()) @ last_recurrent_state
|
||||
// or Gamma_[t]*Q_]t] @ S
|
||||
ggml_tensor * q_gk_exp = ggml_mul(ctx0, q_chunk, gkexp_chunk);
|
||||
ggml_tensor * attn_inter = ggml_mul_mat(ctx0, state_t, q_gk_exp);
|
||||
attn_inter = ggml_scale(ctx0, attn_inter, scale); // scale q
|
||||
|
||||
// v_new_t [S,BT,1,H*B] Aqk [BT,BT,1,H*B]
|
||||
// core_attn_out[:, :, i] = attn_inter + attn @ v_new or A' @ (U_[t] - W_[t]*S_[t])
|
||||
ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, Aqk_chunk);
|
||||
|
||||
// o[:, :, i] = (q_i * g_i.exp()) @ S + A @ v_i
|
||||
ggml_tensor * core_attn_out_chunk = ggml_add(ctx0, attn_inter, v_attn);
|
||||
|
||||
core_attn_out = core_attn_out == nullptr ? core_attn_out_chunk : ggml_concat(ctx0, core_attn_out, core_attn_out_chunk, 1);
|
||||
|
||||
ggml_tensor * gk_cum_last =
|
||||
ggml_cont(ctx0, ggml_view_4d(ctx0, gk_cs_chunk, gk_cs_chunk->ne[0], 1, gk_cs_chunk->ne[2], gk_cs_chunk->ne[3],
|
||||
gk_cs_chunk->nb[1], gk_cs_chunk->nb[2], gk_cs_chunk->nb[3],
|
||||
gk_cs_chunk->nb[1] * (gk_cs_chunk->ne[1] - 1)));
|
||||
|
||||
ggml_tensor * gkexp_last = ggml_exp(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, gk_cum_last)));
|
||||
|
||||
ggml_tensor * gk_diff = ggml_neg(ctx0, ggml_sub(ctx0, gk_cs_chunk, gk_cum_last));
|
||||
|
||||
ggml_tensor * gk_diff_exp = ggml_exp(ctx0, gk_diff);
|
||||
|
||||
ggml_tensor * key_gkdiff = ggml_mul(ctx0, k_chunk, gk_diff_exp);
|
||||
|
||||
// rearrange((g_i[:,:,-1:] - g_i).exp()*k_i, 'b h c k -> b h k c') @ (U_[t] - W_[t] @ S)
|
||||
ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, v_new_t, ggml_cont(ctx0, ggml_transpose(ctx0, key_gkdiff)));
|
||||
|
||||
new_state = ggml_add(ctx0,
|
||||
ggml_mul(ctx0, new_state, ggml_reshape_4d(ctx0, gkexp_last, gkexp_last->ne[0], gkexp_last->ne[1], H_v, n_seqs)),
|
||||
ggml_reshape_4d(ctx0, kgdmulvnew, kgdmulvnew->ne[0], kgdmulvnew->ne[1], H_v, n_seqs));
|
||||
}
|
||||
|
||||
core_attn_out = ggml_cont_4d(ctx0, core_attn_out, S_v, chunk_size * n_chunks, H_v, n_seqs);
|
||||
|
||||
// truncate padded tokens
|
||||
ggml_tensor * output_tokens = ggml_view_4d(ctx0, core_attn_out,
|
||||
S_v, n_tokens, H_v, n_seqs,
|
||||
ggml_row_size(core_attn_out->type, S_v),
|
||||
ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks),
|
||||
ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks * H_v), 0);
|
||||
output_tokens = ggml_cont(ctx0, output_tokens);
|
||||
// permute back to (S_v, H_v, n_tokens, n_seqs)
|
||||
output_tokens = ggml_permute(ctx0, output_tokens, 0, 2, 1, 3);
|
||||
output_tokens = ggml_cont(ctx0, output_tokens);
|
||||
|
||||
cb(new_state, "output_state", il);
|
||||
|
||||
return {output_tokens, new_state};
|
||||
}
|
||||
|
||||
std::pair<ggml_tensor *, ggml_tensor *> llm_build_kimi_linear::build_kda_autoregressive(
|
||||
ggml_tensor * q,
|
||||
ggml_tensor * k,
|
||||
ggml_tensor * v,
|
||||
ggml_tensor * gk,
|
||||
ggml_tensor * beta,
|
||||
ggml_tensor * state,
|
||||
int il) {
|
||||
GGML_ASSERT(ggml_is_contiguous(v));
|
||||
GGML_ASSERT(ggml_is_contiguous(gk));
|
||||
|
||||
const int64_t S_k = q->ne[0];
|
||||
const int64_t H_k = q->ne[1];
|
||||
const int64_t n_tokens = q->ne[2];
|
||||
const int64_t n_seqs = q->ne[3];
|
||||
|
||||
const int64_t S_v = v->ne[0];
|
||||
const int64_t H_v = v->ne[1];
|
||||
|
||||
GGML_ASSERT(n_tokens == 1);
|
||||
GGML_ASSERT(v->ne[2] == n_tokens);
|
||||
GGML_ASSERT(k->ne[2] == n_tokens);
|
||||
GGML_ASSERT(gk->ne[0] == S_k && gk->ne[1] == H_k && gk->ne[2] == n_tokens && gk->ne[3] == n_seqs);
|
||||
GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs);
|
||||
GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_k && state->ne[2] == H_v && state->ne[3] == n_seqs);
|
||||
|
||||
GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs);
|
||||
GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs);
|
||||
|
||||
GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case
|
||||
|
||||
const float eps_norm = hparams.f_norm_rms_eps;
|
||||
|
||||
q = ggml_l2_norm(ctx0, q, eps_norm);
|
||||
k = ggml_l2_norm(ctx0, k, eps_norm);
|
||||
|
||||
const float scale = 1.0f / sqrtf(S_v);
|
||||
|
||||
q = ggml_scale(ctx0, q, scale);
|
||||
beta = ggml_sigmoid(ctx0, beta);
|
||||
|
||||
cb(q, "q_in", il);
|
||||
cb(k, "k_in", il);
|
||||
cb(v, "v_in", il);
|
||||
cb(beta, "beta_in", il);
|
||||
cb(gk, "gk_in", il);
|
||||
|
||||
// g [H,1,B,1] g_t [1,H,B,1] => [1,1,H,B]
|
||||
// gk [S,H,1,B] => [S,1,H,B] gk_t [1,S,H,B]
|
||||
// beta [H,1,1,B] beta_t [1,H,1,B] => [1,1,H,B]
|
||||
gk = ggml_reshape_4d(ctx0, gk, S_k, 1, H_k, n_seqs);
|
||||
ggml_tensor * gk_t = ggml_cont(ctx0, ggml_transpose(ctx0, gk));
|
||||
ggml_tensor * beta_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, beta), 1, 1, H_k, n_seqs);
|
||||
|
||||
// Apply exponential to gk_t
|
||||
gk_t = ggml_exp(ctx0, gk_t);
|
||||
// Apply the gated delta rule for the single timestep
|
||||
// last_recurrent_state = last_recurrent_state * gk_t
|
||||
// S = S * g_i[..., None].exp()
|
||||
state = ggml_mul(ctx0, state, gk_t);
|
||||
|
||||
ggml_tensor * state_t = ggml_cont(ctx0, ggml_transpose(ctx0, state));
|
||||
|
||||
// state [S,S,H,B] k [S,1,H,B] k_state [S_v,1,H,B]
|
||||
k = ggml_reshape_4d(ctx0, k, S_k, 1, H_k, n_seqs);
|
||||
ggml_tensor * k_state = ggml_mul_mat(ctx0, state_t, k);
|
||||
|
||||
// v_i - (k_i[..., None] * S).sum(-2)
|
||||
v = ggml_reshape_4d(ctx0, v, S_v, 1, H_v, n_seqs);
|
||||
ggml_tensor * v_diff = ggml_sub(ctx0, v, k_state);
|
||||
|
||||
// b_i[..., None] * k_i
|
||||
ggml_tensor * k_beta = ggml_mul(ctx0, k, beta_t);
|
||||
|
||||
// S = S + torch.einsum('b h k, b h v -> b h k v', b_i[..., None] * k_i, v_i - (k_i[..., None] * S).sum(-2))
|
||||
// v_diff_t [1,S_v,H,B] k_beta_t [1,S_k,H,B] state [S_v,S_k,H,B]
|
||||
state = ggml_add(ctx0, state, ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_diff)), ggml_cont(ctx0, ggml_transpose(ctx0, k_beta))));
|
||||
|
||||
q = ggml_reshape_4d(ctx0, q, S_k, 1, H_k, n_seqs);
|
||||
state_t = ggml_cont(ctx0, ggml_transpose(ctx0, state));
|
||||
ggml_tensor * core_attn_out = ggml_mul_mat(ctx0, state_t, q);
|
||||
// core_attn_out should be [S_v, 1, H_v, n_seqs] after this
|
||||
cb(core_attn_out, "output_tokens", il);
|
||||
cb(state, "new_state", il);
|
||||
|
||||
return {core_attn_out, state};
|
||||
}
|
||||
|
||||
|
||||
@@ -1,18 +1,149 @@
|
||||
#include "models.h"
|
||||
|
||||
#include "../llama-memory-hybrid-iswa.h"
|
||||
#include "../llama-memory-hybrid.h"
|
||||
|
||||
template <bool iswa>
|
||||
llm_build_lfm2<iswa>::llm_build_lfm2(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_graph_context(params) {
|
||||
using inp_hybrid_type = std::conditional_t<iswa, llm_graph_input_mem_hybrid_iswa, llm_graph_input_mem_hybrid>;
|
||||
using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
|
||||
using mem_hybrid_ctx = std::conditional_t<iswa, llama_memory_hybrid_iswa_context, llama_memory_hybrid_context>;
|
||||
|
||||
llm_build_lfm2::llm_build_lfm2(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_graph_context(params),
|
||||
model(model) {
|
||||
// lambda helpers for readability
|
||||
auto build_dense_feed_forward = [&model, this](ggml_tensor * cur, int il) -> ggml_tensor * {
|
||||
GGML_ASSERT(!model.layers[il].ffn_up_b);
|
||||
GGML_ASSERT(!model.layers[il].ffn_gate_b);
|
||||
GGML_ASSERT(!model.layers[il].ffn_down_b);
|
||||
return build_ffn(cur,
|
||||
model.layers[il].ffn_up, NULL, NULL,
|
||||
model.layers[il].ffn_gate, NULL, NULL,
|
||||
model.layers[il].ffn_down, NULL, NULL,
|
||||
NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
|
||||
};
|
||||
auto build_moe_feed_forward = [&model, this](ggml_tensor * cur, int il) -> ggml_tensor * {
|
||||
return build_moe_ffn(cur,
|
||||
model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps,
|
||||
model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
|
||||
model.layers[il].ffn_exp_probs_b, n_expert, n_expert_used, LLM_FFN_SILU, true, false, 0.0,
|
||||
static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func), il);
|
||||
};
|
||||
auto build_attn_block = [&model, this](ggml_tensor * cur,
|
||||
ggml_tensor * inp_pos,
|
||||
inp_attn_type * inp_attn,
|
||||
int il) -> ggml_tensor * {
|
||||
GGML_ASSERT(hparams.n_embd_v_gqa(il) == hparams.n_embd_k_gqa(il));
|
||||
const auto n_embd_head = hparams.n_embd_head_v;
|
||||
const auto n_head_kv = hparams.n_head_kv(il);
|
||||
|
||||
auto * q = build_lora_mm(model.layers[il].wq, cur);
|
||||
cb(q, "model.layers.{}.self_attn.q_proj", il);
|
||||
auto * k = build_lora_mm(model.layers[il].wk, cur);
|
||||
cb(k, "model.layers.{}.self_attn.k_proj", il);
|
||||
auto * v = build_lora_mm(model.layers[il].wv, cur);
|
||||
cb(v, "model.layers.{}.self_attn.v_proj", il);
|
||||
|
||||
q = ggml_reshape_3d(ctx0, q, n_embd_head, n_head, n_tokens);
|
||||
k = ggml_reshape_3d(ctx0, k, n_embd_head, n_head_kv, n_tokens);
|
||||
v = ggml_reshape_3d(ctx0, v, n_embd_head, n_head_kv, n_tokens);
|
||||
|
||||
// qk norm
|
||||
q = build_norm(q, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
||||
cb(q, "model.layers.{}.self_attn.q_layernorm", il);
|
||||
k = build_norm(k, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
|
||||
cb(k, "model.layers.{}.self_attn.k_layernorm", il);
|
||||
|
||||
// RoPE
|
||||
q = ggml_rope_ext(ctx0, q, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor,
|
||||
attn_factor, beta_fast, beta_slow);
|
||||
k = ggml_rope_ext(ctx0, k, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor,
|
||||
attn_factor, beta_fast, beta_slow);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, NULL,
|
||||
q, k, v, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
||||
|
||||
cb(cur, "model.layers.{}.self_attn.out_proj", il);
|
||||
|
||||
return cur;
|
||||
};
|
||||
auto build_shortconv_block = [&model, this](ggml_tensor * cur,
|
||||
llm_graph_input_rs * inp_recr,
|
||||
int il) -> ggml_tensor * {
|
||||
const auto * mctx_cur = static_cast<const mem_hybrid_ctx *>(mctx)->get_recr();
|
||||
const uint32_t kv_head = mctx_cur->get_head();
|
||||
const int64_t n_seq_tokens = ubatch.n_seq_tokens;
|
||||
const int64_t n_seqs = ubatch.n_seqs;
|
||||
GGML_ASSERT(n_seqs != 0);
|
||||
GGML_ASSERT(ubatch.equal_seqs());
|
||||
GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
|
||||
|
||||
GGML_ASSERT(hparams.n_shortconv_l_cache > 1);
|
||||
const uint32_t d_conv = hparams.n_shortconv_l_cache - 1;
|
||||
|
||||
// {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
|
||||
cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
|
||||
|
||||
auto * bcx = build_lora_mm(model.layers[il].shortconv.in_proj, cur);
|
||||
cb(bcx, "model.layers.{}.conv.in_proj", il);
|
||||
|
||||
constexpr auto n_chunks = 3;
|
||||
GGML_ASSERT(bcx->ne[0] % n_chunks == 0);
|
||||
const auto chunk_size = bcx->ne[0] / n_chunks;
|
||||
auto * b = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2],
|
||||
0 * chunk_size * ggml_element_size(bcx));
|
||||
auto * c = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2],
|
||||
1 * chunk_size * ggml_element_size(bcx));
|
||||
auto * x = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2],
|
||||
2 * chunk_size * ggml_element_size(bcx));
|
||||
|
||||
auto * bx = ggml_transpose(ctx0, ggml_mul(ctx0, b, x));
|
||||
|
||||
// read conv state
|
||||
auto * conv_state = mctx_cur->get_r_l(il);
|
||||
auto * conv_rs = build_rs(inp_recr, conv_state, hparams.n_embd_r(), n_seqs);
|
||||
auto * conv = ggml_reshape_3d(ctx0, conv_rs, d_conv, hparams.n_embd, n_seqs);
|
||||
|
||||
bx = ggml_concat(ctx0, conv, bx, 0);
|
||||
GGML_ASSERT(bx->ne[0] > conv->ne[0]);
|
||||
|
||||
// last d_conv columns is a new conv state
|
||||
auto * new_conv = ggml_view_3d(ctx0, bx, conv->ne[0], bx->ne[1], bx->ne[2], bx->nb[1], bx->nb[2],
|
||||
(bx->ne[0] - conv->ne[0]) * ggml_element_size(bx));
|
||||
GGML_ASSERT(ggml_are_same_shape(conv, new_conv));
|
||||
|
||||
// write new conv conv state
|
||||
ggml_build_forward_expand(gf, ggml_cpy(ctx0, new_conv,
|
||||
ggml_view_1d(ctx0, conv_state, ggml_nelements(new_conv),
|
||||
kv_head * d_conv * n_embd * ggml_element_size(new_conv))));
|
||||
|
||||
auto * conv_kernel = model.layers[il].shortconv.conv;
|
||||
auto * conv_out = ggml_ssm_conv(ctx0, bx, conv_kernel);
|
||||
cb(conv_out, "model.layers.{}.conv.conv", il);
|
||||
|
||||
auto * y = ggml_mul(ctx0, c, conv_out);
|
||||
y = build_lora_mm(model.layers[il].shortconv.out_proj, y);
|
||||
cb(y, "model.layers.{}.conv.out_proj", il);
|
||||
// {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
|
||||
y = ggml_reshape_2d(ctx0, y, y->ne[0], n_seq_tokens * n_seqs);
|
||||
|
||||
return y;
|
||||
};
|
||||
|
||||
// actual graph construction starts here
|
||||
ggml_tensor * cur = build_inp_embd(model.tok_embd);
|
||||
cb(cur, "model.embed_tokens", -1);
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
inp_hybrid_type * inp_hybrid = nullptr;
|
||||
if constexpr (iswa) {
|
||||
inp_hybrid = build_inp_mem_hybrid_iswa();
|
||||
} else {
|
||||
inp_hybrid = build_inp_mem_hybrid();
|
||||
}
|
||||
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
auto * inp_hybrid = build_inp_mem_hybrid();
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
@@ -54,122 +185,6 @@ llm_build_lfm2::llm_build_lfm2(const llama_model & model, const llm_graph_params
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
}
|
||||
|
||||
ggml_tensor * llm_build_lfm2::build_moe_feed_forward(ggml_tensor * cur, int il) const {
|
||||
return build_moe_ffn(cur,
|
||||
model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps,
|
||||
model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
|
||||
model.layers[il].ffn_exp_probs_b, n_expert, n_expert_used, LLM_FFN_SILU, true, false, 0.0,
|
||||
static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func), il);
|
||||
}
|
||||
|
||||
ggml_tensor * llm_build_lfm2::build_dense_feed_forward(ggml_tensor * cur, int il) const {
|
||||
GGML_ASSERT(!model.layers[il].ffn_up_b);
|
||||
GGML_ASSERT(!model.layers[il].ffn_gate_b);
|
||||
GGML_ASSERT(!model.layers[il].ffn_down_b);
|
||||
return build_ffn(cur,
|
||||
model.layers[il].ffn_up, NULL, NULL,
|
||||
model.layers[il].ffn_gate, NULL, NULL,
|
||||
model.layers[il].ffn_down, NULL, NULL,
|
||||
NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
|
||||
}
|
||||
|
||||
ggml_tensor * llm_build_lfm2::build_attn_block(ggml_tensor * cur,
|
||||
ggml_tensor * inp_pos,
|
||||
llm_graph_input_attn_kv * inp_attn,
|
||||
int il) const {
|
||||
GGML_ASSERT(hparams.n_embd_v_gqa(il) == hparams.n_embd_k_gqa(il));
|
||||
const auto n_embd_head = hparams.n_embd_head_v;
|
||||
const auto n_head_kv = hparams.n_head_kv(il);
|
||||
|
||||
auto * q = build_lora_mm(model.layers[il].wq, cur);
|
||||
cb(q, "model.layers.{}.self_attn.q_proj", il);
|
||||
auto * k = build_lora_mm(model.layers[il].wk, cur);
|
||||
cb(k, "model.layers.{}.self_attn.k_proj", il);
|
||||
auto * v = build_lora_mm(model.layers[il].wv, cur);
|
||||
cb(v, "model.layers.{}.self_attn.v_proj", il);
|
||||
|
||||
q = ggml_reshape_3d(ctx0, q, n_embd_head, n_head, n_tokens);
|
||||
k = ggml_reshape_3d(ctx0, k, n_embd_head, n_head_kv, n_tokens);
|
||||
v = ggml_reshape_3d(ctx0, v, n_embd_head, n_head_kv, n_tokens);
|
||||
|
||||
// qk norm
|
||||
q = build_norm(q, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
||||
cb(q, "model.layers.{}.self_attn.q_layernorm", il);
|
||||
k = build_norm(k, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
|
||||
cb(k, "model.layers.{}.self_attn.k_layernorm", il);
|
||||
|
||||
// RoPE
|
||||
q = ggml_rope_ext(ctx0, q, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor,
|
||||
attn_factor, beta_fast, beta_slow);
|
||||
k = ggml_rope_ext(ctx0, k, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor,
|
||||
attn_factor, beta_fast, beta_slow);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, NULL,
|
||||
q, k, v, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
||||
|
||||
cb(cur, "model.layers.{}.self_attn.out_proj", il);
|
||||
|
||||
return cur;
|
||||
}
|
||||
|
||||
ggml_tensor * llm_build_lfm2::build_shortconv_block(ggml_tensor * cur, llm_graph_input_rs * inp_recr, int il) {
|
||||
const auto * mctx_cur = static_cast<const llama_memory_hybrid_context *>(mctx)->get_recr();
|
||||
const uint32_t kv_head = mctx_cur->get_head();
|
||||
const int64_t n_seq_tokens = ubatch.n_seq_tokens;
|
||||
const int64_t n_seqs = ubatch.n_seqs;
|
||||
GGML_ASSERT(n_seqs != 0);
|
||||
GGML_ASSERT(ubatch.equal_seqs());
|
||||
GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
|
||||
|
||||
GGML_ASSERT(hparams.n_shortconv_l_cache > 1);
|
||||
const uint32_t d_conv = hparams.n_shortconv_l_cache - 1;
|
||||
|
||||
// {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
|
||||
cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
|
||||
|
||||
auto * bcx = build_lora_mm(model.layers[il].shortconv.in_proj, cur);
|
||||
cb(bcx, "model.layers.{}.conv.in_proj", il);
|
||||
|
||||
constexpr auto n_chunks = 3;
|
||||
GGML_ASSERT(bcx->ne[0] % n_chunks == 0);
|
||||
const auto chunk_size = bcx->ne[0] / n_chunks;
|
||||
auto * b = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2],
|
||||
0 * chunk_size * ggml_element_size(bcx));
|
||||
auto * c = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2],
|
||||
1 * chunk_size * ggml_element_size(bcx));
|
||||
auto * x = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2],
|
||||
2 * chunk_size * ggml_element_size(bcx));
|
||||
|
||||
auto * bx = ggml_transpose(ctx0, ggml_mul(ctx0, b, x));
|
||||
|
||||
// read conv state
|
||||
auto * conv_state = mctx_cur->get_r_l(il);
|
||||
auto * conv_rs = build_rs(inp_recr, conv_state, hparams.n_embd_r(), n_seqs);
|
||||
auto * conv = ggml_reshape_3d(ctx0, conv_rs, d_conv, hparams.n_embd, n_seqs);
|
||||
|
||||
bx = ggml_concat(ctx0, conv, bx, 0);
|
||||
GGML_ASSERT(bx->ne[0] > conv->ne[0]);
|
||||
|
||||
// last d_conv columns is a new conv state
|
||||
auto * new_conv = ggml_view_3d(ctx0, bx, conv->ne[0], bx->ne[1], bx->ne[2], bx->nb[1], bx->nb[2],
|
||||
(bx->ne[0] - conv->ne[0]) * ggml_element_size(bx));
|
||||
GGML_ASSERT(ggml_are_same_shape(conv, new_conv));
|
||||
|
||||
// write new conv conv state
|
||||
ggml_build_forward_expand(gf, ggml_cpy(ctx0, new_conv,
|
||||
ggml_view_1d(ctx0, conv_state, ggml_nelements(new_conv),
|
||||
kv_head * d_conv * n_embd * ggml_element_size(new_conv))));
|
||||
|
||||
auto * conv_kernel = model.layers[il].shortconv.conv;
|
||||
auto * conv_out = ggml_ssm_conv(ctx0, bx, conv_kernel);
|
||||
cb(conv_out, "model.layers.{}.conv.conv", il);
|
||||
|
||||
auto * y = ggml_mul(ctx0, c, conv_out);
|
||||
y = build_lora_mm(model.layers[il].shortconv.out_proj, y);
|
||||
cb(y, "model.layers.{}.conv.out_proj", il);
|
||||
// {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
|
||||
y = ggml_reshape_2d(ctx0, y, y->ne[0], n_seq_tokens * n_seqs);
|
||||
|
||||
return y;
|
||||
}
|
||||
// Explicit template instantiations
|
||||
template struct llm_build_lfm2<true>;
|
||||
template struct llm_build_lfm2<false>;
|
||||
|
||||
@@ -190,6 +190,10 @@ struct llm_build_ernie4_5_moe : public llm_graph_context {
|
||||
llm_build_ernie4_5_moe(const llama_model & model, const llm_graph_params & params);
|
||||
};
|
||||
|
||||
struct llm_build_paddleocr : public llm_graph_context {
|
||||
llm_build_paddleocr(const llama_model & model, const llm_graph_params & params);
|
||||
};
|
||||
|
||||
template <bool iswa>
|
||||
struct llm_build_exaone4 : public llm_graph_context {
|
||||
llm_build_exaone4(const llama_model & model, const llm_graph_params & params);
|
||||
@@ -316,12 +320,15 @@ struct llm_build_jais : public llm_graph_context {
|
||||
llm_build_jais(const llama_model & model, const llm_graph_params & params);
|
||||
};
|
||||
|
||||
struct llm_build_jais2 : public llm_graph_context {
|
||||
llm_build_jais2(const llama_model & model, const llm_graph_params & params);
|
||||
};
|
||||
|
||||
struct llm_build_jamba : public llm_build_mamba_base {
|
||||
llm_build_jamba(const llama_model & model, const llm_graph_params & params);
|
||||
};
|
||||
|
||||
// TODO: derive llm_build_delta_net_base instead
|
||||
struct llm_build_kimi_linear : public llm_build_mamba_base {
|
||||
struct llm_build_kimi_linear : public llm_build_delta_net_base {
|
||||
llm_build_kimi_linear(const llama_model & model, const llm_graph_params & params);
|
||||
|
||||
std::pair<ggml_tensor *, ggml_tensor *> build_kda_autoregressive(
|
||||
@@ -348,15 +355,9 @@ struct llm_build_kimi_linear : public llm_build_mamba_base {
|
||||
const llama_model & model;
|
||||
};
|
||||
|
||||
template <bool iswa>
|
||||
struct llm_build_lfm2 : public llm_graph_context {
|
||||
const llama_model & model;
|
||||
|
||||
llm_build_lfm2(const llama_model & model, const llm_graph_params & params);
|
||||
ggml_tensor * build_moe_feed_forward(ggml_tensor * cur, int il) const;
|
||||
ggml_tensor * build_dense_feed_forward(ggml_tensor * cur, int il) const;
|
||||
ggml_tensor * build_attn_block(ggml_tensor * cur, ggml_tensor * inp_pos, llm_graph_input_attn_kv * inp_attn, int il) const;
|
||||
ggml_tensor * build_shortconv_block(ggml_tensor * cur, llm_graph_input_rs * inp_recr, int il);
|
||||
|
||||
};
|
||||
|
||||
struct llm_build_llada : public llm_graph_context {
|
||||
@@ -542,8 +543,7 @@ private:
|
||||
const llama_model & model;
|
||||
};
|
||||
|
||||
// TODO: derive llm_build_delta_net_base instead
|
||||
struct llm_build_qwen35 : public llm_graph_context {
|
||||
struct llm_build_qwen35 : public llm_build_delta_net_base {
|
||||
llm_build_qwen35(const llama_model & model, const llm_graph_params & params);
|
||||
private:
|
||||
ggml_tensor * build_layer_attn(
|
||||
@@ -556,39 +556,12 @@ private:
|
||||
ggml_tensor * build_layer_attn_linear(
|
||||
llm_graph_input_rs * inp,
|
||||
ggml_tensor * cur,
|
||||
ggml_tensor * causal_mask,
|
||||
ggml_tensor * identity,
|
||||
ggml_tensor * diag_mask,
|
||||
int il);
|
||||
|
||||
|
||||
ggml_tensor * build_layer_ffn(
|
||||
ggml_tensor * cur,
|
||||
int il);
|
||||
|
||||
// returns pair of output and new state
|
||||
std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_chunking(
|
||||
ggml_tensor * q,
|
||||
ggml_tensor * k,
|
||||
ggml_tensor * v,
|
||||
ggml_tensor * g,
|
||||
ggml_tensor * beta,
|
||||
ggml_tensor * state,
|
||||
ggml_tensor * causal_mask,
|
||||
ggml_tensor * identity,
|
||||
ggml_tensor * diag_mask,
|
||||
int il);
|
||||
|
||||
// returns pair of output and new state
|
||||
std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_autoregressive(
|
||||
ggml_tensor * q,
|
||||
ggml_tensor * k,
|
||||
ggml_tensor * v,
|
||||
ggml_tensor * g,
|
||||
ggml_tensor * beta,
|
||||
ggml_tensor * state,
|
||||
int il);
|
||||
|
||||
ggml_tensor * build_norm_gated(
|
||||
ggml_tensor * input,
|
||||
ggml_tensor * weights,
|
||||
@@ -604,7 +577,7 @@ private:
|
||||
};
|
||||
|
||||
// TODO: derive llm_build_delta_net_base instead
|
||||
struct llm_build_qwen35moe : public llm_graph_context {
|
||||
struct llm_build_qwen35moe : public llm_build_delta_net_base {
|
||||
llm_build_qwen35moe(const llama_model & model, const llm_graph_params & params);
|
||||
private:
|
||||
ggml_tensor * build_layer_attn(
|
||||
@@ -617,38 +590,12 @@ private:
|
||||
ggml_tensor * build_layer_attn_linear(
|
||||
llm_graph_input_rs * inp,
|
||||
ggml_tensor * cur,
|
||||
ggml_tensor * causal_mask,
|
||||
ggml_tensor * identity,
|
||||
ggml_tensor * diag_mask,
|
||||
int il);
|
||||
|
||||
ggml_tensor * build_layer_ffn(
|
||||
ggml_tensor * cur,
|
||||
int il);
|
||||
|
||||
// returns pair of output and new state
|
||||
std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_chunking(
|
||||
ggml_tensor * q,
|
||||
ggml_tensor * k,
|
||||
ggml_tensor * v,
|
||||
ggml_tensor * g,
|
||||
ggml_tensor * beta,
|
||||
ggml_tensor * state,
|
||||
ggml_tensor * causal_mask,
|
||||
ggml_tensor * identity,
|
||||
ggml_tensor * diag_mask,
|
||||
int il);
|
||||
|
||||
// returns pair of output and new state
|
||||
std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_autoregressive(
|
||||
ggml_tensor * q,
|
||||
ggml_tensor * k,
|
||||
ggml_tensor * v,
|
||||
ggml_tensor * g,
|
||||
ggml_tensor * beta,
|
||||
ggml_tensor * state,
|
||||
int il);
|
||||
|
||||
ggml_tensor * build_norm_gated(
|
||||
ggml_tensor * input,
|
||||
ggml_tensor * weights,
|
||||
|
||||
@@ -104,13 +104,6 @@ llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, const ll
|
||||
LLM_NORM, -1);
|
||||
cb(cur, "final_norm_out", -1);
|
||||
|
||||
if (hparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
|
||||
// extracting cls token
|
||||
cur = ggml_view_1d(ctx0, cur, hparams.n_embd, 0);
|
||||
cb(cur, "cls_pooled_embd", -1);
|
||||
}
|
||||
|
||||
cb(cur, "res_embd", -1);
|
||||
res->t_embd = cur;
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
}
|
||||
|
||||
122
src/models/paddleocr.cpp
Normal file
122
src/models/paddleocr.cpp
Normal file
@@ -0,0 +1,122 @@
|
||||
#include "models.h"
|
||||
|
||||
llm_build_paddleocr::llm_build_paddleocr(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_graph_context(params) {
|
||||
|
||||
// NOTE: same with qwen2vl.cpp, but bias tensors are optional
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
||||
inpL = build_inp_embd(model.tok_embd);
|
||||
|
||||
int sections[4];
|
||||
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv();
|
||||
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
|
||||
// norm
|
||||
{
|
||||
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
}
|
||||
// self-attention
|
||||
{
|
||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||
cb(Qcur, "Qcur", il);
|
||||
if (model.layers[il].bq) {
|
||||
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
||||
cb(Qcur, "Qcur", il);
|
||||
}
|
||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
||||
cb(Kcur, "Kcur", il);
|
||||
if (model.layers[il].bk) {
|
||||
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
||||
cb(Kcur, "Kcur", il);
|
||||
}
|
||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
||||
cb(Vcur, "Vcur", il);
|
||||
if (model.layers[il].bv) {
|
||||
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
||||
cb(Vcur, "Vcur", il);
|
||||
}
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||
|
||||
Qcur = ggml_rope_multi(
|
||||
ctx0, Qcur, inp_pos, nullptr,
|
||||
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
|
||||
Kcur = ggml_rope_multi(
|
||||
ctx0, Kcur, inp_pos, nullptr,
|
||||
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
|
||||
cb(Qcur, "Qcur", il);
|
||||
cb(Kcur, "Kcur", il);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||
}
|
||||
if (il == n_layer - 1) {
|
||||
// skip computing output for unused tokens
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
||||
}
|
||||
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
||||
cb(ffn_inp, "ffn_inp", il);
|
||||
|
||||
// feed-forward network
|
||||
{
|
||||
cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
|
||||
cb(cur, "ffn_norm", il);
|
||||
|
||||
cur = build_ffn(cur,
|
||||
model.layers[il].ffn_up, NULL, NULL,
|
||||
model.layers[il].ffn_gate, NULL, NULL,
|
||||
model.layers[il].ffn_down, NULL, NULL,
|
||||
NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
}
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
|
||||
cur = build_cvec(cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
cur = inpL;
|
||||
|
||||
cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
|
||||
|
||||
cb(cur, "result_norm", -1);
|
||||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
}
|
||||
@@ -2,10 +2,8 @@
|
||||
|
||||
#include "llama-memory-recurrent.h"
|
||||
|
||||
#define CHUNK_SIZE 64
|
||||
|
||||
llm_build_qwen35::llm_build_qwen35(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_graph_context(params), model(model) {
|
||||
llm_build_delta_net_base(params), model(model) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
@@ -25,17 +23,6 @@ llm_build_qwen35::llm_build_qwen35(const llama_model & model, const llm_graph_pa
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
|
||||
ggml_tensor * causal_mask =
|
||||
ggml_tri(ctx0, ggml_fill(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, CHUNK_SIZE, CHUNK_SIZE), 1.0f),
|
||||
GGML_TRI_TYPE_LOWER);
|
||||
|
||||
ggml_tensor * identity = ggml_diag(ctx0, ggml_fill(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, CHUNK_SIZE), 1.0f));
|
||||
ggml_tensor * diag_mask = ggml_add(ctx0, causal_mask, identity);
|
||||
|
||||
ggml_build_forward_expand(gf, causal_mask);
|
||||
ggml_build_forward_expand(gf, identity);
|
||||
ggml_build_forward_expand(gf, diag_mask);
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
|
||||
@@ -45,7 +32,7 @@ llm_build_qwen35::llm_build_qwen35(const llama_model & model, const llm_graph_pa
|
||||
// Determine layer type and build appropriate attention mechanism
|
||||
if (hparams.is_recurrent(il)) {
|
||||
// Linear attention layer (gated delta net)
|
||||
cur = build_layer_attn_linear(inp->get_recr(), cur, causal_mask, identity, diag_mask, il);
|
||||
cur = build_layer_attn_linear(inp->get_recr(), cur, il);
|
||||
} else {
|
||||
// Full attention layer
|
||||
cur = build_layer_attn(inp->get_attn(), cur, inp_pos, sections, il);
|
||||
@@ -95,361 +82,6 @@ llm_build_qwen35::llm_build_qwen35(const llama_model & model, const llm_graph_pa
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
}
|
||||
|
||||
// utility to get one slice from the third dimension
|
||||
// input dim: [x, y, c, b]
|
||||
// output dim: [x, y, 1, b]
|
||||
static ggml_tensor * get_slice_2d(ggml_context * ctx0, ggml_tensor * t, int64_t c) {
|
||||
return ggml_view_4d(ctx0, t, t->ne[0], t->ne[1], 1, t->ne[3],
|
||||
t->nb[1], t->nb[2], t->nb[3], t->nb[2] * c);
|
||||
}
|
||||
|
||||
std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen35::build_delta_net_chunking(
|
||||
ggml_tensor * q,
|
||||
ggml_tensor * k,
|
||||
ggml_tensor * v,
|
||||
ggml_tensor * g,
|
||||
ggml_tensor * beta,
|
||||
ggml_tensor * state,
|
||||
ggml_tensor * causal_mask,
|
||||
ggml_tensor * identity,
|
||||
ggml_tensor * diag_mask,
|
||||
int il) {
|
||||
const int64_t S_k = q->ne[0];
|
||||
const int64_t H_k = q->ne[1];
|
||||
const int64_t n_tokens = q->ne[2];
|
||||
const int64_t n_seqs = q->ne[3];
|
||||
|
||||
const int64_t S_v = v->ne[0];
|
||||
const int64_t H_v = v->ne[1];
|
||||
|
||||
GGML_ASSERT(v->ne[2] == n_tokens);
|
||||
GGML_ASSERT(k->ne[2] == n_tokens);
|
||||
GGML_ASSERT(g->ne[0] == H_v && g->ne[1] == n_tokens && g->ne[2] == n_seqs);
|
||||
GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs);
|
||||
GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v * H_v && state->ne[2] == 1 && state->ne[3] == n_seqs);
|
||||
|
||||
GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs);
|
||||
GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs);
|
||||
|
||||
GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case
|
||||
|
||||
const float eps_norm = hparams.f_norm_rms_eps;
|
||||
|
||||
q = ggml_l2_norm(ctx0, q, eps_norm);
|
||||
k = ggml_l2_norm(ctx0, k, eps_norm);
|
||||
|
||||
const float scale = 1.0f / sqrtf(S_v);
|
||||
|
||||
q = ggml_scale(ctx0, q, scale);
|
||||
|
||||
beta = ggml_sigmoid(ctx0, beta);
|
||||
|
||||
cb(q, "q_in", il);
|
||||
cb(k, "k_in", il);
|
||||
cb(v, "v_in", il);
|
||||
cb(beta, "beta_in", il);
|
||||
cb(g, "g_in", il);
|
||||
|
||||
q = ggml_cont_4d(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
|
||||
k = ggml_cont_4d(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
|
||||
v = ggml_cont_4d(ctx0, ggml_permute(ctx0, v, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
|
||||
g = ggml_cont_4d(ctx0, ggml_permute(ctx0, g, 2, 0, 3, 1), n_tokens, 1, H_k, n_seqs);
|
||||
|
||||
beta = ggml_cont(ctx0, ggml_permute(ctx0, beta, 2, 0, 1, 3));
|
||||
state = ggml_reshape_4d(ctx0, state, S_v, S_v, H_v, n_seqs);
|
||||
|
||||
cb(q, "q_perm", il);
|
||||
cb(k, "k_perm", il);
|
||||
cb(v, "v_perm", il);
|
||||
cb(beta, "beta_perm", il);
|
||||
cb(g, "g_perm", il);
|
||||
cb(state, "state_in", il);
|
||||
|
||||
GGML_ASSERT(q->ne[1] == n_tokens && q->ne[0] == S_k && q->ne[2] == H_k && q->ne[3] == n_seqs);
|
||||
GGML_ASSERT(k->ne[1] == n_tokens && k->ne[0] == S_k && k->ne[2] == H_k && k->ne[3] == n_seqs);
|
||||
GGML_ASSERT(v->ne[1] == n_tokens && v->ne[0] == S_v && v->ne[2] == H_k && v->ne[3] == n_seqs);
|
||||
GGML_ASSERT(beta->ne[1] == n_tokens && beta->ne[2] == H_k && beta->ne[0] == 1 && beta->ne[3] == n_seqs);
|
||||
|
||||
// Do padding
|
||||
const int64_t chunk_size = CHUNK_SIZE;
|
||||
|
||||
const int64_t pad = (chunk_size - n_tokens % chunk_size) % chunk_size;
|
||||
const int64_t n_chunks = (n_tokens + pad) / chunk_size;
|
||||
|
||||
q = ggml_pad(ctx0, q, 0, pad, 0, 0);
|
||||
k = ggml_pad(ctx0, k, 0, pad, 0, 0);
|
||||
v = ggml_pad(ctx0, v, 0, pad, 0, 0);
|
||||
g = ggml_pad(ctx0, g, pad, 0, 0, 0);
|
||||
beta = ggml_pad(ctx0, beta, 0, pad, 0, 0);
|
||||
|
||||
cb(q, "q_pad", il);
|
||||
cb(k, "k_pad", il);
|
||||
cb(v, "v_pad", il);
|
||||
cb(beta, "beta_pad", il);
|
||||
cb(g, "g_pad", il);
|
||||
|
||||
ggml_tensor * v_beta = ggml_mul(ctx0, v, beta);
|
||||
ggml_tensor * k_beta = ggml_mul(ctx0, k, beta);
|
||||
|
||||
cb(v_beta, "v_beta", il);
|
||||
cb(k_beta, "k_beta", il);
|
||||
|
||||
q = ggml_reshape_4d(ctx0, q, S_k, chunk_size, n_chunks, H_k * n_seqs);
|
||||
k = ggml_reshape_4d(ctx0, k, S_k, chunk_size, n_chunks, H_k * n_seqs);
|
||||
k_beta = ggml_reshape_4d(ctx0, k_beta, S_k, chunk_size, n_chunks, H_k * n_seqs);
|
||||
v = ggml_reshape_4d(ctx0, v, S_v, chunk_size, n_chunks, H_v * n_seqs);
|
||||
v_beta = ggml_reshape_4d(ctx0, v_beta, S_v, chunk_size, n_chunks, H_v * n_seqs);
|
||||
|
||||
g = ggml_reshape_4d(ctx0, g, chunk_size, 1, n_chunks, H_k * n_seqs);
|
||||
beta = ggml_reshape_4d(ctx0, beta, 1, chunk_size, n_chunks, H_k * n_seqs);
|
||||
|
||||
ggml_tensor * g_cumsum = ggml_cumsum(ctx0, g);
|
||||
cb(g_cumsum, "g_cumsum", il); // shape: (chunk_size, 1, n_chunks, H_v * n_seqs)
|
||||
|
||||
ggml_tensor * gcs_i = g_cumsum; // ggml_reshape_4d(ctx0, g_cumsum, chunk_size, 1, n_chunks, H_v * n_seqs);
|
||||
ggml_tensor * gcs_j = ggml_reshape_4d(ctx0, g_cumsum, 1, chunk_size, n_chunks, H_v * n_seqs);
|
||||
|
||||
ggml_tensor * gcs_j_broadcast =
|
||||
ggml_repeat_4d(ctx0, gcs_j, chunk_size, chunk_size, n_chunks, H_v * n_seqs);
|
||||
|
||||
ggml_tensor * decay_mask = ggml_sub(ctx0, gcs_j_broadcast, gcs_i);
|
||||
cb(decay_mask, "decay_mask", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
|
||||
|
||||
decay_mask = ggml_mul(ctx0, decay_mask, diag_mask);
|
||||
decay_mask = ggml_exp(ctx0, decay_mask);
|
||||
decay_mask = ggml_mul(ctx0, decay_mask, diag_mask);
|
||||
|
||||
ggml_tensor * kmulkbeta = ggml_mul_mat(ctx0, k, k_beta);
|
||||
|
||||
ggml_tensor * k_decay = ggml_mul(ctx0, kmulkbeta, decay_mask);
|
||||
ggml_tensor * attn = ggml_neg(ctx0, ggml_mul(ctx0, k_decay, causal_mask));
|
||||
cb(attn, "attn_pre_solve", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
|
||||
|
||||
ggml_tensor * attn_lower = ggml_mul(ctx0, attn, causal_mask);
|
||||
ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, identity, attn_lower), attn_lower);
|
||||
|
||||
ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, attn, true, true, false);
|
||||
attn = ggml_mul(ctx0, lin_solve, causal_mask);
|
||||
attn = ggml_add(ctx0, attn, identity);
|
||||
cb(attn, "attn_solved", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
|
||||
|
||||
v = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_beta)), attn);
|
||||
|
||||
ggml_tensor * g_cumsum_t = ggml_cont(ctx0, ggml_transpose(ctx0, g_cumsum));
|
||||
ggml_tensor * gexp = ggml_exp(ctx0, g_cumsum_t);
|
||||
|
||||
ggml_tensor * kbeta_gexp = ggml_mul(ctx0, k_beta, gexp);
|
||||
cb(kbeta_gexp, "kbeta_gexp", il); // shape: (S_k, chunk_size, n_chunks, H_v * n_seqs)
|
||||
|
||||
ggml_tensor * k_cumdecay =
|
||||
ggml_cont(ctx0, ggml_transpose(ctx0, ggml_mul_mat(ctx0, attn, ggml_cont(ctx0, ggml_transpose(ctx0, kbeta_gexp)))));
|
||||
cb(k_cumdecay, "k_cumdecay", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
|
||||
|
||||
ggml_tensor * attn_kq = ggml_mul_mat(ctx0, k, q);
|
||||
attn_kq = ggml_mul(ctx0, attn_kq, decay_mask);
|
||||
attn_kq = ggml_mul(ctx0, attn_kq, diag_mask);
|
||||
cb(attn_kq, "attn_kq", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
|
||||
|
||||
|
||||
// vectorized calculation of key_gdiff
|
||||
// improved from the chunked version:
|
||||
// g_last = torch.clamp(g_cum[:, :, -1], max=50.0).exp().unsqueeze(-1).unsqueeze(-1)
|
||||
// g_diff = torch.clamp(g_cum[:, :, -1:] - g_cum, max=50.0).exp()
|
||||
// key_gdiff = key * g_diff.unsqueeze(-1)
|
||||
// kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new
|
||||
// last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew
|
||||
|
||||
// get last element in g_cumsum along chunk_size dimension (ne0)
|
||||
// example: [[x, y, z, ..., last], ...] -> [[last], ...]
|
||||
ggml_tensor * g_last = ggml_view_4d(ctx0, g_cumsum, 1, 1, g_cumsum->ne[2], g_cumsum->ne[3],
|
||||
g_cumsum->nb[1], g_cumsum->nb[2], g_cumsum->nb[3],
|
||||
(g_cumsum->ne[0] - 1) * ggml_element_size(g_cumsum));
|
||||
g_last = ggml_cont(ctx0, g_last);
|
||||
cb(g_last, "g_last", il); // shape: (1, 1, n_chunks, H_v * n_seqs)
|
||||
|
||||
ggml_tensor * g_last_exp = ggml_exp(ctx0, g_last);
|
||||
cb(g_last_exp, "g_last_exp", il); // shape: (1, 1, n_chunks, H_v * n_seqs)
|
||||
|
||||
ggml_tensor * g_diff = ggml_neg(ctx0, ggml_sub(ctx0, g_cumsum, g_last));
|
||||
cb(g_diff, "g_diff", il); // shape: (chunk_size, 1, n_chunks, H_v * n_seqs)
|
||||
|
||||
ggml_tensor * g_diff_exp = ggml_exp(ctx0, g_diff);
|
||||
ggml_tensor * g_diff_exp_t = ggml_reshape_4d(ctx0, g_diff_exp,
|
||||
1, chunk_size, n_chunks, g_diff_exp->ne[3]);
|
||||
|
||||
ggml_tensor * key_gdiff = ggml_mul(ctx0, k, g_diff_exp_t);
|
||||
cb(key_gdiff, "key_gdiff", il); // shape: (S_k, chunk_size, n_chunks, H_v * n_seqs)
|
||||
|
||||
ggml_tensor * key_gdiff_t = ggml_cont(ctx0, ggml_transpose(ctx0, key_gdiff));
|
||||
cb(key_gdiff_t, "key_gdiff_t", il); // shape: (chunk_size, S_k, n_chunks, H_v * n_seqs)
|
||||
|
||||
// state to be updated per chunk
|
||||
ggml_tensor * new_state = state; // ggml_dup(ctx0, state);
|
||||
cb(new_state, "new_state", il); // shape: (S_v, S_v, H_v, n_seqs)
|
||||
|
||||
// shape after loop of chunks: (S_v, chunk_size, n_chunks, H_v * n_seqs)
|
||||
ggml_tensor * core_attn_out = nullptr;
|
||||
|
||||
for (int64_t chunk = 0; chunk < n_chunks; chunk++) {
|
||||
// shape: (S_k, chunk_size, 1, H_k * n_seqs)
|
||||
ggml_tensor * q_chunk = get_slice_2d(ctx0, q, chunk); // (no cont), next op: ggml_mul
|
||||
|
||||
// shape: (S_v, chunk_size, 1, H_v * n_seqs)
|
||||
ggml_tensor * v_chunk = get_slice_2d(ctx0, v, chunk); // (no cont), next op: ggml_repeat
|
||||
|
||||
// shape: (chunk_size, 1, n_chunks, H_v * n_seqs)
|
||||
ggml_tensor * gexp_chunk = get_slice_2d(ctx0, gexp, chunk); // (no cont), next op: ggml_mul
|
||||
|
||||
// shape: (chunk_size, 1, H_v * n_seqs)
|
||||
ggml_tensor * k_cumdecay_chunk = get_slice_2d(ctx0, k_cumdecay, chunk); // (no cont), next op: ggml_mul_mat
|
||||
|
||||
// attn = (q_i @ k_i.transpose(-1, -2) * decay_mask[:, :, i]).masked_fill_(mask, 0)
|
||||
// replaced by precomputed attn_kq
|
||||
ggml_tensor * attn_chunk = get_slice_2d(ctx0, attn_kq, chunk);
|
||||
cb(attn_chunk, "attn_chunk", il);
|
||||
|
||||
ggml_tensor * state_t = ggml_cont_4d(ctx0, ggml_permute(ctx0, new_state, 1, 0, 2, 3), S_v, S_v, 1, H_v * n_seqs);
|
||||
|
||||
// v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state
|
||||
ggml_tensor * v_prime = ggml_mul_mat(ctx0, state_t, k_cumdecay_chunk);
|
||||
cb(v_prime, "v_prime_chunk", il); // shape: (S_v, 1, H_v * n_seqs)
|
||||
|
||||
// v_new = v_i - v_prime
|
||||
ggml_tensor * v_new = ggml_sub(ctx0, ggml_repeat(ctx0, v_chunk, v_prime), v_prime);
|
||||
ggml_tensor * v_new_t = ggml_cont(ctx0, ggml_transpose(ctx0, v_new));
|
||||
cb(v_new, "v_new_chunk", il);
|
||||
|
||||
// attn_inter = (q_i * g[:, :, i, :, None].exp()) @ last_recurrent_state
|
||||
ggml_tensor * q_g_exp = ggml_mul(ctx0, q_chunk, gexp_chunk);
|
||||
ggml_tensor * attn_inter = ggml_mul_mat(ctx0, state_t, q_g_exp);
|
||||
cb(attn_inter, "attn_inter_chunk", il);
|
||||
|
||||
// core_attn_out[:, :, i] = attn_inter + attn @ v_new
|
||||
ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, attn_chunk);
|
||||
cb(v_attn, "v_attn_chunk", il);
|
||||
|
||||
ggml_tensor * core_attn_out_chunk = ggml_add(ctx0, attn_inter, v_attn);
|
||||
cb(core_attn_out_chunk, "core_attn_out_chunk", il); // shape: (S_v, chunk_size, 1, H_v * n_seqs)
|
||||
|
||||
core_attn_out = core_attn_out == nullptr
|
||||
? core_attn_out_chunk
|
||||
: ggml_concat(ctx0, core_attn_out, core_attn_out_chunk, 2);
|
||||
|
||||
// kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new
|
||||
ggml_tensor * k_gdiff_t = get_slice_2d(ctx0, key_gdiff_t, chunk);
|
||||
//ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, k_gdiff, v_new); // this is slower on metal, why?
|
||||
ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, v_new_t, k_gdiff_t);
|
||||
|
||||
// last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew
|
||||
ggml_tensor * gexp_last_chunk = ggml_cont(ctx0, get_slice_2d(ctx0, g_last_exp, chunk));
|
||||
new_state = ggml_add(ctx0,
|
||||
ggml_mul(ctx0, new_state, ggml_reshape_4d(ctx0, gexp_last_chunk, gexp_last_chunk->ne[0], gexp_last_chunk->ne[1], H_v, n_seqs)),
|
||||
ggml_reshape_4d(ctx0, kgdmulvnew, kgdmulvnew->ne[0], kgdmulvnew->ne[1], H_v, n_seqs));
|
||||
}
|
||||
|
||||
// truncate padded tokens
|
||||
ggml_tensor * output_tokens = ggml_view_4d(ctx0, core_attn_out,
|
||||
S_v, n_tokens, H_v, n_seqs,
|
||||
ggml_row_size(core_attn_out->type, S_v),
|
||||
ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks),
|
||||
ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks * H_v), 0);
|
||||
output_tokens = ggml_cont(ctx0, output_tokens);
|
||||
cb(output_tokens, "output_tokens", il);
|
||||
|
||||
// permute back to (S_v, H_v, n_tokens, n_seqs)
|
||||
output_tokens = ggml_permute(ctx0, output_tokens, 0, 2, 1, 3);
|
||||
output_tokens = ggml_cont(ctx0, output_tokens);
|
||||
|
||||
return {output_tokens, new_state};
|
||||
}
|
||||
|
||||
std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen35::build_delta_net_autoregressive(
|
||||
ggml_tensor * q,
|
||||
ggml_tensor * k,
|
||||
ggml_tensor * v,
|
||||
ggml_tensor * g,
|
||||
ggml_tensor * beta,
|
||||
ggml_tensor * state,
|
||||
int il) {
|
||||
const int64_t S_k = q->ne[0];
|
||||
const int64_t H_k = q->ne[1];
|
||||
const int64_t n_tokens = q->ne[2];
|
||||
const int64_t n_seqs = q->ne[3];
|
||||
|
||||
const int64_t S_v = v->ne[0];
|
||||
const int64_t H_v = v->ne[1];
|
||||
|
||||
GGML_ASSERT(n_tokens == 1); // This function is optimized for single token processing
|
||||
GGML_ASSERT(v->ne[2] == n_tokens);
|
||||
GGML_ASSERT(k->ne[2] == n_tokens);
|
||||
GGML_ASSERT(g->ne[0] == H_v && g->ne[1] == n_tokens && g->ne[2] == n_seqs);
|
||||
GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs);
|
||||
GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v * H_v && state->ne[2] == 1 && state->ne[3] == n_seqs);
|
||||
|
||||
GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs);
|
||||
GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs);
|
||||
|
||||
GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case
|
||||
|
||||
const float eps_norm = hparams.f_norm_rms_eps;
|
||||
|
||||
q = ggml_l2_norm(ctx0, q, eps_norm);
|
||||
k = ggml_l2_norm(ctx0, k, eps_norm);
|
||||
|
||||
const float scale = 1.0f / sqrtf(S_v);
|
||||
|
||||
q = ggml_scale(ctx0, q, scale);
|
||||
beta = ggml_sigmoid(ctx0, beta);
|
||||
|
||||
cb(q, "q_in", il);
|
||||
cb(k, "k_in", il);
|
||||
cb(v, "v_in", il);
|
||||
cb(beta, "beta_in", il);
|
||||
cb(g, "g_in", il);
|
||||
|
||||
state = ggml_reshape_4d(ctx0, state, S_v, S_v, H_v, n_seqs);
|
||||
|
||||
ggml_tensor * g_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, g), 1, 1, H_k, n_seqs);
|
||||
ggml_tensor * beta_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, beta), 1, 1, H_k, n_seqs);
|
||||
|
||||
// Apply exponential to g_t
|
||||
g_t = ggml_exp(ctx0, g_t);
|
||||
|
||||
// Apply the gated delta rule for the single timestep
|
||||
// last_recurrent_state = last_recurrent_state * g_t
|
||||
state = ggml_mul(ctx0, state, g_t);
|
||||
|
||||
// kv_mem = (last_recurrent_state * k_t.unsqueeze(-1)).sum(dim=-2)
|
||||
ggml_tensor * k_t_unsqueezed = ggml_reshape_4d(ctx0, k, 1, S_v, H_v, n_seqs);
|
||||
ggml_tensor * kv_mem = ggml_mul(ctx0, state, k_t_unsqueezed);
|
||||
// we need to sum over dim=-2, so we transpose, sum, then transpose again
|
||||
kv_mem = ggml_transpose(ctx0, ggml_sum_rows(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, kv_mem))));
|
||||
|
||||
// v_t = v.unsqueeze(2) (we insert the singleton dimension after n_seqs and H_v)
|
||||
ggml_tensor * v_t = ggml_reshape_4d(ctx0, v, S_v, 1, H_v, n_seqs);
|
||||
// delta = (v_t - kv_mem) * beta_t
|
||||
ggml_tensor * v_diff = ggml_sub(ctx0, v_t, kv_mem); // both should be [S_v, 1, H_v, n_seqs]
|
||||
ggml_tensor * delta = ggml_mul(ctx0, v_diff, beta_t);
|
||||
|
||||
// last_recurrent_state = last_recurrent_state + k_t.unsqueeze(-1) * delta
|
||||
ggml_tensor * k_t_delta = ggml_mul(ctx0, ggml_repeat_4d(ctx0, k_t_unsqueezed, S_v, S_v, H_v, n_seqs), delta);
|
||||
state = ggml_add(ctx0, state, k_t_delta);
|
||||
|
||||
// Compute the attention output
|
||||
// core_attn_out = (last_recurrent_state * q_t.unsqueeze(-1)).sum(dim=-2)
|
||||
ggml_tensor * q_t_unsqueezed = ggml_reshape_4d(ctx0, q, 1, S_v, H_v, n_seqs); // unsqueeze q_t
|
||||
ggml_tensor * state_q = ggml_mul(ctx0, state, q_t_unsqueezed);
|
||||
// again, since it's over dim = -2, transpose, sum, transpose back
|
||||
ggml_tensor * core_attn_out =
|
||||
ggml_transpose(ctx0, ggml_sum_rows(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, state_q))));
|
||||
|
||||
// core_attn_out should be [S_v, 1, H_v, n_seqs] after this
|
||||
cb(core_attn_out, "output_tokens", il);
|
||||
cb(state, "new_state", il);
|
||||
|
||||
return {core_attn_out, state};
|
||||
}
|
||||
|
||||
std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen35::build_qkvz(
|
||||
ggml_tensor * input,
|
||||
int il) {
|
||||
@@ -561,9 +193,6 @@ ggml_tensor * llm_build_qwen35::build_layer_attn(
|
||||
ggml_tensor * llm_build_qwen35::build_layer_attn_linear(
|
||||
llm_graph_input_rs * inp,
|
||||
ggml_tensor * cur,
|
||||
ggml_tensor * causal_mask,
|
||||
ggml_tensor * identity,
|
||||
ggml_tensor * diag_mask,
|
||||
int il) {
|
||||
const auto * mctx_cur = inp->mctx;
|
||||
|
||||
@@ -587,8 +216,11 @@ ggml_tensor * llm_build_qwen35::build_layer_attn_linear(
|
||||
ggml_tensor * z = qkvz.second;
|
||||
|
||||
ggml_tensor * beta = build_lora_mm(model.layers[il].ssm_beta, cur);
|
||||
beta = ggml_reshape_4d(ctx0, beta, num_v_heads, 1, n_seq_tokens, n_seqs);
|
||||
beta = ggml_reshape_4d(ctx0, beta, 1, num_v_heads, n_seq_tokens, n_seqs);
|
||||
cb(beta, "beta", il);
|
||||
|
||||
beta = ggml_sigmoid(ctx0, beta);
|
||||
|
||||
ggml_tensor * alpha = build_lora_mm(model.layers[il].ssm_alpha, cur);
|
||||
alpha = ggml_cont_3d(ctx0, alpha, num_v_heads, n_seq_tokens, n_seqs);
|
||||
cb(alpha, "alpha", il);
|
||||
@@ -596,15 +228,16 @@ ggml_tensor * llm_build_qwen35::build_layer_attn_linear(
|
||||
ggml_tensor * alpha_biased = ggml_add(ctx0, alpha, model.layers[il].ssm_dt);
|
||||
ggml_tensor * alpha_softplus = ggml_softplus(ctx0, alpha_biased);
|
||||
cb(alpha_softplus, "a_softplus", il);
|
||||
|
||||
ggml_tensor * gate = ggml_mul(ctx0, alpha_softplus, model.layers[il].ssm_a); // -A_log.exp() * softplus
|
||||
cb(gate, "gate", il);
|
||||
|
||||
gate = ggml_reshape_4d(ctx0, gate, 1, num_v_heads, n_seq_tokens, n_seqs);
|
||||
|
||||
// Get convolution states from cache
|
||||
ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
|
||||
ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
|
||||
|
||||
// bool use_precomputed_states = n_seq_tokens == 1 && mctx_cur->has_previous_state();
|
||||
|
||||
// Build the convolution states tensor
|
||||
ggml_tensor * conv_states = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
|
||||
cb(conv_states, "conv_states", il);
|
||||
@@ -613,11 +246,12 @@ ggml_tensor * llm_build_qwen35::build_layer_attn_linear(
|
||||
ggml_tensor * conv_kernel = model.layers[il].ssm_conv1d;
|
||||
const int64_t conv_kernel_size = conv_kernel->ne[0];
|
||||
const int64_t conv_channels = d_inner + 2 * hparams.ssm_n_group * hparams.ssm_d_state;
|
||||
conv_states = ggml_reshape_3d(ctx0, conv_states, conv_kernel_size - 1, conv_channels, n_seqs);
|
||||
|
||||
conv_states = ggml_reshape_3d(ctx0, conv_states, conv_kernel_size - 1, conv_channels, n_seqs);
|
||||
cb(conv_states, "conv_states_reshaped", il);
|
||||
|
||||
qkv_mixed = ggml_permute(ctx0, qkv_mixed, 1, 0, 2, 3);
|
||||
cb(qkv_mixed, "qkv_mixed_permuted", il);
|
||||
qkv_mixed = ggml_transpose(ctx0, qkv_mixed);
|
||||
cb(qkv_mixed, "qkv_mixed_transposed", il);
|
||||
|
||||
ggml_tensor * conv_input = ggml_concat(ctx0, conv_states, qkv_mixed, 0);
|
||||
cb(conv_input, "conv_input", il);
|
||||
@@ -637,7 +271,10 @@ ggml_tensor * llm_build_qwen35::build_layer_attn_linear(
|
||||
ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_conv_states, state_update_target));
|
||||
cb(conv_states_all, "conv_states_updated", il);
|
||||
|
||||
// Apply SSM convolution
|
||||
ggml_tensor * state = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs);
|
||||
state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim, num_v_heads, n_seqs);
|
||||
cb(state, "state_predelta", il);
|
||||
|
||||
ggml_tensor * conv_output_proper = ggml_ssm_conv(ctx0, conv_input, conv_kernel);
|
||||
cb(conv_output_proper, "conv_output_raw", il);
|
||||
|
||||
@@ -651,31 +288,41 @@ ggml_tensor * llm_build_qwen35::build_layer_attn_linear(
|
||||
int64_t nb1_qkv = ggml_row_size(conv_qkv_mix->type, qkv_dim);
|
||||
|
||||
// Extract the convolved Q, K, V from conv_output
|
||||
ggml_tensor * q_conv =
|
||||
ggml_view_2d(ctx0, conv_qkv_mix, head_k_dim * num_k_heads, n_seq_tokens * n_seqs, nb1_qkv, 0);
|
||||
ggml_tensor * q_conv = ggml_view_4d(ctx0, conv_qkv_mix, head_k_dim, num_k_heads, n_seq_tokens, n_seqs,
|
||||
ggml_row_size(conv_qkv_mix->type, head_k_dim),
|
||||
nb1_qkv,
|
||||
nb1_qkv * n_seq_tokens,
|
||||
0);
|
||||
|
||||
ggml_tensor * k_conv = ggml_view_4d(ctx0, conv_qkv_mix, head_k_dim, num_k_heads, n_seq_tokens, n_seqs,
|
||||
ggml_row_size(conv_qkv_mix->type, head_k_dim),
|
||||
nb1_qkv,
|
||||
nb1_qkv * n_seq_tokens,
|
||||
head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix));
|
||||
|
||||
ggml_tensor * v_conv = ggml_view_4d(ctx0, conv_qkv_mix, head_v_dim, num_v_heads, n_seq_tokens, n_seqs,
|
||||
ggml_row_size(conv_qkv_mix->type, head_v_dim),
|
||||
nb1_qkv,
|
||||
nb1_qkv * n_seq_tokens,
|
||||
ggml_row_size(conv_qkv_mix->type, 2 * head_k_dim * num_k_heads));
|
||||
|
||||
cb(q_conv, "q_conv", il);
|
||||
ggml_tensor * k_conv =
|
||||
ggml_view_2d(ctx0, conv_qkv_mix, head_k_dim * num_k_heads, n_seq_tokens * n_seqs, nb1_qkv,
|
||||
head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix));
|
||||
cb(k_conv, "k_conv", il);
|
||||
ggml_tensor * v_conv =
|
||||
ggml_view_2d(ctx0, conv_qkv_mix, head_v_dim * num_v_heads, n_seq_tokens * n_seqs, nb1_qkv,
|
||||
2 * head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix));
|
||||
cb(v_conv, "v_conv", il);
|
||||
|
||||
// Unsqueeze them
|
||||
q_conv = ggml_cont_4d(ctx0, q_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs);
|
||||
k_conv = ggml_cont_4d(ctx0, k_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs);
|
||||
v_conv = ggml_cont_4d(ctx0, v_conv, head_v_dim, num_v_heads, n_seq_tokens, n_seqs);
|
||||
const float eps_norm = hparams.f_norm_rms_eps;
|
||||
|
||||
ggml_tensor * state = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs);
|
||||
state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim * num_v_heads, 1, n_seqs);
|
||||
cb(state, "state_predelta", il);
|
||||
q_conv = ggml_l2_norm(ctx0, q_conv, eps_norm);
|
||||
k_conv = ggml_l2_norm(ctx0, k_conv, eps_norm);
|
||||
|
||||
// if head keys and value keys are different, repeat Q/K to match V's head count
|
||||
// V heads are in tiled order (from conversion), so simple tiled repeat works
|
||||
//q_conv = ggml_cont_4d(ctx0, q_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs);
|
||||
//k_conv = ggml_cont_4d(ctx0, k_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs);
|
||||
//v_conv = ggml_cont_4d(ctx0, v_conv, head_v_dim, num_v_heads, n_seq_tokens, n_seqs);
|
||||
|
||||
// if head keys and value keys are different, repeat to force tensors into matching shapes
|
||||
if (num_k_heads != num_v_heads) {
|
||||
GGML_ASSERT(num_v_heads % num_k_heads == 0);
|
||||
// TODO: try to avoid these explicit repeats by utilizing op broadcast
|
||||
q_conv = ggml_repeat_4d(ctx0, q_conv, head_k_dim, num_v_heads, n_seq_tokens, n_seqs);
|
||||
k_conv = ggml_repeat_4d(ctx0, k_conv, head_k_dim, num_v_heads, n_seq_tokens, n_seqs);
|
||||
}
|
||||
@@ -689,7 +336,7 @@ ggml_tensor * llm_build_qwen35::build_layer_attn_linear(
|
||||
if (n_seq_tokens == 1) {
|
||||
attn_out = build_delta_net_autoregressive(q_conv, k_conv, v_conv, gate, beta, state, il);
|
||||
} else {
|
||||
attn_out = build_delta_net_chunking(q_conv, k_conv, v_conv, gate, beta, state, causal_mask, identity, diag_mask, il);
|
||||
attn_out = build_delta_net_chunking(q_conv, k_conv, v_conv, gate, beta, state, il);
|
||||
}
|
||||
ggml_tensor * output = attn_out.first;
|
||||
ggml_tensor * new_state = attn_out.second;
|
||||
@@ -698,19 +345,15 @@ ggml_tensor * llm_build_qwen35::build_layer_attn_linear(
|
||||
|
||||
// Update the recurrent states
|
||||
ggml_build_forward_expand(gf,
|
||||
ggml_cpy(ctx0, new_state,
|
||||
ggml_view_1d(ctx0, ssm_states_all, hparams.n_embd_s() * n_seqs,
|
||||
kv_head * hparams.n_embd_s() * ggml_element_size(ssm_states_all))));
|
||||
|
||||
// Reshape both attn_out_final and z to 2D tensors for normalization
|
||||
// attn_out_final: [head_dim, n_heads, n_tokens, n_seqs] -> [n_heads * n_tokens * n_seqs, head_dim]
|
||||
ggml_tensor * attn_out_2d_final = ggml_reshape_2d(ctx0, output, head_v_dim, num_v_heads * n_seq_tokens * n_seqs);
|
||||
ggml_cpy(ctx0, new_state,
|
||||
ggml_view_1d(ctx0, ssm_states_all, hparams.n_embd_s() * n_seqs,
|
||||
kv_head * hparams.n_embd_s() * ggml_element_size(ssm_states_all))));
|
||||
|
||||
// z: [head_dim, n_heads, n_tokens, n_seqs] -> [n_heads * n_tokens * n_seqs, head_dim]
|
||||
ggml_tensor * z_2d = ggml_reshape_2d(ctx0, z, head_v_dim, num_v_heads * n_seq_tokens * n_seqs);
|
||||
ggml_tensor * z_2d = ggml_reshape_4d(ctx0, z, head_v_dim, num_v_heads, n_seq_tokens, n_seqs);
|
||||
|
||||
// Apply gated normalization: self.norm(core_attn_out, z)
|
||||
ggml_tensor * attn_out_norm = build_norm_gated(attn_out_2d_final, model.layers[il].ssm_norm, z_2d, il);
|
||||
ggml_tensor * attn_out_norm = build_norm_gated(output, model.layers[il].ssm_norm, z_2d, il);
|
||||
|
||||
// Final reshape: [head_dim, n_heads, n_tokens, n_seqs] -> [n_tokens, n_seqs, n_heads * head_dim]
|
||||
ggml_tensor * final_output = ggml_reshape_3d(ctx0, attn_out_norm, head_v_dim * num_v_heads, n_seq_tokens, n_seqs);
|
||||
@@ -721,7 +364,8 @@ ggml_tensor * llm_build_qwen35::build_layer_attn_linear(
|
||||
cb(cur, "linear_attn_out", il);
|
||||
|
||||
// Reshape back to original dimensions
|
||||
cur = ggml_cont_2d(ctx0, cur, n_embd, n_seq_tokens * n_seqs);
|
||||
cur = ggml_reshape_2d(ctx0, cur, n_embd, n_seq_tokens * n_seqs);
|
||||
|
||||
return cur;
|
||||
}
|
||||
|
||||
|
||||
@@ -2,10 +2,8 @@
|
||||
|
||||
#include "llama-memory-recurrent.h"
|
||||
|
||||
#define CHUNK_SIZE 64
|
||||
|
||||
llm_build_qwen35moe::llm_build_qwen35moe(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_graph_context(params), model(model) {
|
||||
llm_build_delta_net_base(params), model(model) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
@@ -25,17 +23,6 @@ llm_build_qwen35moe::llm_build_qwen35moe(const llama_model & model, const llm_gr
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
|
||||
ggml_tensor * causal_mask =
|
||||
ggml_tri(ctx0, ggml_fill(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, CHUNK_SIZE, CHUNK_SIZE), 1.0f),
|
||||
GGML_TRI_TYPE_LOWER);
|
||||
|
||||
ggml_tensor * identity = ggml_diag(ctx0, ggml_fill(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, CHUNK_SIZE), 1.0f));
|
||||
ggml_tensor * diag_mask = ggml_add(ctx0, causal_mask, identity);
|
||||
|
||||
ggml_build_forward_expand(gf, causal_mask);
|
||||
ggml_build_forward_expand(gf, identity);
|
||||
ggml_build_forward_expand(gf, diag_mask);
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
|
||||
@@ -45,7 +32,7 @@ llm_build_qwen35moe::llm_build_qwen35moe(const llama_model & model, const llm_gr
|
||||
// Determine layer type and build appropriate attention mechanism
|
||||
if (hparams.is_recurrent(il)) {
|
||||
// Linear attention layer (gated delta net)
|
||||
cur = build_layer_attn_linear(inp->get_recr(), cur, causal_mask, identity, diag_mask, il);
|
||||
cur = build_layer_attn_linear(inp->get_recr(), cur, il);
|
||||
} else {
|
||||
// Full attention layer
|
||||
cur = build_layer_attn(inp->get_attn(), cur, inp_pos, sections, il);
|
||||
@@ -95,362 +82,6 @@ llm_build_qwen35moe::llm_build_qwen35moe(const llama_model & model, const llm_gr
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
}
|
||||
|
||||
// utility to get one slice from the third dimension
|
||||
// input dim: [x, y, c, b]
|
||||
// output dim: [x, y, 1, b]
|
||||
static ggml_tensor * get_slice_2d(ggml_context * ctx0, ggml_tensor * t, int64_t c) {
|
||||
return ggml_view_4d(ctx0, t, t->ne[0], t->ne[1], 1, t->ne[3],
|
||||
t->nb[1], t->nb[2], t->nb[3], t->nb[2] * c);
|
||||
}
|
||||
|
||||
std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen35moe::build_delta_net_chunking(
|
||||
ggml_tensor * q,
|
||||
ggml_tensor * k,
|
||||
ggml_tensor * v,
|
||||
ggml_tensor * g,
|
||||
ggml_tensor * beta,
|
||||
ggml_tensor * state,
|
||||
ggml_tensor * causal_mask,
|
||||
ggml_tensor * identity,
|
||||
ggml_tensor * diag_mask,
|
||||
int il) {
|
||||
const int64_t S_k = q->ne[0];
|
||||
const int64_t H_k = q->ne[1];
|
||||
const int64_t n_tokens = q->ne[2];
|
||||
const int64_t n_seqs = q->ne[3];
|
||||
|
||||
const int64_t S_v = v->ne[0];
|
||||
const int64_t H_v = v->ne[1];
|
||||
|
||||
GGML_ASSERT(v->ne[2] == n_tokens);
|
||||
GGML_ASSERT(k->ne[2] == n_tokens);
|
||||
GGML_ASSERT(g->ne[0] == H_v && g->ne[1] == n_tokens && g->ne[2] == n_seqs);
|
||||
GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs);
|
||||
GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v * H_v && state->ne[2] == 1 && state->ne[3] == n_seqs);
|
||||
|
||||
GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs);
|
||||
GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs);
|
||||
|
||||
GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case
|
||||
|
||||
const float eps_norm = hparams.f_norm_rms_eps;
|
||||
|
||||
q = ggml_l2_norm(ctx0, q, eps_norm);
|
||||
k = ggml_l2_norm(ctx0, k, eps_norm);
|
||||
|
||||
const float scale = 1.0f / sqrtf(S_v);
|
||||
|
||||
q = ggml_scale(ctx0, q, scale);
|
||||
|
||||
beta = ggml_sigmoid(ctx0, beta);
|
||||
|
||||
cb(q, "q_in", il);
|
||||
cb(k, "k_in", il);
|
||||
cb(v, "v_in", il);
|
||||
cb(beta, "beta_in", il);
|
||||
cb(g, "g_in", il);
|
||||
|
||||
q = ggml_cont_4d(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
|
||||
k = ggml_cont_4d(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
|
||||
v = ggml_cont_4d(ctx0, ggml_permute(ctx0, v, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
|
||||
g = ggml_cont_4d(ctx0, ggml_permute(ctx0, g, 2, 0, 3, 1), n_tokens, 1, H_k, n_seqs);
|
||||
|
||||
beta = ggml_cont(ctx0, ggml_permute(ctx0, beta, 2, 0, 1, 3));
|
||||
state = ggml_reshape_4d(ctx0, state, S_v, S_v, H_v, n_seqs);
|
||||
|
||||
cb(q, "q_perm", il);
|
||||
cb(k, "k_perm", il);
|
||||
cb(v, "v_perm", il);
|
||||
cb(beta, "beta_perm", il);
|
||||
cb(g, "g_perm", il);
|
||||
cb(state, "state_in", il);
|
||||
|
||||
GGML_ASSERT(q->ne[1] == n_tokens && q->ne[0] == S_k && q->ne[2] == H_k && q->ne[3] == n_seqs);
|
||||
GGML_ASSERT(k->ne[1] == n_tokens && k->ne[0] == S_k && k->ne[2] == H_k && k->ne[3] == n_seqs);
|
||||
GGML_ASSERT(v->ne[1] == n_tokens && v->ne[0] == S_v && v->ne[2] == H_k && v->ne[3] == n_seqs);
|
||||
GGML_ASSERT(beta->ne[1] == n_tokens && beta->ne[2] == H_k && beta->ne[0] == 1 && beta->ne[3] == n_seqs);
|
||||
|
||||
// Do padding
|
||||
const int64_t chunk_size = CHUNK_SIZE;
|
||||
|
||||
const int64_t pad = (chunk_size - n_tokens % chunk_size) % chunk_size;
|
||||
const int64_t n_chunks = (n_tokens + pad) / chunk_size;
|
||||
|
||||
q = ggml_pad(ctx0, q, 0, pad, 0, 0);
|
||||
k = ggml_pad(ctx0, k, 0, pad, 0, 0);
|
||||
v = ggml_pad(ctx0, v, 0, pad, 0, 0);
|
||||
g = ggml_pad(ctx0, g, pad, 0, 0, 0);
|
||||
beta = ggml_pad(ctx0, beta, 0, pad, 0, 0);
|
||||
|
||||
cb(q, "q_pad", il);
|
||||
cb(k, "k_pad", il);
|
||||
cb(v, "v_pad", il);
|
||||
cb(beta, "beta_pad", il);
|
||||
cb(g, "g_pad", il);
|
||||
|
||||
ggml_tensor * v_beta = ggml_mul(ctx0, v, beta);
|
||||
ggml_tensor * k_beta = ggml_mul(ctx0, k, beta);
|
||||
|
||||
cb(v_beta, "v_beta", il);
|
||||
cb(k_beta, "k_beta", il);
|
||||
|
||||
q = ggml_reshape_4d(ctx0, q, S_k, chunk_size, n_chunks, H_k * n_seqs);
|
||||
k = ggml_reshape_4d(ctx0, k, S_k, chunk_size, n_chunks, H_k * n_seqs);
|
||||
k_beta = ggml_reshape_4d(ctx0, k_beta, S_k, chunk_size, n_chunks, H_k * n_seqs);
|
||||
v = ggml_reshape_4d(ctx0, v, S_v, chunk_size, n_chunks, H_v * n_seqs);
|
||||
v_beta = ggml_reshape_4d(ctx0, v_beta, S_v, chunk_size, n_chunks, H_v * n_seqs);
|
||||
|
||||
g = ggml_reshape_4d(ctx0, g, chunk_size, 1, n_chunks, H_k * n_seqs);
|
||||
beta = ggml_reshape_4d(ctx0, beta, 1, chunk_size, n_chunks, H_k * n_seqs);
|
||||
|
||||
ggml_tensor * g_cumsum = ggml_cumsum(ctx0, g);
|
||||
cb(g_cumsum, "g_cumsum", il); // shape: (chunk_size, 1, n_chunks, H_v * n_seqs)
|
||||
|
||||
ggml_tensor * gcs_i = g_cumsum; // ggml_reshape_4d(ctx0, g_cumsum, chunk_size, 1, n_chunks, H_v * n_seqs);
|
||||
ggml_tensor * gcs_j = ggml_reshape_4d(ctx0, g_cumsum, 1, chunk_size, n_chunks, H_v * n_seqs);
|
||||
|
||||
ggml_tensor * gcs_j_broadcast =
|
||||
ggml_repeat_4d(ctx0, gcs_j, chunk_size, chunk_size, n_chunks, H_v * n_seqs);
|
||||
|
||||
ggml_tensor * decay_mask = ggml_sub(ctx0, gcs_j_broadcast, gcs_i);
|
||||
cb(decay_mask, "decay_mask", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
|
||||
|
||||
decay_mask = ggml_mul(ctx0, decay_mask, diag_mask);
|
||||
decay_mask = ggml_exp(ctx0, decay_mask);
|
||||
decay_mask = ggml_mul(ctx0, decay_mask, diag_mask);
|
||||
|
||||
ggml_tensor * kmulkbeta = ggml_mul_mat(ctx0, k, k_beta);
|
||||
|
||||
ggml_tensor * k_decay = ggml_mul(ctx0, kmulkbeta, decay_mask);
|
||||
ggml_tensor * attn = ggml_neg(ctx0, ggml_mul(ctx0, k_decay, causal_mask));
|
||||
cb(attn, "attn_pre_solve", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
|
||||
|
||||
ggml_tensor * attn_lower = ggml_mul(ctx0, attn, causal_mask);
|
||||
ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, identity, attn_lower), attn_lower);
|
||||
|
||||
ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, attn, true, true, false);
|
||||
attn = ggml_mul(ctx0, lin_solve, causal_mask);
|
||||
attn = ggml_add(ctx0, attn, identity);
|
||||
cb(attn, "attn_solved", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
|
||||
|
||||
v = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_beta)), attn);
|
||||
|
||||
ggml_tensor * g_cumsum_t = ggml_cont(ctx0, ggml_transpose(ctx0, g_cumsum));
|
||||
ggml_tensor * gexp = ggml_exp(ctx0, g_cumsum_t);
|
||||
|
||||
ggml_tensor * kbeta_gexp = ggml_mul(ctx0, k_beta, gexp);
|
||||
cb(kbeta_gexp, "kbeta_gexp", il); // shape: (S_k, chunk_size, n_chunks, H_v * n_seqs)
|
||||
|
||||
ggml_tensor * k_cumdecay =
|
||||
ggml_cont(ctx0, ggml_transpose(ctx0, ggml_mul_mat(ctx0, attn, ggml_cont(ctx0, ggml_transpose(ctx0, kbeta_gexp)))));
|
||||
cb(k_cumdecay, "k_cumdecay", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
|
||||
|
||||
ggml_tensor * attn_kq = ggml_mul_mat(ctx0, k, q);
|
||||
attn_kq = ggml_mul(ctx0, attn_kq, decay_mask);
|
||||
attn_kq = ggml_mul(ctx0, attn_kq, diag_mask);
|
||||
cb(attn_kq, "attn_kq", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
|
||||
|
||||
|
||||
// vectorized calculation of key_gdiff
|
||||
// improved from the chunked version:
|
||||
// g_last = torch.clamp(g_cum[:, :, -1], max=50.0).exp().unsqueeze(-1).unsqueeze(-1)
|
||||
// g_diff = torch.clamp(g_cum[:, :, -1:] - g_cum, max=50.0).exp()
|
||||
// key_gdiff = key * g_diff.unsqueeze(-1)
|
||||
// kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new
|
||||
// last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew
|
||||
|
||||
// get last element in g_cumsum along chunk_size dimension (ne0)
|
||||
// example: [[x, y, z, ..., last], ...] -> [[last], ...]
|
||||
ggml_tensor * g_last = ggml_view_4d(ctx0, g_cumsum, 1, 1, g_cumsum->ne[2], g_cumsum->ne[3],
|
||||
g_cumsum->nb[1], g_cumsum->nb[2], g_cumsum->nb[3],
|
||||
(g_cumsum->ne[0] - 1) * ggml_element_size(g_cumsum));
|
||||
g_last = ggml_cont(ctx0, g_last);
|
||||
cb(g_last, "g_last", il); // shape: (1, 1, n_chunks, H_v * n_seqs)
|
||||
|
||||
ggml_tensor * g_last_exp = ggml_exp(ctx0, g_last);
|
||||
cb(g_last_exp, "g_last_exp", il); // shape: (1, 1, n_chunks, H_v * n_seqs)
|
||||
|
||||
ggml_tensor * g_diff = ggml_neg(ctx0, ggml_sub(ctx0, g_cumsum, g_last));
|
||||
cb(g_diff, "g_diff", il); // shape: (chunk_size, 1, n_chunks, H_v * n_seqs)
|
||||
|
||||
ggml_tensor * g_diff_exp = ggml_exp(ctx0, g_diff);
|
||||
ggml_tensor * g_diff_exp_t = ggml_reshape_4d(ctx0, g_diff_exp,
|
||||
1, chunk_size, n_chunks, g_diff_exp->ne[3]);
|
||||
|
||||
ggml_tensor * key_gdiff = ggml_mul(ctx0, k, g_diff_exp_t);
|
||||
cb(key_gdiff, "key_gdiff", il); // shape: (S_k, chunk_size, n_chunks, H_v * n_seqs)
|
||||
|
||||
ggml_tensor * key_gdiff_t = ggml_cont(ctx0, ggml_transpose(ctx0, key_gdiff));
|
||||
cb(key_gdiff_t, "key_gdiff_t", il); // shape: (chunk_size, S_k, n_chunks, H_v * n_seqs)
|
||||
|
||||
|
||||
// state to be updated per chunk
|
||||
ggml_tensor * new_state = state; // ggml_dup(ctx0, state);
|
||||
cb(new_state, "new_state", il); // shape: (S_v, S_v, H_v, n_seqs)
|
||||
|
||||
// shape after loop of chunks: (S_v, chunk_size, n_chunks, H_v * n_seqs)
|
||||
ggml_tensor * core_attn_out = nullptr;
|
||||
|
||||
for (int64_t chunk = 0; chunk < n_chunks; chunk++) {
|
||||
// shape: (S_k, chunk_size, 1, H_k * n_seqs)
|
||||
ggml_tensor * q_chunk = get_slice_2d(ctx0, q, chunk); // (no cont), next op: ggml_mul
|
||||
|
||||
// shape: (S_v, chunk_size, 1, H_v * n_seqs)
|
||||
ggml_tensor * v_chunk = get_slice_2d(ctx0, v, chunk); // (no cont), next op: ggml_repeat
|
||||
|
||||
// shape: (chunk_size, 1, n_chunks, H_v * n_seqs)
|
||||
ggml_tensor * gexp_chunk = get_slice_2d(ctx0, gexp, chunk); // (no cont), next op: ggml_mul
|
||||
|
||||
// shape: (chunk_size, 1, H_v * n_seqs)
|
||||
ggml_tensor * k_cumdecay_chunk = get_slice_2d(ctx0, k_cumdecay, chunk); // (no cont), next op: ggml_mul_mat
|
||||
|
||||
// attn = (q_i @ k_i.transpose(-1, -2) * decay_mask[:, :, i]).masked_fill_(mask, 0)
|
||||
// replaced by precomputed attn_kq
|
||||
ggml_tensor * attn_chunk = get_slice_2d(ctx0, attn_kq, chunk);
|
||||
cb(attn_chunk, "attn_chunk", il);
|
||||
|
||||
ggml_tensor * state_t = ggml_cont_4d(ctx0, ggml_permute(ctx0, new_state, 1, 0, 2, 3), S_v, S_v, 1, H_v * n_seqs);
|
||||
|
||||
// v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state
|
||||
ggml_tensor * v_prime = ggml_mul_mat(ctx0, state_t, k_cumdecay_chunk);
|
||||
cb(v_prime, "v_prime_chunk", il); // shape: (S_v, 1, H_v * n_seqs)
|
||||
|
||||
// v_new = v_i - v_prime
|
||||
ggml_tensor * v_new = ggml_sub(ctx0, ggml_repeat(ctx0, v_chunk, v_prime), v_prime);
|
||||
ggml_tensor * v_new_t = ggml_cont(ctx0, ggml_transpose(ctx0, v_new));
|
||||
cb(v_new, "v_new_chunk", il);
|
||||
|
||||
// attn_inter = (q_i * g[:, :, i, :, None].exp()) @ last_recurrent_state
|
||||
ggml_tensor * q_g_exp = ggml_mul(ctx0, q_chunk, gexp_chunk);
|
||||
ggml_tensor * attn_inter = ggml_mul_mat(ctx0, state_t, q_g_exp);
|
||||
cb(attn_inter, "attn_inter_chunk", il);
|
||||
|
||||
// core_attn_out[:, :, i] = attn_inter + attn @ v_new
|
||||
ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, attn_chunk);
|
||||
cb(v_attn, "v_attn_chunk", il);
|
||||
|
||||
ggml_tensor * core_attn_out_chunk = ggml_add(ctx0, attn_inter, v_attn);
|
||||
cb(core_attn_out_chunk, "core_attn_out_chunk", il); // shape: (S_v, chunk_size, 1, H_v * n_seqs)
|
||||
|
||||
core_attn_out = core_attn_out == nullptr
|
||||
? core_attn_out_chunk
|
||||
: ggml_concat(ctx0, core_attn_out, core_attn_out_chunk, 2);
|
||||
|
||||
// kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new
|
||||
ggml_tensor * k_gdiff_t = get_slice_2d(ctx0, key_gdiff_t, chunk);
|
||||
//ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, k_gdiff, v_new); // this is slower on metal, why?
|
||||
ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, v_new_t, k_gdiff_t);
|
||||
|
||||
// last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew
|
||||
ggml_tensor * gexp_last_chunk = ggml_cont(ctx0, get_slice_2d(ctx0, g_last_exp, chunk));
|
||||
new_state = ggml_add(ctx0,
|
||||
ggml_mul(ctx0, new_state, ggml_reshape_4d(ctx0, gexp_last_chunk, gexp_last_chunk->ne[0], gexp_last_chunk->ne[1], H_v, n_seqs)),
|
||||
ggml_reshape_4d(ctx0, kgdmulvnew, kgdmulvnew->ne[0], kgdmulvnew->ne[1], H_v, n_seqs));
|
||||
}
|
||||
|
||||
// truncate padded tokens
|
||||
ggml_tensor * output_tokens = ggml_view_4d(ctx0, core_attn_out,
|
||||
S_v, n_tokens, H_v, n_seqs,
|
||||
ggml_row_size(core_attn_out->type, S_v),
|
||||
ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks),
|
||||
ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks * H_v), 0);
|
||||
output_tokens = ggml_cont(ctx0, output_tokens);
|
||||
cb(output_tokens, "output_tokens", il);
|
||||
|
||||
// permute back to (S_v, H_v, n_tokens, n_seqs)
|
||||
output_tokens = ggml_permute(ctx0, output_tokens, 0, 2, 1, 3);
|
||||
output_tokens = ggml_cont(ctx0, output_tokens);
|
||||
|
||||
return {output_tokens, new_state};
|
||||
}
|
||||
|
||||
std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen35moe::build_delta_net_autoregressive(
|
||||
ggml_tensor * q,
|
||||
ggml_tensor * k,
|
||||
ggml_tensor * v,
|
||||
ggml_tensor * g,
|
||||
ggml_tensor * beta,
|
||||
ggml_tensor * state,
|
||||
int il) {
|
||||
const int64_t S_k = q->ne[0];
|
||||
const int64_t H_k = q->ne[1];
|
||||
const int64_t n_tokens = q->ne[2];
|
||||
const int64_t n_seqs = q->ne[3];
|
||||
|
||||
const int64_t S_v = v->ne[0];
|
||||
const int64_t H_v = v->ne[1];
|
||||
|
||||
GGML_ASSERT(n_tokens == 1); // This function is optimized for single token processing
|
||||
GGML_ASSERT(v->ne[2] == n_tokens);
|
||||
GGML_ASSERT(k->ne[2] == n_tokens);
|
||||
GGML_ASSERT(g->ne[0] == H_v && g->ne[1] == n_tokens && g->ne[2] == n_seqs);
|
||||
GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs);
|
||||
GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v * H_v && state->ne[2] == 1 && state->ne[3] == n_seqs);
|
||||
|
||||
GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs);
|
||||
GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs);
|
||||
|
||||
GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case
|
||||
|
||||
const float eps_norm = hparams.f_norm_rms_eps;
|
||||
|
||||
q = ggml_l2_norm(ctx0, q, eps_norm);
|
||||
k = ggml_l2_norm(ctx0, k, eps_norm);
|
||||
|
||||
const float scale = 1.0f / sqrtf(S_v);
|
||||
|
||||
q = ggml_scale(ctx0, q, scale);
|
||||
beta = ggml_sigmoid(ctx0, beta);
|
||||
|
||||
cb(q, "q_in", il);
|
||||
cb(k, "k_in", il);
|
||||
cb(v, "v_in", il);
|
||||
cb(beta, "beta_in", il);
|
||||
cb(g, "g_in", il);
|
||||
|
||||
state = ggml_reshape_4d(ctx0, state, S_v, S_v, H_v, n_seqs);
|
||||
|
||||
ggml_tensor * g_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, g), 1, 1, H_k, n_seqs);
|
||||
ggml_tensor * beta_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, beta), 1, 1, H_k, n_seqs);
|
||||
|
||||
// Apply exponential to g_t
|
||||
g_t = ggml_exp(ctx0, g_t);
|
||||
|
||||
// Apply the gated delta rule for the single timestep
|
||||
// last_recurrent_state = last_recurrent_state * g_t
|
||||
state = ggml_mul(ctx0, state, g_t);
|
||||
|
||||
// kv_mem = (last_recurrent_state * k_t.unsqueeze(-1)).sum(dim=-2)
|
||||
ggml_tensor * k_t_unsqueezed = ggml_reshape_4d(ctx0, k, 1, S_v, H_v, n_seqs);
|
||||
ggml_tensor * kv_mem = ggml_mul(ctx0, state, k_t_unsqueezed);
|
||||
// we need to sum over dim=-2, so we transpose, sum, then transpose again
|
||||
kv_mem = ggml_transpose(ctx0, ggml_sum_rows(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, kv_mem))));
|
||||
|
||||
// v_t = v.unsqueeze(2) (we insert the singleton dimension after n_seqs and H_v)
|
||||
ggml_tensor * v_t = ggml_reshape_4d(ctx0, v, S_v, 1, H_v, n_seqs);
|
||||
// delta = (v_t - kv_mem) * beta_t
|
||||
ggml_tensor * v_diff = ggml_sub(ctx0, v_t, kv_mem); // both should be [S_v, 1, H_v, n_seqs]
|
||||
ggml_tensor * delta = ggml_mul(ctx0, v_diff, beta_t);
|
||||
|
||||
// last_recurrent_state = last_recurrent_state + k_t.unsqueeze(-1) * delta
|
||||
ggml_tensor * k_t_delta = ggml_mul(ctx0, ggml_repeat_4d(ctx0, k_t_unsqueezed, S_v, S_v, H_v, n_seqs), delta);
|
||||
state = ggml_add(ctx0, state, k_t_delta);
|
||||
|
||||
// Compute the attention output
|
||||
// core_attn_out = (last_recurrent_state * q_t.unsqueeze(-1)).sum(dim=-2)
|
||||
ggml_tensor * q_t_unsqueezed = ggml_reshape_4d(ctx0, q, 1, S_v, H_v, n_seqs); // unsqueeze q_t
|
||||
ggml_tensor * state_q = ggml_mul(ctx0, state, q_t_unsqueezed);
|
||||
// again, since it's over dim = -2, transpose, sum, transpose back
|
||||
ggml_tensor * core_attn_out =
|
||||
ggml_transpose(ctx0, ggml_sum_rows(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, state_q))));
|
||||
|
||||
// core_attn_out should be [S_v, 1, H_v, n_seqs] after this
|
||||
cb(core_attn_out, "output_tokens", il);
|
||||
cb(state, "new_state", il);
|
||||
|
||||
return {core_attn_out, state};
|
||||
}
|
||||
|
||||
std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen35moe::build_qkvz(
|
||||
ggml_tensor * input,
|
||||
int il) {
|
||||
@@ -562,9 +193,6 @@ ggml_tensor * llm_build_qwen35moe ::build_layer_attn(
|
||||
ggml_tensor * llm_build_qwen35moe ::build_layer_attn_linear(
|
||||
llm_graph_input_rs * inp,
|
||||
ggml_tensor * cur,
|
||||
ggml_tensor * causal_mask,
|
||||
ggml_tensor * identity,
|
||||
ggml_tensor * diag_mask,
|
||||
int il) {
|
||||
const auto * mctx_cur = inp->mctx;
|
||||
|
||||
@@ -588,8 +216,11 @@ ggml_tensor * llm_build_qwen35moe ::build_layer_attn_linear(
|
||||
ggml_tensor * z = qkvz.second;
|
||||
|
||||
ggml_tensor * beta = build_lora_mm(model.layers[il].ssm_beta, cur);
|
||||
beta = ggml_reshape_4d(ctx0, beta, num_v_heads, 1, n_seq_tokens, n_seqs);
|
||||
beta = ggml_reshape_4d(ctx0, beta, 1, num_v_heads, n_seq_tokens, n_seqs);
|
||||
cb(beta, "beta", il);
|
||||
|
||||
beta = ggml_sigmoid(ctx0, beta);
|
||||
|
||||
ggml_tensor * alpha = build_lora_mm(model.layers[il].ssm_alpha, cur);
|
||||
alpha = ggml_cont_3d(ctx0, alpha, num_v_heads, n_seq_tokens, n_seqs);
|
||||
cb(alpha, "alpha", il);
|
||||
@@ -597,15 +228,16 @@ ggml_tensor * llm_build_qwen35moe ::build_layer_attn_linear(
|
||||
ggml_tensor * alpha_biased = ggml_add(ctx0, alpha, model.layers[il].ssm_dt);
|
||||
ggml_tensor * alpha_softplus = ggml_softplus(ctx0, alpha_biased);
|
||||
cb(alpha_softplus, "a_softplus", il);
|
||||
|
||||
ggml_tensor * gate = ggml_mul(ctx0, alpha_softplus, model.layers[il].ssm_a); // -A_log.exp() * softplus
|
||||
cb(gate, "gate", il);
|
||||
|
||||
gate = ggml_reshape_4d(ctx0, gate, 1, num_v_heads, n_seq_tokens, n_seqs);
|
||||
|
||||
// Get convolution states from cache
|
||||
ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
|
||||
ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
|
||||
|
||||
// bool use_precomputed_states = n_seq_tokens == 1 && mctx_cur->has_previous_state();
|
||||
|
||||
// Build the convolution states tensor
|
||||
ggml_tensor * conv_states = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
|
||||
cb(conv_states, "conv_states", il);
|
||||
@@ -614,11 +246,12 @@ ggml_tensor * llm_build_qwen35moe ::build_layer_attn_linear(
|
||||
ggml_tensor * conv_kernel = model.layers[il].ssm_conv1d;
|
||||
const int64_t conv_kernel_size = conv_kernel->ne[0];
|
||||
const int64_t conv_channels = d_inner + 2 * hparams.ssm_n_group * hparams.ssm_d_state;
|
||||
conv_states = ggml_reshape_3d(ctx0, conv_states, conv_kernel_size - 1, conv_channels, n_seqs);
|
||||
|
||||
conv_states = ggml_reshape_3d(ctx0, conv_states, conv_kernel_size - 1, conv_channels, n_seqs);
|
||||
cb(conv_states, "conv_states_reshaped", il);
|
||||
|
||||
qkv_mixed = ggml_permute(ctx0, qkv_mixed, 1, 0, 2, 3);
|
||||
cb(qkv_mixed, "qkv_mixed_permuted", il);
|
||||
qkv_mixed = ggml_transpose(ctx0, qkv_mixed);
|
||||
cb(qkv_mixed, "qkv_mixed_transposed", il);
|
||||
|
||||
ggml_tensor * conv_input = ggml_concat(ctx0, conv_states, qkv_mixed, 0);
|
||||
cb(conv_input, "conv_input", il);
|
||||
@@ -638,7 +271,10 @@ ggml_tensor * llm_build_qwen35moe ::build_layer_attn_linear(
|
||||
ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_conv_states, state_update_target));
|
||||
cb(conv_states_all, "conv_states_updated", il);
|
||||
|
||||
// Apply SSM convolution
|
||||
ggml_tensor * state = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs);
|
||||
state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim, num_v_heads, n_seqs);
|
||||
cb(state, "state_predelta", il);
|
||||
|
||||
ggml_tensor * conv_output_proper = ggml_ssm_conv(ctx0, conv_input, conv_kernel);
|
||||
cb(conv_output_proper, "conv_output_raw", il);
|
||||
|
||||
@@ -652,31 +288,41 @@ ggml_tensor * llm_build_qwen35moe ::build_layer_attn_linear(
|
||||
int64_t nb1_qkv = ggml_row_size(conv_qkv_mix->type, qkv_dim);
|
||||
|
||||
// Extract the convolved Q, K, V from conv_output
|
||||
ggml_tensor * q_conv =
|
||||
ggml_view_2d(ctx0, conv_qkv_mix, head_k_dim * num_k_heads, n_seq_tokens * n_seqs, nb1_qkv, 0);
|
||||
ggml_tensor * q_conv = ggml_view_4d(ctx0, conv_qkv_mix, head_k_dim, num_k_heads, n_seq_tokens, n_seqs,
|
||||
ggml_row_size(conv_qkv_mix->type, head_k_dim),
|
||||
nb1_qkv,
|
||||
nb1_qkv * n_seq_tokens,
|
||||
0);
|
||||
|
||||
ggml_tensor * k_conv = ggml_view_4d(ctx0, conv_qkv_mix, head_k_dim, num_k_heads, n_seq_tokens, n_seqs,
|
||||
ggml_row_size(conv_qkv_mix->type, head_k_dim),
|
||||
nb1_qkv,
|
||||
nb1_qkv * n_seq_tokens,
|
||||
head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix));
|
||||
|
||||
ggml_tensor * v_conv = ggml_view_4d(ctx0, conv_qkv_mix, head_v_dim, num_v_heads, n_seq_tokens, n_seqs,
|
||||
ggml_row_size(conv_qkv_mix->type, head_v_dim),
|
||||
nb1_qkv,
|
||||
nb1_qkv * n_seq_tokens,
|
||||
ggml_row_size(conv_qkv_mix->type, 2 * head_k_dim * num_k_heads));
|
||||
|
||||
cb(q_conv, "q_conv", il);
|
||||
ggml_tensor * k_conv =
|
||||
ggml_view_2d(ctx0, conv_qkv_mix, head_k_dim * num_k_heads, n_seq_tokens * n_seqs, nb1_qkv,
|
||||
head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix));
|
||||
cb(k_conv, "k_conv", il);
|
||||
ggml_tensor * v_conv =
|
||||
ggml_view_2d(ctx0, conv_qkv_mix, head_v_dim * num_v_heads, n_seq_tokens * n_seqs, nb1_qkv,
|
||||
2 * head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix));
|
||||
cb(v_conv, "v_conv", il);
|
||||
|
||||
// Unsqueeze them
|
||||
q_conv = ggml_cont_4d(ctx0, q_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs);
|
||||
k_conv = ggml_cont_4d(ctx0, k_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs);
|
||||
v_conv = ggml_cont_4d(ctx0, v_conv, head_v_dim, num_v_heads, n_seq_tokens, n_seqs);
|
||||
const float eps_norm = hparams.f_norm_rms_eps;
|
||||
|
||||
ggml_tensor * state = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs);
|
||||
state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim * num_v_heads, 1, n_seqs);
|
||||
cb(state, "state_predelta", il);
|
||||
q_conv = ggml_l2_norm(ctx0, q_conv, eps_norm);
|
||||
k_conv = ggml_l2_norm(ctx0, k_conv, eps_norm);
|
||||
|
||||
// if head keys and value keys are different, repeat Q/K to match V's head count
|
||||
// V heads are in tiled order (from conversion), so simple tiled repeat works
|
||||
//q_conv = ggml_cont_4d(ctx0, q_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs);
|
||||
//k_conv = ggml_cont_4d(ctx0, k_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs);
|
||||
//v_conv = ggml_cont_4d(ctx0, v_conv, head_v_dim, num_v_heads, n_seq_tokens, n_seqs);
|
||||
|
||||
// if head keys and value keys are different, repeat to force tensors into matching shapes
|
||||
if (num_k_heads != num_v_heads) {
|
||||
GGML_ASSERT(num_v_heads % num_k_heads == 0);
|
||||
// TODO: try to avoid these explicit repeats by utilizing op broadcast
|
||||
q_conv = ggml_repeat_4d(ctx0, q_conv, head_k_dim, num_v_heads, n_seq_tokens, n_seqs);
|
||||
k_conv = ggml_repeat_4d(ctx0, k_conv, head_k_dim, num_v_heads, n_seq_tokens, n_seqs);
|
||||
}
|
||||
@@ -690,7 +336,7 @@ ggml_tensor * llm_build_qwen35moe ::build_layer_attn_linear(
|
||||
if (n_seq_tokens == 1) {
|
||||
attn_out = build_delta_net_autoregressive(q_conv, k_conv, v_conv, gate, beta, state, il);
|
||||
} else {
|
||||
attn_out = build_delta_net_chunking(q_conv, k_conv, v_conv, gate, beta, state, causal_mask, identity, diag_mask, il);
|
||||
attn_out = build_delta_net_chunking(q_conv, k_conv, v_conv, gate, beta, state, il);
|
||||
}
|
||||
ggml_tensor * output = attn_out.first;
|
||||
ggml_tensor * new_state = attn_out.second;
|
||||
@@ -699,19 +345,15 @@ ggml_tensor * llm_build_qwen35moe ::build_layer_attn_linear(
|
||||
|
||||
// Update the recurrent states
|
||||
ggml_build_forward_expand(gf,
|
||||
ggml_cpy(ctx0, new_state,
|
||||
ggml_view_1d(ctx0, ssm_states_all, hparams.n_embd_s() * n_seqs,
|
||||
kv_head * hparams.n_embd_s() * ggml_element_size(ssm_states_all))));
|
||||
|
||||
// Reshape both attn_out_final and z to 2D tensors for normalization
|
||||
// attn_out_final: [head_dim, n_heads, n_tokens, n_seqs] -> [n_heads * n_tokens * n_seqs, head_dim]
|
||||
ggml_tensor * attn_out_2d_final = ggml_reshape_2d(ctx0, output, head_v_dim, num_v_heads * n_seq_tokens * n_seqs);
|
||||
ggml_cpy(ctx0, new_state,
|
||||
ggml_view_1d(ctx0, ssm_states_all, hparams.n_embd_s() * n_seqs,
|
||||
kv_head * hparams.n_embd_s() * ggml_element_size(ssm_states_all))));
|
||||
|
||||
// z: [head_dim, n_heads, n_tokens, n_seqs] -> [n_heads * n_tokens * n_seqs, head_dim]
|
||||
ggml_tensor * z_2d = ggml_reshape_2d(ctx0, z, head_v_dim, num_v_heads * n_seq_tokens * n_seqs);
|
||||
ggml_tensor * z_2d = ggml_reshape_4d(ctx0, z, head_v_dim, num_v_heads, n_seq_tokens, n_seqs);
|
||||
|
||||
// Apply gated normalization: self.norm(core_attn_out, z)
|
||||
ggml_tensor * attn_out_norm = build_norm_gated(attn_out_2d_final, model.layers[il].ssm_norm, z_2d, il);
|
||||
ggml_tensor * attn_out_norm = build_norm_gated(output, model.layers[il].ssm_norm, z_2d, il);
|
||||
|
||||
// Final reshape: [head_dim, n_heads, n_tokens, n_seqs] -> [n_tokens, n_seqs, n_heads * head_dim]
|
||||
ggml_tensor * final_output = ggml_reshape_3d(ctx0, attn_out_norm, head_v_dim * num_v_heads, n_seq_tokens, n_seqs);
|
||||
@@ -722,7 +364,8 @@ ggml_tensor * llm_build_qwen35moe ::build_layer_attn_linear(
|
||||
cb(cur, "linear_attn_out", il);
|
||||
|
||||
// Reshape back to original dimensions
|
||||
cur = ggml_cont_2d(ctx0, cur, n_embd, n_seq_tokens * n_seqs);
|
||||
cur = ggml_reshape_2d(ctx0, cur, n_embd, n_seq_tokens * n_seqs);
|
||||
|
||||
return cur;
|
||||
}
|
||||
|
||||
|
||||
@@ -306,8 +306,6 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
|
||||
|
||||
ggml_tensor * beta = ggml_sigmoid(ctx0, b);
|
||||
|
||||
beta = ggml_reshape_4d(ctx0, beta, num_v_heads, 1, n_seq_tokens, n_seqs);
|
||||
|
||||
// Reshape a to merge head dimensions: [batch, seq_len, num_k_heads, num_v_heads/num_k_heads] -> [batch, seq_len, num_v_heads]
|
||||
ggml_tensor * alpha = ggml_cont_3d(ctx0, a, num_v_heads, n_seq_tokens, n_seqs);
|
||||
|
||||
@@ -318,6 +316,9 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
|
||||
ggml_tensor * gate = ggml_mul(ctx0, alpha_softplus, model.layers[il].ssm_a); // -A_log.exp() * softplus
|
||||
cb(gate, "gate", il);
|
||||
|
||||
beta = ggml_reshape_4d(ctx0, beta, 1, num_v_heads, n_seq_tokens, n_seqs);
|
||||
gate = ggml_reshape_4d(ctx0, gate, 1, num_v_heads, n_seq_tokens, n_seqs);
|
||||
|
||||
// Get convolution states from cache
|
||||
ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
|
||||
ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
|
||||
|
||||
@@ -7791,6 +7791,10 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 67, {1, 1}, {4, 1}, {0, 2, 1, 3}));
|
||||
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F32, GGML_TYPE_F32, 16, 32, 32, { 1, 1}, {1, 1}, {0, 1, 2, 3}, 64, 3));
|
||||
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F32, GGML_TYPE_F32, 64, 77, 77, {12,1}, {1,1}));
|
||||
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 2, 1, 3, {128, 1024}, {1, 1}));
|
||||
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 2, 3, 4, {128, 1024}, {1, 1}));
|
||||
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 2, 1, 3, {128*1024, 1}, {1, 1}, {0, 2, 1, 3}));
|
||||
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 2, 1, 3, {128*1024, 1}, {1, 1}, {0, 1, 2, 3}, 64));
|
||||
|
||||
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q4_0, GGML_TYPE_F32, 576, 512, 576, {1,1}, {1,1}));
|
||||
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q4_0, GGML_TYPE_F32, 1, 2048, 8192, {1, 1}, {1, 1}));
|
||||
|
||||
@@ -229,6 +229,20 @@ common_chat_tool python_tool {
|
||||
"required": ["code"]
|
||||
})",
|
||||
};
|
||||
common_chat_tool todo_list_tool {
|
||||
/* .name = */ "todo_list",
|
||||
/* .description = */ "Create or update the todo list",
|
||||
/* .parameters = */ R"({
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"todos": {
|
||||
"type": "array",
|
||||
"description": "List of TODO list items"
|
||||
}
|
||||
},
|
||||
"required": ["todos"]
|
||||
})",
|
||||
};
|
||||
common_chat_tool code_interpreter_tool {
|
||||
/* .name = */ "code_interpreter",
|
||||
/* .description = */ "an ipython interpreter",
|
||||
@@ -3018,540 +3032,26 @@ Hey there!<|im_end|>
|
||||
);
|
||||
}
|
||||
|
||||
// Test Qwen3-Coder XML format
|
||||
{
|
||||
// Basic XML tool call parsing
|
||||
assert_msg_equals(
|
||||
message_assist_call,
|
||||
test_chat_parse(
|
||||
"<tool_call>\n"
|
||||
" <function=special_function>\n"
|
||||
" <parameter=arg1>\n"
|
||||
" 1\n"
|
||||
" </parameter>\n"
|
||||
" </function>\n"
|
||||
"</tool_call>",
|
||||
/* is_partial= */ false,
|
||||
{COMMON_CHAT_FORMAT_QWEN3_CODER_XML}));
|
||||
// Step-3.5-Flash template: uses same XML output format as Qwen3-Coder and Nemotron v3,
|
||||
// but with <think> support. Routes to the Nemotron v3 PEG parser for streaming and
|
||||
// schema-aware parameter parsing.
|
||||
auto tmpls = read_templates("models/templates/stepfun-ai-Step-3.5-Flash.jinja");
|
||||
assert_equals(COMMON_CHAT_FORMAT_PEG_CONSTRUCTED, common_chat_templates_apply(tmpls.get(), inputs_tools).format);
|
||||
|
||||
// Multiple parameters with different types
|
||||
common_chat_msg expected_multi_param;
|
||||
expected_multi_param.role = "assistant";
|
||||
expected_multi_param.tool_calls = {
|
||||
{ "complex_function", "{\"name\":\"John Doe\",\"age\":30,\"active\":true,\"score\":95.5}", "" }
|
||||
};
|
||||
|
||||
test_parser_with_streaming(expected_multi_param,
|
||||
"<tool_call>\n"
|
||||
" <function=complex_function>\n"
|
||||
" <parameter=name>\n"
|
||||
" John Doe\n"
|
||||
" </parameter>\n"
|
||||
" <parameter=age>\n"
|
||||
" 30\n"
|
||||
" </parameter>\n"
|
||||
" <parameter=active>\n"
|
||||
" true\n"
|
||||
" </parameter>\n"
|
||||
" <parameter=score>\n"
|
||||
" 95.5\n"
|
||||
" </parameter>\n"
|
||||
" </function>\n"
|
||||
"</tool_call>",
|
||||
[&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
|
||||
|
||||
// Special characters and Unicode
|
||||
common_chat_msg expected_special_chars;
|
||||
expected_special_chars.role = "assistant";
|
||||
expected_special_chars.tool_calls = {
|
||||
{ "unicode_function", "{\"message\":\"Hello 世界! 🌍 Special chars: @#$%^&*()\"}", "" }
|
||||
};
|
||||
|
||||
test_parser_with_streaming(expected_special_chars,
|
||||
"<tool_call>\n"
|
||||
" <function=unicode_function>\n"
|
||||
" <parameter=message>\n"
|
||||
" Hello 世界! 🌍 Special chars: @#$%^&*()\n"
|
||||
" </parameter>\n"
|
||||
" </function>\n"
|
||||
"</tool_call>",
|
||||
[&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
|
||||
|
||||
// Multiline content with newlines and indentation
|
||||
common_chat_msg expected_multiline;
|
||||
expected_multiline.role = "assistant";
|
||||
expected_multiline.tool_calls = {
|
||||
{ "code_function", "{\"code\":\"def hello():\\n print(\\\"Hello, World!\\\")\\n return True\"}", "" }
|
||||
};
|
||||
|
||||
test_parser_with_streaming(expected_multiline,
|
||||
"<tool_call>\n"
|
||||
" <function=code_function>\n"
|
||||
" <parameter=code>\n"
|
||||
"def hello():\n"
|
||||
" print(\"Hello, World!\")\n"
|
||||
" return True\n"
|
||||
" </parameter>\n"
|
||||
" </function>\n"
|
||||
"</tool_call>",
|
||||
[&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
|
||||
|
||||
// JSON object as parameter value
|
||||
common_chat_msg expected_json_param;
|
||||
expected_json_param.role = "assistant";
|
||||
expected_json_param.tool_calls = {
|
||||
{ "json_function", "{\"config\":{\"host\":\"localhost\",\"port\":8080,\"ssl\":false}}", "" }
|
||||
};
|
||||
|
||||
test_parser_with_streaming(
|
||||
expected_json_param,
|
||||
"<tool_call>\n"
|
||||
" <function=json_function>\n"
|
||||
" <parameter=config>\n"
|
||||
" {\"host\": \"localhost\", \"port\": 8080, \"ssl\": false}\n"
|
||||
" </parameter>\n"
|
||||
" </function>\n"
|
||||
"</tool_call>",
|
||||
[&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
|
||||
|
||||
// Array as parameter value
|
||||
common_chat_msg expected_array_param;
|
||||
expected_array_param.role = "assistant";
|
||||
expected_array_param.tool_calls = {
|
||||
{ "array_function", "{\"items\":[\"apple\",\"banana\",\"cherry\"]}", "" }
|
||||
};
|
||||
|
||||
test_parser_with_streaming(
|
||||
expected_array_param,
|
||||
"<tool_call>\n"
|
||||
" <function=array_function>\n"
|
||||
" <parameter=items>\n"
|
||||
" [\"apple\", \"banana\", \"cherry\"]\n"
|
||||
" </parameter>\n"
|
||||
" </function>\n"
|
||||
"</tool_call>",
|
||||
[&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
|
||||
|
||||
// Empty parameter
|
||||
common_chat_msg expected_empty_param;
|
||||
expected_empty_param.role = "assistant";
|
||||
expected_empty_param.tool_calls = {
|
||||
{ "empty_function", "{\"empty_param\":\"\"}", "" }
|
||||
};
|
||||
|
||||
test_parser_with_streaming(
|
||||
expected_empty_param,
|
||||
"<tool_call>\n"
|
||||
" <function=empty_function>\n"
|
||||
" <parameter=empty_param>\n"
|
||||
" </parameter>\n"
|
||||
" </function>\n"
|
||||
"</tool_call>",
|
||||
[&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
|
||||
|
||||
// Boolean values (true/false)
|
||||
common_chat_msg expected_boolean;
|
||||
expected_boolean.role = "assistant";
|
||||
expected_boolean.tool_calls = {
|
||||
{ "boolean_function", "{\"enabled\":true,\"debug\":false}", "" }
|
||||
};
|
||||
|
||||
test_parser_with_streaming(
|
||||
expected_boolean,
|
||||
"<tool_call>\n"
|
||||
" <function=boolean_function>\n"
|
||||
" <parameter=enabled>\n"
|
||||
" true\n"
|
||||
" </parameter>\n"
|
||||
" <parameter=debug>\n"
|
||||
" false\n"
|
||||
" </parameter>\n"
|
||||
" </function>\n"
|
||||
"</tool_call>",
|
||||
[&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
|
||||
|
||||
// Null value
|
||||
common_chat_msg expected_null;
|
||||
expected_null.role = "assistant";
|
||||
expected_null.tool_calls = {
|
||||
{ "null_function", "{\"optional_param\":null}", "" }
|
||||
};
|
||||
|
||||
test_parser_with_streaming(
|
||||
expected_null,
|
||||
"<tool_call>\n"
|
||||
" <function=null_function>\n"
|
||||
" <parameter=optional_param>\n"
|
||||
" null\n"
|
||||
" </parameter>\n"
|
||||
" </function>\n"
|
||||
"</tool_call>",
|
||||
[&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
|
||||
|
||||
// Negative numbers and scientific notation
|
||||
common_chat_msg expected_numbers;
|
||||
expected_numbers.role = "assistant";
|
||||
expected_numbers.tool_calls = {
|
||||
{ "math_function", "{\"negative\":-42,\"decimal\":-3.14,\"scientific\":1.23e-4}", "" }
|
||||
};
|
||||
|
||||
test_parser_with_streaming(
|
||||
expected_numbers,
|
||||
"<tool_call>\n"
|
||||
" <function=math_function>\n"
|
||||
" <parameter=negative>\n"
|
||||
" -42\n"
|
||||
" </parameter>\n"
|
||||
" <parameter=decimal>\n"
|
||||
" -3.14\n"
|
||||
" </parameter>\n"
|
||||
" <parameter=scientific>\n"
|
||||
" 1.23e-4\n"
|
||||
" </parameter>\n"
|
||||
" </function>\n"
|
||||
"</tool_call>",
|
||||
[&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
|
||||
|
||||
// XML-like content in parameters (should be escaped)
|
||||
common_chat_msg expected_xml_content;
|
||||
expected_xml_content.role = "assistant";
|
||||
expected_xml_content.tool_calls = {
|
||||
{ "xml_function", "{\"xml_content\":\"<root><item>value</item></root>\"}", "" }
|
||||
};
|
||||
|
||||
test_parser_with_streaming(
|
||||
expected_xml_content,
|
||||
"<tool_call>\n"
|
||||
" <function=xml_function>\n"
|
||||
" <parameter=xml_content>\n"
|
||||
" <root><item>value</item></root>\n"
|
||||
" </parameter>\n"
|
||||
" </function>\n"
|
||||
"</tool_call>",
|
||||
[&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
|
||||
|
||||
// Quotes and escape characters
|
||||
common_chat_msg expected_quotes;
|
||||
expected_quotes.role = "assistant";
|
||||
expected_quotes.tool_calls = {
|
||||
{ "quote_function", "{\"message\":\"She said \\\"Hello!\\\" and left.\"}", "" }
|
||||
};
|
||||
|
||||
test_parser_with_streaming(
|
||||
expected_quotes,
|
||||
"<tool_call>\n"
|
||||
" <function=quote_function>\n"
|
||||
" <parameter=message>\n"
|
||||
" She said \"Hello!\" and left.\n"
|
||||
" </parameter>\n"
|
||||
" </function>\n"
|
||||
"</tool_call>",
|
||||
[&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
|
||||
|
||||
// Long parameter value (simplified)
|
||||
std::string long_text = "This is a long text parameter that should test the parser's ability to handle larger amounts of text data.";
|
||||
|
||||
common_chat_msg expected_long_text;
|
||||
expected_long_text.role = "assistant";
|
||||
expected_long_text.tool_calls = {
|
||||
{ "long_function", "{\"long_text\":\"" + long_text + "\"}", "" }
|
||||
};
|
||||
|
||||
test_parser_with_streaming(
|
||||
expected_long_text,
|
||||
"<tool_call>\n"
|
||||
" <function=long_function>\n"
|
||||
" <parameter=long_text>\n"
|
||||
" " + long_text + "\n"
|
||||
" </parameter>\n"
|
||||
" </function>\n"
|
||||
"</tool_call>",
|
||||
[&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
|
||||
|
||||
// Mixed content with text before and after tool call
|
||||
common_chat_msg expected_mixed_content;
|
||||
expected_mixed_content.role = "assistant";
|
||||
expected_mixed_content.content = "I'll help you search for products. ";
|
||||
expected_mixed_content.tool_calls = {
|
||||
{ "search_function", "{\"query\":\"laptops\"}", "" }
|
||||
};
|
||||
|
||||
test_parser_with_streaming(
|
||||
expected_mixed_content,
|
||||
"I'll help you search for products. <tool_call>\n"
|
||||
" <function=search_function>\n"
|
||||
" <parameter=query>\n"
|
||||
" laptops\n"
|
||||
" </parameter>\n"
|
||||
" </function>\n"
|
||||
"</tool_call>",
|
||||
[&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
|
||||
|
||||
// Compact format (no extra whitespace)
|
||||
common_chat_msg expected_compact;
|
||||
expected_compact.role = "assistant";
|
||||
expected_compact.tool_calls = {
|
||||
{ "compact_function", "{\"param\":\"value\"}", "" }
|
||||
};
|
||||
|
||||
test_parser_with_streaming(
|
||||
expected_compact,
|
||||
"<tool_call><function=compact_function><parameter=param>value</parameter></function></tool_call>",
|
||||
[&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
|
||||
|
||||
// Function name with underscores and numbers
|
||||
common_chat_msg expected_complex_name;
|
||||
expected_complex_name.role = "assistant";
|
||||
expected_complex_name.tool_calls = {
|
||||
{ "get_user_data_v2", "{\"user_id\":12345}", "" }
|
||||
};
|
||||
|
||||
test_parser_with_streaming(
|
||||
expected_complex_name,
|
||||
"<tool_call>\n"
|
||||
" <function=get_user_data_v2>\n"
|
||||
" <parameter=user_id>\n"
|
||||
" 12345\n"
|
||||
" </parameter>\n"
|
||||
" </function>\n"
|
||||
"</tool_call>",
|
||||
[&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
|
||||
|
||||
// Parameter names with underscores and numbers
|
||||
common_chat_msg expected_complex_params;
|
||||
expected_complex_params.role = "assistant";
|
||||
expected_complex_params.tool_calls = {
|
||||
{ "test_function", "{\"param_1\":\"value1\",\"param_2_name\":\"value2\",\"param3\":123}", "" }
|
||||
};
|
||||
|
||||
test_parser_with_streaming(
|
||||
expected_complex_params,
|
||||
"<tool_call>\n"
|
||||
" <function=test_function>\n"
|
||||
" <parameter=param_1>\n"
|
||||
" value1\n"
|
||||
" </parameter>\n"
|
||||
" <parameter=param_2_name>\n"
|
||||
" value2\n"
|
||||
" </parameter>\n"
|
||||
" <parameter=param3>\n"
|
||||
" 123\n"
|
||||
" </parameter>\n"
|
||||
" </function>\n"
|
||||
"</tool_call>",
|
||||
[&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
|
||||
|
||||
// Very deeply nested XML content in parameter
|
||||
common_chat_msg expected_deep_xml;
|
||||
expected_deep_xml.role = "assistant";
|
||||
expected_deep_xml.tool_calls = {
|
||||
{ "xml_parser", "{\"xml\":\"<root><level1><level2><level3>deep content</level3></level2></level1></root>\"}", "" }
|
||||
};
|
||||
|
||||
test_parser_with_streaming(
|
||||
expected_deep_xml,
|
||||
"<tool_call>\n"
|
||||
" <function=xml_parser>\n"
|
||||
" <parameter=xml>\n"
|
||||
" <root><level1><level2><level3>deep content</level3></level2></level1></root>\n"
|
||||
" </parameter>\n"
|
||||
" </function>\n"
|
||||
"</tool_call>",
|
||||
[&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
|
||||
|
||||
// Parameter with only whitespace
|
||||
common_chat_msg expected_whitespace_param;
|
||||
expected_whitespace_param.role = "assistant";
|
||||
expected_whitespace_param.tool_calls = {
|
||||
{ "whitespace_function", "{\"spaces\":\"\"}", "" }
|
||||
};
|
||||
|
||||
test_parser_with_streaming(
|
||||
expected_whitespace_param,
|
||||
"<tool_call>\n"
|
||||
" <function=whitespace_function>\n"
|
||||
" <parameter=spaces>\n"
|
||||
" \n"
|
||||
" </parameter>\n"
|
||||
" </function>\n"
|
||||
"</tool_call>",
|
||||
[&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
|
||||
|
||||
// Parameter with tabs and mixed whitespace
|
||||
common_chat_msg expected_mixed_whitespace;
|
||||
expected_mixed_whitespace.role = "assistant";
|
||||
expected_mixed_whitespace.tool_calls = {
|
||||
{ "tab_function", "{\"content\":\"line1\\n\\tindented line\\n spaces\"}", "" }
|
||||
};
|
||||
|
||||
test_parser_with_streaming(
|
||||
expected_mixed_whitespace,
|
||||
"<tool_call>\n"
|
||||
" <function=tab_function>\n"
|
||||
" <parameter=content>\n"
|
||||
"line1\n"
|
||||
"\tindented line\n"
|
||||
" spaces\n"
|
||||
" </parameter>\n"
|
||||
" </function>\n"
|
||||
"</tool_call>",
|
||||
[&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
|
||||
|
||||
// Control characters and special Unicode
|
||||
common_chat_msg expected_control_chars;
|
||||
expected_control_chars.role = "assistant";
|
||||
expected_control_chars.tool_calls = {
|
||||
{ "control_function", "{\"text\":\"Line1\\nLine2\\tTabbed\\rCarriage return\"}", "" }
|
||||
};
|
||||
|
||||
test_parser_with_streaming(
|
||||
expected_control_chars,
|
||||
"<tool_call>\n"
|
||||
" <function=control_function>\n"
|
||||
" <parameter=text>\n"
|
||||
"Line1\nLine2\tTabbed\rCarriage return\n"
|
||||
" </parameter>\n"
|
||||
" </function>\n"
|
||||
"</tool_call>",
|
||||
[&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
|
||||
|
||||
// Emoji and extended Unicode characters
|
||||
common_chat_msg expected_emoji;
|
||||
expected_emoji.role = "assistant";
|
||||
expected_emoji.tool_calls = {
|
||||
{ "emoji_function", "{\"message\":\"Hello! 👋 🌟 🚀 Testing emojis: 😀😃😄😁 and symbols: ∑∏∆∇\"}", "" }
|
||||
};
|
||||
|
||||
test_parser_with_streaming(
|
||||
expected_emoji,
|
||||
"<tool_call>\n"
|
||||
" <function=emoji_function>\n"
|
||||
" <parameter=message>\n"
|
||||
" Hello! 👋 🌟 🚀 Testing emojis: 😀😃😄😁 and symbols: ∑∏∆∇\n"
|
||||
" </parameter>\n"
|
||||
" </function>\n"
|
||||
"</tool_call>",
|
||||
[&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
|
||||
|
||||
// Mathematical expressions and formulas
|
||||
common_chat_msg expected_math;
|
||||
expected_math.role = "assistant";
|
||||
expected_math.tool_calls = {
|
||||
{ "math_function", "{\"formula\":\"E = mc² and ∫f(x)dx = F(x) + C\"}", "" }
|
||||
};
|
||||
|
||||
test_parser_with_streaming(
|
||||
expected_math,
|
||||
"<tool_call>\n"
|
||||
" <function=math_function>\n"
|
||||
" <parameter=formula>\n"
|
||||
" E = mc² and ∫f(x)dx = F(x) + C\n"
|
||||
" </parameter>\n"
|
||||
" </function>\n"
|
||||
"</tool_call>",
|
||||
[&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
|
||||
|
||||
// SQL injection-like content (should be safely escaped)
|
||||
common_chat_msg expected_sql;
|
||||
expected_sql.role = "assistant";
|
||||
expected_sql.tool_calls = {
|
||||
{ "sql_function", "{\"query\":\"SELECT * FROM users WHERE id = 1; DROP TABLE users; --\"}", "" }
|
||||
};
|
||||
|
||||
test_parser_with_streaming(
|
||||
expected_sql,
|
||||
"<tool_call>\n"
|
||||
" <function=sql_function>\n"
|
||||
" <parameter=query>\n"
|
||||
" SELECT * FROM users WHERE id = 1; DROP TABLE users; --\n"
|
||||
" </parameter>\n"
|
||||
" </function>\n"
|
||||
"</tool_call>",
|
||||
[&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
|
||||
|
||||
// HTML/XML injection content
|
||||
common_chat_msg expected_html;
|
||||
expected_html.role = "assistant";
|
||||
expected_html.tool_calls = {
|
||||
{ "html_function", "{\"content\":\"<script>alert('xss')</script><img src=x onerror=alert(1)>\"}", "" }
|
||||
};
|
||||
|
||||
test_parser_with_streaming(
|
||||
expected_html,
|
||||
"<tool_call>\n"
|
||||
" <function=html_function>\n"
|
||||
" <parameter=content>\n"
|
||||
" <script>alert('xss')</script><img src=x onerror=alert(1)>\n"
|
||||
" </parameter>\n"
|
||||
" </function>\n"
|
||||
"</tool_call>",
|
||||
[&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
|
||||
|
||||
// Binary-like content (base64)
|
||||
common_chat_msg expected_binary;
|
||||
expected_binary.role = "assistant";
|
||||
expected_binary.tool_calls = {
|
||||
{ "binary_function", "{\"data\":\"SGVsbG8gV29ybGQhIFRoaXMgaXMgYmFzZTY0IGVuY29kZWQgdGV4dC4=\"}", "" }
|
||||
};
|
||||
|
||||
test_parser_with_streaming(
|
||||
expected_binary,
|
||||
"<tool_call>\n"
|
||||
" <function=binary_function>\n"
|
||||
" <parameter=data>\n"
|
||||
" SGVsbG8gV29ybGQhIFRoaXMgaXMgYmFzZTY0IGVuY29kZWQgdGV4dC4=\n"
|
||||
" </parameter>\n"
|
||||
" </function>\n"
|
||||
"</tool_call>",
|
||||
[&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
|
||||
|
||||
// Very large numbers (should be parsed as scientific notation)
|
||||
common_chat_msg expected_large_numbers;
|
||||
expected_large_numbers.role = "assistant";
|
||||
expected_large_numbers.tool_calls = {
|
||||
{ "number_function", "{\"big_int\":1e+60}", "" } // Large number becomes scientific notation
|
||||
};
|
||||
|
||||
test_parser_with_streaming(
|
||||
expected_large_numbers,
|
||||
"<tool_call>\n"
|
||||
" <function=number_function>\n"
|
||||
" <parameter=big_int>\n"
|
||||
" 999999999999999999999999999999999999999999999999999999999999\n"
|
||||
" </parameter>\n"
|
||||
" </function>\n"
|
||||
"</tool_call>",
|
||||
[&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
|
||||
}
|
||||
|
||||
{
|
||||
// Qwen3-Coder template
|
||||
auto tmpls = read_templates("models/templates/Qwen3-Coder.jinja");
|
||||
common_chat_templates_inputs inputs;
|
||||
inputs.messages = { message_user };
|
||||
|
||||
common_chat_tool qwen_union_tool {
|
||||
/* .name = */ "qwen_union",
|
||||
/* .description = */ "Test tool for union/anyOf handling",
|
||||
/* .parameters = */ R"({
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"priority": { "type": ["number", "null"] },
|
||||
"maybe_text": { "anyOf": [ { "type": "string" } ] },
|
||||
"config": { "anyOf": [ { "type": "object" }, { "type": "null" } ] }
|
||||
},
|
||||
"required": []
|
||||
})",
|
||||
};
|
||||
inputs.tools = { qwen_union_tool };
|
||||
|
||||
auto params = common_chat_templates_apply(tmpls.get(), inputs);
|
||||
assert_equals(COMMON_CHAT_FORMAT_QWEN3_CODER_XML, params.format);
|
||||
assert_equals(false, params.grammar.empty());
|
||||
|
||||
// Grammar should compile successfully
|
||||
auto grammar = build_grammar(params.grammar);
|
||||
GGML_ASSERT(grammar && "Failed to build Qwen3-Coder grammar with union types");
|
||||
// Grammar and PEG parser should be generated with thinking_forced_open
|
||||
{
|
||||
common_chat_templates_inputs inputs;
|
||||
inputs.messages = { message_user };
|
||||
inputs.tools = { special_function_tool };
|
||||
auto params = common_chat_templates_apply(tmpls.get(), inputs);
|
||||
assert_equals(COMMON_CHAT_FORMAT_PEG_CONSTRUCTED, params.format);
|
||||
assert_equals(true, params.thinking_forced_open);
|
||||
assert_equals(false, params.grammar.empty());
|
||||
assert_equals(false, params.parser.empty());
|
||||
auto grammar = build_grammar(params.grammar);
|
||||
GGML_ASSERT(grammar && "Failed to build Step-3.5-Flash grammar");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3643,6 +3143,135 @@ static void test_template_output_peg_parsers() {
|
||||
});
|
||||
}
|
||||
|
||||
{
|
||||
// Qwen3-Coder
|
||||
auto tmpls = read_templates("models/templates/Qwen3-Coder.jinja");
|
||||
|
||||
// Test basic message
|
||||
test_peg_parser(tmpls.get(), [&](auto & t) {
|
||||
t.input = "Hello, world!\nWhat's up?";
|
||||
t.expect = message_assist;
|
||||
});
|
||||
|
||||
// Test tool call
|
||||
test_peg_parser(tmpls.get(), [&](auto & t) {
|
||||
t.input =
|
||||
"<tool_call>\n"
|
||||
"<function=special_function>\n"
|
||||
"<parameter=arg1>\n"
|
||||
"1\n"
|
||||
"</parameter>\n"
|
||||
"</function>\n"
|
||||
"</tool_call>";
|
||||
t.params.tools = {special_function_tool};
|
||||
t.expect = message_assist_call;
|
||||
});
|
||||
|
||||
// Test parallel tool calls
|
||||
test_peg_parser(tmpls.get(), [&](auto & t) {
|
||||
t.input =
|
||||
"<tool_call>\n"
|
||||
"<function=special_function>\n"
|
||||
"<parameter=arg1>\n"
|
||||
"1\n"
|
||||
"</parameter>\n"
|
||||
"</function>\n"
|
||||
"</tool_call>\n"
|
||||
"<tool_call>\n"
|
||||
"<function=special_function_with_opt>\n"
|
||||
"<parameter=arg1>\n"
|
||||
"1\n"
|
||||
"</parameter>\n"
|
||||
"<parameter=arg2>\n"
|
||||
"2\n"
|
||||
"</parameter>\n"
|
||||
"</function>\n"
|
||||
"</tool_call>";
|
||||
t.params.parallel_tool_calls = true;
|
||||
t.params.tools = {special_function_tool, special_function_tool_with_optional_param};
|
||||
|
||||
t.expect.tool_calls = {{
|
||||
/* .name = */ "special_function",
|
||||
/* .arguments = */ R"({"arg1": 1})",
|
||||
/* .id = */ {},
|
||||
}, {
|
||||
/* .name = */ "special_function_with_opt",
|
||||
/* .arguments = */ R"({"arg1": 1, "arg2": 2})",
|
||||
/* .id = */ {},
|
||||
}};
|
||||
});
|
||||
|
||||
// Test tool call with string parameter
|
||||
test_peg_parser(tmpls.get(), [&](auto & t) {
|
||||
t.input =
|
||||
"<tool_call>\n"
|
||||
"<function=python>\n"
|
||||
"<parameter=code>\n"
|
||||
"def hello():\n"
|
||||
" print(\"Hello, world!\")\n"
|
||||
"\n"
|
||||
"hello()\n"
|
||||
"</parameter>\n"
|
||||
"</function>\n"
|
||||
"</tool_call>";
|
||||
t.params.tools = {python_tool};
|
||||
|
||||
t.expect.tool_calls = {{
|
||||
/* .name = */ "python",
|
||||
/* .arguments = */ "{\"code\": \"def hello():\\n print(\\\"Hello, world!\\\")\\n\\nhello()\"}",
|
||||
/* .id = */ {},
|
||||
}};
|
||||
});
|
||||
|
||||
// Test tool call with JSON parameter
|
||||
test_peg_parser(tmpls.get(), [&](auto & t) {
|
||||
t.input =
|
||||
"<tool_call>\n"
|
||||
"<function=todo_list>\n"
|
||||
"<parameter=todos>\n"
|
||||
"[{\"item\": \"Check stuff\", \"selected\": false}, {\"item\": \"Prepare stuff\", \"selected\": true}]\n"
|
||||
"</parameter>\n"
|
||||
"</function>\n"
|
||||
"</tool_call>";
|
||||
t.params.tools = {todo_list_tool};
|
||||
|
||||
t.expect.tool_calls = {{
|
||||
/* .name = */ "todo_list",
|
||||
/* .arguments = */ "{\"todos\": [{\"item\": \"Check stuff\", \"selected\": false}, {\"item\": \"Prepare stuff\", \"selected\": true}]}",
|
||||
/* .id = */ {},
|
||||
}};
|
||||
});
|
||||
|
||||
// Test tool call with string parameter and no closing </parameter> tag
|
||||
test_peg_parser(tmpls.get(), [&](auto & t) {
|
||||
t.input =
|
||||
"<tool_call>\n"
|
||||
"<function=python>\n"
|
||||
"<parameter=code>\n"
|
||||
"def hello():\n"
|
||||
" print(\"Hello, world!\")\n"
|
||||
"\n"
|
||||
"hello()\n"
|
||||
"</function>\n"
|
||||
"</tool_call>";
|
||||
t.params.tools = {python_tool};
|
||||
|
||||
t.expect.tool_calls = {{
|
||||
/* .name = */ "python",
|
||||
/* .arguments = */ "{\"code\": \"def hello():\\n print(\\\"Hello, world!\\\")\\n\\nhello()\"}",
|
||||
/* .id = */ {},
|
||||
}};
|
||||
});
|
||||
|
||||
// Test response format
|
||||
test_peg_parser(tmpls.get(), [&](auto & t) {
|
||||
t.input = R"({"amount": 123.45, "date": "2025-12-03"})";
|
||||
t.params.json_schema = invoice_schema;
|
||||
|
||||
t.expect.content = R"({"amount": 123.45, "date": "2025-12-03"})";
|
||||
});
|
||||
}
|
||||
|
||||
{
|
||||
// NVIDIA Nemotron-3 Nano
|
||||
auto tmpls = read_templates("models/templates/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.jinja");
|
||||
@@ -3799,6 +3428,196 @@ static void test_template_output_peg_parsers() {
|
||||
});
|
||||
}
|
||||
|
||||
{
|
||||
// Step-3.5-Flash (uses Nemotron v3 PEG parser with thinking_forced_open)
|
||||
// Unlike Nemotron, Step-3.5-Flash always emits <think> regardless of enable_thinking,
|
||||
// so all inputs must include a </think> delimiter.
|
||||
auto tmpls = read_templates("models/templates/stepfun-ai-Step-3.5-Flash.jinja");
|
||||
|
||||
// Test basic message with reasoning
|
||||
test_peg_parser(tmpls.get(), [&](auto & t) {
|
||||
t.input = "I'm\nthinking\n</think>\nHello, world!\nWhat's up?";
|
||||
t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
|
||||
|
||||
t.expect = message_assist_thoughts;
|
||||
});
|
||||
|
||||
// Test basic message without thinking content
|
||||
test_peg_parser(tmpls.get(), [&](auto & t) {
|
||||
t.input = "</think>\nHello, world!\nWhat's up?";
|
||||
t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
|
||||
|
||||
t.expect = message_assist;
|
||||
});
|
||||
|
||||
// Test tool call without thinking content
|
||||
test_peg_parser(tmpls.get(), [&](auto & t) {
|
||||
t.input =
|
||||
"</think>\n"
|
||||
"<tool_call>\n"
|
||||
"<function=special_function>\n"
|
||||
"<parameter=arg1>\n"
|
||||
"1\n"
|
||||
"</parameter>\n"
|
||||
"</function>\n"
|
||||
"</tool_call>";
|
||||
t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
|
||||
t.params.tools = {special_function_tool};
|
||||
|
||||
t.expect = message_assist_call;
|
||||
});
|
||||
|
||||
// Test tool call with thinking
|
||||
test_peg_parser(tmpls.get(), [&](auto & t) {
|
||||
t.input =
|
||||
"I'm\nthinking\n</think>\n"
|
||||
"<tool_call>\n"
|
||||
"<function=special_function>\n"
|
||||
"<parameter=arg1>\n"
|
||||
"1\n"
|
||||
"</parameter>\n"
|
||||
"</function>\n"
|
||||
"</tool_call>";
|
||||
t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
|
||||
t.params.tools = {special_function_tool};
|
||||
|
||||
t.expect = message_assist_call_thoughts;
|
||||
});
|
||||
|
||||
// Test parallel tool calls with thinking
|
||||
test_peg_parser(tmpls.get(), [&](auto & t) {
|
||||
t.input =
|
||||
"I'm\nthinking\n</think>\n"
|
||||
"<tool_call>\n"
|
||||
"<function=special_function>\n"
|
||||
"<parameter=arg1>\n"
|
||||
"1\n"
|
||||
"</parameter>\n"
|
||||
"</function>\n"
|
||||
"</tool_call>\n"
|
||||
"<tool_call>\n"
|
||||
"<function=special_function_with_opt>\n"
|
||||
"<parameter=arg1>\n"
|
||||
"1\n"
|
||||
"</parameter>\n"
|
||||
"<parameter=arg2>\n"
|
||||
"2\n"
|
||||
"</parameter>\n"
|
||||
"</function>\n"
|
||||
"</tool_call>";
|
||||
t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
|
||||
t.params.parallel_tool_calls = true;
|
||||
t.params.tools = {special_function_tool, special_function_tool_with_optional_param};
|
||||
|
||||
t.expect.reasoning_content = "I'm\nthinking";
|
||||
t.expect.tool_calls = {{
|
||||
/* .name = */ "special_function",
|
||||
/* .arguments = */ R"({"arg1": 1})",
|
||||
/* .id = */ {},
|
||||
}, {
|
||||
/* .name = */ "special_function_with_opt",
|
||||
/* .arguments = */ R"({"arg1": 1, "arg2": 2})",
|
||||
/* .id = */ {},
|
||||
}};
|
||||
});
|
||||
|
||||
// Test parallel tool calls without thinking content
|
||||
test_peg_parser(tmpls.get(), [&](auto & t) {
|
||||
t.input =
|
||||
"</think>\n"
|
||||
"<tool_call>\n"
|
||||
"<function=special_function>\n"
|
||||
"<parameter=arg1>\n"
|
||||
"1\n"
|
||||
"</parameter>\n"
|
||||
"</function>\n"
|
||||
"</tool_call>\n"
|
||||
"<tool_call>\n"
|
||||
"<function=special_function_with_opt>\n"
|
||||
"<parameter=arg1>\n"
|
||||
"1\n"
|
||||
"</parameter>\n"
|
||||
"<parameter=arg2>\n"
|
||||
"2\n"
|
||||
"</parameter>\n"
|
||||
"</function>\n"
|
||||
"</tool_call>";
|
||||
t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
|
||||
t.params.parallel_tool_calls = true;
|
||||
t.params.tools = {special_function_tool, special_function_tool_with_optional_param};
|
||||
|
||||
t.expect.tool_calls = {{
|
||||
/* .name = */ "special_function",
|
||||
/* .arguments = */ R"({"arg1": 1})",
|
||||
/* .id = */ {},
|
||||
}, {
|
||||
/* .name = */ "special_function_with_opt",
|
||||
/* .arguments = */ R"({"arg1": 1, "arg2": 2})",
|
||||
/* .id = */ {},
|
||||
}};
|
||||
});
|
||||
|
||||
// Test tool call with code string parameter
|
||||
test_peg_parser(tmpls.get(), [&](auto & t) {
|
||||
t.input =
|
||||
"</think>\n"
|
||||
"<tool_call>\n"
|
||||
"<function=python>\n"
|
||||
"<parameter=code>\n"
|
||||
"def hello():\n"
|
||||
" print(\"Hello, world!\")\n"
|
||||
"\n"
|
||||
"hello()\n"
|
||||
"</parameter>\n"
|
||||
"</function>\n"
|
||||
"</tool_call>";
|
||||
t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
|
||||
t.params.tools = {python_tool};
|
||||
|
||||
t.expect.tool_calls = {{
|
||||
/* .name = */ "python",
|
||||
/* .arguments = */ "{\"code\": \"def hello():\\n print(\\\"Hello, world!\\\")\\n\\nhello()\"}",
|
||||
/* .id = */ {},
|
||||
}};
|
||||
});
|
||||
|
||||
// Test tool call with string parameter and no closing </parameter> tag
|
||||
test_peg_parser(tmpls.get(), [&](auto & t) {
|
||||
t.input =
|
||||
"</think>\n"
|
||||
"<tool_call>\n"
|
||||
"<function=python>\n"
|
||||
"<parameter=code>\n"
|
||||
"def hello():\n"
|
||||
" print(\"Hello, world!\")\n"
|
||||
"\n"
|
||||
"hello()\n"
|
||||
"</function>\n"
|
||||
"</tool_call>";
|
||||
t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
|
||||
t.params.tools = {python_tool};
|
||||
|
||||
t.expect.tool_calls = {{
|
||||
/* .name = */ "python",
|
||||
/* .arguments = */ "{\"code\": \"def hello():\\n print(\\\"Hello, world!\\\")\\n\\nhello()\"}",
|
||||
/* .id = */ {},
|
||||
}};
|
||||
});
|
||||
|
||||
// Test response format (JSON schema with thinking)
|
||||
test_peg_parser(tmpls.get(), [&](auto & t) {
|
||||
t.input =
|
||||
"I need to output the invoice details in JSON\n"
|
||||
"</think>\n"
|
||||
R"({"amount": 123.45, "date": "2025-12-03"})";
|
||||
t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
|
||||
t.params.json_schema = invoice_schema;
|
||||
|
||||
t.expect.reasoning_content = "I need to output the invoice details in JSON";
|
||||
t.expect.content = R"({"amount": 123.45, "date": "2025-12-03"})";
|
||||
});
|
||||
}
|
||||
|
||||
{
|
||||
// Solar-Open-100B
|
||||
auto tmpls = read_templates("models/templates/upstage-Solar-Open-100B.jinja");
|
||||
|
||||
@@ -691,6 +691,48 @@ static void test_filters(testing & t) {
|
||||
"{\n \"a\": 1,\n \"b\": [\n 1,\n 2\n ]\n}"
|
||||
);
|
||||
|
||||
test_template(t, "indent",
|
||||
"{{ data|indent(2) }}",
|
||||
{{ "data", "foo\nbar" }},
|
||||
"foo\n bar"
|
||||
);
|
||||
|
||||
test_template(t, "indent first only",
|
||||
"{{ data|indent(width=3,first=true) }}",
|
||||
{{ "data", "foo\nbar" }},
|
||||
" foo\n bar"
|
||||
);
|
||||
|
||||
test_template(t, "indent blank lines and first line",
|
||||
"{{ data|indent(width=5,blank=true,first=true) }}",
|
||||
{{ "data", "foo\n\nbar" }},
|
||||
" foo\n \n bar"
|
||||
);
|
||||
|
||||
test_template(t, "indent with default width",
|
||||
"{{ data|indent() }}",
|
||||
{{ "data", "foo\nbar" }},
|
||||
"foo\n bar"
|
||||
);
|
||||
|
||||
test_template(t, "indent with no newline",
|
||||
"{{ data|indent }}",
|
||||
{{ "data", "foo" }},
|
||||
"foo"
|
||||
);
|
||||
|
||||
test_template(t, "indent with trailing newline",
|
||||
"{{ data|indent(blank=true) }}",
|
||||
{{ "data", "foo\n" }},
|
||||
"foo\n "
|
||||
);
|
||||
|
||||
test_template(t, "indent with string",
|
||||
"{{ data|indent(width='>>>>') }}",
|
||||
{{ "data", "foo\nbar" }},
|
||||
"foo\n>>>>bar"
|
||||
);
|
||||
|
||||
test_template(t, "chained filters",
|
||||
"{{ ' HELLO '|trim|lower }}",
|
||||
json::object(),
|
||||
|
||||
@@ -24,6 +24,7 @@ add_library(mtmd
|
||||
models/llama4.cpp
|
||||
models/llava.cpp
|
||||
models/minicpmv.cpp
|
||||
models/paddleocr.cpp
|
||||
models/pixtral.cpp
|
||||
models/qwen2vl.cpp
|
||||
models/qwen3vl.cpp
|
||||
|
||||
@@ -229,6 +229,7 @@ enum projector_type {
|
||||
PROJECTOR_TYPE_MUSIC_FLAMINGO,
|
||||
PROJECTOR_TYPE_LFM2,
|
||||
PROJECTOR_TYPE_KIMIVL,
|
||||
PROJECTOR_TYPE_PADDLEOCR,
|
||||
PROJECTOR_TYPE_LIGHTONOCR,
|
||||
PROJECTOR_TYPE_COGVLM,
|
||||
PROJECTOR_TYPE_JANUS_PRO,
|
||||
@@ -264,6 +265,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
||||
{ PROJECTOR_TYPE_MUSIC_FLAMINGO, "musicflamingo"},
|
||||
{ PROJECTOR_TYPE_LFM2, "lfm2"},
|
||||
{ PROJECTOR_TYPE_KIMIVL, "kimivl"},
|
||||
{ PROJECTOR_TYPE_PADDLEOCR, "paddleocr"},
|
||||
{ PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"},
|
||||
{ PROJECTOR_TYPE_COGVLM, "cogvlm"},
|
||||
{ PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
|
||||
|
||||
@@ -628,9 +628,6 @@ ggml_tensor * clip_graph::build_attn(
|
||||
ggml_tensor * v = ggml_permute(ctx0, v_cur, 1, 2, 0, 3);
|
||||
v = ggml_cont(ctx0, v);
|
||||
|
||||
const auto n_tokens = q->ne[1];
|
||||
const auto n_head = q->ne[2];
|
||||
|
||||
ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
|
||||
// F32 may not needed for vision encoders?
|
||||
// ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
|
||||
@@ -639,7 +636,7 @@ ggml_tensor * clip_graph::build_attn(
|
||||
|
||||
ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq);
|
||||
cur = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
|
||||
cur = ggml_cont_2d(ctx0, cur, cur->ne[0]*n_head, n_tokens);
|
||||
cur = ggml_cont_2d(ctx0, cur, cur->ne[0] * cur->ne[1], cur->ne[2] * cur->ne[3]);
|
||||
}
|
||||
|
||||
cb(cur, "kqv_out", il);
|
||||
@@ -844,6 +841,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||
{
|
||||
builder = std::make_unique<clip_graph_kimivl>(ctx, img);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_PADDLEOCR:
|
||||
{
|
||||
builder = std::make_unique<clip_graph_paddleocr>(ctx, img);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_KIMIK25:
|
||||
{
|
||||
builder = std::make_unique<clip_graph_kimik25>(ctx, img);
|
||||
@@ -1259,6 +1260,14 @@ struct clip_model_loader {
|
||||
hparams.audio_window_len = 400;
|
||||
hparams.audio_hop_len = 160;
|
||||
} break;
|
||||
case PROJECTOR_TYPE_PADDLEOCR:
|
||||
{
|
||||
hparams.n_merge = 2;
|
||||
get_u32(KEY_IMAGE_MIN_PIXELS, hparams.image_min_pixels);
|
||||
get_u32(KEY_IMAGE_MAX_PIXELS, hparams.image_max_pixels);
|
||||
|
||||
hparams.set_warmup_n_tokens(28*28); // avoid OOM on warmup
|
||||
} break;
|
||||
case PROJECTOR_TYPE_LFM2A:
|
||||
{
|
||||
// audio preprocessing params
|
||||
@@ -1707,6 +1716,7 @@ struct clip_model_loader {
|
||||
model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
|
||||
} break;
|
||||
case PROJECTOR_TYPE_KIMIVL:
|
||||
case PROJECTOR_TYPE_PADDLEOCR:
|
||||
case PROJECTOR_TYPE_KIMIK25:
|
||||
{
|
||||
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM);
|
||||
@@ -2993,6 +3003,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
|
||||
case PROJECTOR_TYPE_QWEN25VL:
|
||||
case PROJECTOR_TYPE_QWEN3VL:
|
||||
case PROJECTOR_TYPE_GLM4V:
|
||||
case PROJECTOR_TYPE_PADDLEOCR:
|
||||
{
|
||||
GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
|
||||
clip_image_u8 resized;
|
||||
@@ -3333,6 +3344,7 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
|
||||
case PROJECTOR_TYPE_QWEN25VL:
|
||||
case PROJECTOR_TYPE_QWEN3VL:
|
||||
case PROJECTOR_TYPE_GLM4V:
|
||||
case PROJECTOR_TYPE_PADDLEOCR:
|
||||
case PROJECTOR_TYPE_YOUTUVL:
|
||||
return (img->nx / params.patch_size) / 2;
|
||||
default:
|
||||
@@ -3349,6 +3361,7 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
|
||||
case PROJECTOR_TYPE_QWEN25VL:
|
||||
case PROJECTOR_TYPE_QWEN3VL:
|
||||
case PROJECTOR_TYPE_GLM4V:
|
||||
case PROJECTOR_TYPE_PADDLEOCR:
|
||||
case PROJECTOR_TYPE_YOUTUVL:
|
||||
return (img->ny / params.patch_size) / 2;
|
||||
default:
|
||||
@@ -3446,6 +3459,13 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
||||
int y_patch = CLIP_ALIGN(img->ny, out_patch_size) / out_patch_size;
|
||||
n_patches = x_patch * y_patch;
|
||||
} break;
|
||||
case PROJECTOR_TYPE_PADDLEOCR:
|
||||
{
|
||||
// dynamic size
|
||||
int n_merge = ctx->model.hparams.n_merge;
|
||||
int stride = n_merge * n_merge;
|
||||
n_patches = CLIP_ALIGN(n_patches, stride) / stride;
|
||||
} break;
|
||||
case PROJECTOR_TYPE_PIXTRAL:
|
||||
case PROJECTOR_TYPE_LIGHTONOCR:
|
||||
{
|
||||
@@ -3693,6 +3713,30 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
}
|
||||
}
|
||||
|
||||
set_input_i32("positions", positions);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_PADDLEOCR:
|
||||
{
|
||||
const int merge_ratio = hparams.n_merge;
|
||||
const int pw = image_size_width / patch_size;
|
||||
const int ph = image_size_height / patch_size;
|
||||
std::vector<int> positions(n_pos * 4);
|
||||
int ptr = 0;
|
||||
// NOTE: same as Qwen-VL, but x and y are swapped
|
||||
for (int y = 0; y < ph; y += merge_ratio) {
|
||||
for (int dy = 0; dy < 2; dy++) {
|
||||
for (int x = 0; x < pw; x += merge_ratio) {
|
||||
for (int dx = 0; dx < 2; dx++) {
|
||||
positions[ ptr] = y + dy;
|
||||
positions[ num_patches + ptr] = x + dx;
|
||||
positions[2 * num_patches + ptr] = y + dy;
|
||||
positions[3 * num_patches + ptr] = x + dx;
|
||||
ptr++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
set_input_i32("positions", positions);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_QWEN25VL:
|
||||
@@ -4006,6 +4050,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
||||
return ctx->model.mm_2_w->ne[1];
|
||||
case PROJECTOR_TYPE_LFM2:
|
||||
case PROJECTOR_TYPE_KIMIVL:
|
||||
case PROJECTOR_TYPE_PADDLEOCR:
|
||||
case PROJECTOR_TYPE_KIMIK25:
|
||||
return ctx->model.mm_2_w->ne[1];
|
||||
case PROJECTOR_TYPE_COGVLM:
|
||||
|
||||
@@ -57,6 +57,11 @@ struct clip_graph_kimivl : clip_graph {
|
||||
ggml_cgraph * build() override;
|
||||
};
|
||||
|
||||
struct clip_graph_paddleocr : clip_graph {
|
||||
clip_graph_paddleocr(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
};
|
||||
|
||||
struct clip_graph_cogvlm : clip_graph {
|
||||
clip_graph_cogvlm(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
|
||||
52
tools/mtmd/models/paddleocr.cpp
Normal file
52
tools/mtmd/models/paddleocr.cpp
Normal file
@@ -0,0 +1,52 @@
|
||||
#include "models.h"
|
||||
|
||||
ggml_cgraph * clip_graph_paddleocr::build() {
|
||||
const int n_pos = n_patches;
|
||||
const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position
|
||||
|
||||
int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
|
||||
|
||||
ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
|
||||
ggml_set_name(positions, "positions");
|
||||
ggml_set_input(positions);
|
||||
|
||||
auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
|
||||
return ggml_rope_multi(
|
||||
ctx0, cur, positions, nullptr,
|
||||
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION,
|
||||
32768, 10000, 1, 0, 1, 32, 1);
|
||||
};
|
||||
|
||||
ggml_tensor * learned_pos_embd = resize_position_embeddings();
|
||||
ggml_tensor * inp = build_inp();
|
||||
ggml_tensor * cur = build_vit(
|
||||
inp, n_patches,
|
||||
NORM_TYPE_NORMAL,
|
||||
hparams.ffn_op,
|
||||
learned_pos_embd,
|
||||
add_pos);
|
||||
|
||||
cb(cur, "vit_out", -1);
|
||||
|
||||
{
|
||||
// mlp_AR paddleocr projector
|
||||
float proj_norm_eps = 1e-5;
|
||||
cur = build_norm(cur,
|
||||
model.mm_input_norm_w, model.mm_input_norm_b,
|
||||
NORM_TYPE_NORMAL, proj_norm_eps, -1);
|
||||
|
||||
const int scale_factor = model.hparams.n_merge;
|
||||
cur = build_patch_merge_permute(cur, scale_factor);
|
||||
cur = build_ffn(cur,
|
||||
model.mm_1_w, model.mm_1_b,
|
||||
nullptr, nullptr,
|
||||
model.mm_2_w, model.mm_2_b,
|
||||
hparams.ffn_op, -1);
|
||||
cb(cur, "mlp_out", -1);
|
||||
}
|
||||
|
||||
// build the graph
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
return gf;
|
||||
}
|
||||
@@ -175,7 +175,7 @@ struct mtmd_context {
|
||||
|
||||
clip_context_params ctx_clip_params {
|
||||
/* use_gpu */ ctx_params.use_gpu,
|
||||
/* flash_attn_type */ CLIP_FLASH_ATTN_TYPE_AUTO,
|
||||
/* flash_attn_type */ mtmd_get_clip_flash_attn_type(ctx_params.flash_attn_type),
|
||||
/* image_min_tokens */ ctx_params.image_min_tokens,
|
||||
/* image_max_tokens */ ctx_params.image_max_tokens,
|
||||
/* warmup */ ctx_params.warmup,
|
||||
@@ -325,6 +325,10 @@ struct mtmd_context {
|
||||
img_beg = "<|begin_of_image|>";
|
||||
img_end = "<|end_of_image|>";
|
||||
|
||||
} else if (proj == PROJECTOR_TYPE_PADDLEOCR) {
|
||||
// <|IMAGE_START|> ... (image embeddings) ... <|IMAGE_END|>
|
||||
img_beg = "<|IMAGE_START|>";
|
||||
img_end = "<|IMAGE_END|>";
|
||||
}
|
||||
}
|
||||
|
||||
@@ -890,6 +894,7 @@ bool mtmd_decode_use_mrope(mtmd_context * ctx) {
|
||||
case PROJECTOR_TYPE_QWEN25VL:
|
||||
case PROJECTOR_TYPE_QWEN3VL:
|
||||
case PROJECTOR_TYPE_GLM4V:
|
||||
case PROJECTOR_TYPE_PADDLEOCR:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
|
||||
@@ -28,6 +28,14 @@ if [ "${1:-}" = "huge" ]; then
|
||||
echo "Include BIG and HUGE models..."
|
||||
fi
|
||||
|
||||
# Check if the second argument is "flash", then enable flash attention
|
||||
# This is useful to test if flash attention off works correctly
|
||||
FLASH_ATTN="on"
|
||||
if [ "${2:-}" = "flash_off" ] || [ "${1:-}" = "flash_off" ]; then
|
||||
FLASH_ATTN="off"
|
||||
echo "Flash attention disabled..."
|
||||
fi
|
||||
|
||||
###############
|
||||
|
||||
arr_prefix=()
|
||||
@@ -143,6 +151,7 @@ for i in "${!arr_hf[@]}"; do
|
||||
-hf $(printf %q "$hf") \
|
||||
--image $(printf %q "$SCRIPT_DIR/$inp_file") \
|
||||
--temp 0 -n 128 \
|
||||
--flash-attn $(printf %q "$FLASH_ATTN") \
|
||||
${extra_args}"
|
||||
|
||||
# if extra_args does not contain -p, we add a default prompt
|
||||
|
||||
@@ -120,7 +120,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
|
||||
static void usage(const char * executable) {
|
||||
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights]\n", executable);
|
||||
printf(" [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--tensor-type-file]\n");
|
||||
printf(" [--prune-layers] [--keep-split] [--override-kv]\n");
|
||||
printf(" [--prune-layers] [--keep-split] [--override-kv] [--dry-run]\n");
|
||||
printf(" model-f32.gguf [model-quant.gguf] type [nthreads]\n\n");
|
||||
printf(" --allow-requantize\n");
|
||||
printf(" allow requantizing tensors that have already been quantized\n");
|
||||
@@ -156,7 +156,10 @@ static void usage(const char * executable) {
|
||||
printf(" generate quantized model in the same shards as input\n");
|
||||
printf(" --override-kv KEY=TYPE:VALUE\n");
|
||||
printf(" override model metadata by key in the quantized model. may be specified multiple times.\n");
|
||||
printf(" WARNING: this is an advanced option, use with care.\n\n");
|
||||
printf(" WARNING: this is an advanced option, use with care.\n");
|
||||
printf(" --dry-run\n");
|
||||
printf(" calculate and show the final quantization size without performing quantization\n");
|
||||
printf(" example: llama-quantize --dry-run model-f32.gguf Q4_K\n\n");
|
||||
printf("note: --include-weights and --exclude-weights cannot be used together\n\n");
|
||||
printf("-----------------------------------------------------------------------------\n");
|
||||
printf(" allowed quantization types\n");
|
||||
@@ -532,6 +535,8 @@ int main(int argc, char ** argv) {
|
||||
if (arg_idx == argc-1 || !string_parse_kv_override(argv[++arg_idx], kv_overrides)) {
|
||||
usage(argv[0]);
|
||||
}
|
||||
} else if (strcmp(argv[arg_idx], "--dry-run") == 0) {
|
||||
params.dry_run = true;
|
||||
} else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
|
||||
params.allow_requantize = true;
|
||||
} else if (strcmp(argv[arg_idx], "--pure") == 0) {
|
||||
@@ -630,22 +635,26 @@ int main(int argc, char ** argv) {
|
||||
std::string ftype_str;
|
||||
std::string suffix = ".gguf";
|
||||
if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
|
||||
std::string fpath;
|
||||
const size_t pos = fname_inp.find_last_of("/\\");
|
||||
if (pos != std::string::npos) {
|
||||
fpath = fname_inp.substr(0, pos + 1);
|
||||
}
|
||||
// argv[arg_idx] is the ftype directly: <input> <ftype>
|
||||
if (!params.dry_run) {
|
||||
std::string fpath;
|
||||
const size_t pos = fname_inp.find_last_of("/\\");
|
||||
if (pos != std::string::npos) {
|
||||
fpath = fname_inp.substr(0, pos + 1);
|
||||
}
|
||||
|
||||
// export as [inp path]/ggml-model-[ftype]. Only add extension if there is no splitting
|
||||
fname_out = fpath + "ggml-model-" + ftype_str;
|
||||
if (!params.keep_split) {
|
||||
fname_out += suffix;
|
||||
// export as [inp path]/ggml-model-[ftype]. Only add extension if there is no splitting
|
||||
fname_out = fpath + "ggml-model-" + ftype_str;
|
||||
if (!params.keep_split) {
|
||||
fname_out += suffix;
|
||||
}
|
||||
}
|
||||
arg_idx++;
|
||||
if (ftype_str == "COPY") {
|
||||
params.only_copy = true;
|
||||
}
|
||||
} else {
|
||||
// argv[arg_idx] is not a valid ftype, so treat it as output path: <input> <output> <ftype>
|
||||
fname_out = argv[arg_idx];
|
||||
if (params.keep_split && fname_out.find(suffix) != std::string::npos) {
|
||||
fname_out = fname_out.substr(0, fname_out.length() - suffix.length());
|
||||
@@ -677,25 +686,33 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
}
|
||||
|
||||
if ((params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
|
||||
params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_S ||
|
||||
params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S ||
|
||||
params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
|
||||
params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) && imatrix_data.empty()) {
|
||||
if (!params.dry_run &&
|
||||
(
|
||||
params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
|
||||
params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S ||
|
||||
params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M
|
||||
) && imatrix_data.empty()) {
|
||||
fprintf(stderr, "\n==========================================================================================================\n");
|
||||
fprintf(stderr, "Please do not use IQ1_S, IQ1_M, IQ2_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
|
||||
fprintf(stderr, "==========================================================================================================\n\n\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (std::error_code ec; std::filesystem::equivalent(fname_inp, fname_out, ec)) {
|
||||
fprintf(stderr, "%s: error: input and output files are the same: '%s'\n", __func__, fname_inp.c_str());
|
||||
return 1;
|
||||
if (!params.dry_run) {
|
||||
if (std::error_code ec; std::filesystem::equivalent(fname_inp, fname_out, ec)) {
|
||||
fprintf(stderr, "%s: error: input and output files are the same: '%s'\n", __func__, fname_inp.c_str());
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
print_build_info();
|
||||
|
||||
fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str());
|
||||
if (params.dry_run) {
|
||||
fprintf(stderr, "%s: calculating quantization size for '%s' as %s", __func__, fname_inp.c_str(), ftype_str.c_str());
|
||||
} else {
|
||||
fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str());
|
||||
}
|
||||
|
||||
if (params.nthread > 0) {
|
||||
fprintf(stderr, " using %d threads", params.nthread);
|
||||
}
|
||||
|
||||
Binary file not shown.
@@ -916,8 +916,7 @@ json oaicompat_chat_params_parse(
|
||||
json image_url = json_value(p, "image_url", json::object());
|
||||
handle_media(out_files, image_url, opt.media_path);
|
||||
|
||||
// replace this chunk with a marker
|
||||
p["type"] = "text";
|
||||
p["type"] = "media_marker";
|
||||
p["text"] = mtmd_default_marker();
|
||||
p.erase("image_url");
|
||||
|
||||
@@ -938,8 +937,7 @@ json oaicompat_chat_params_parse(
|
||||
|
||||
// TODO: add audio_url support by reusing handle_media()
|
||||
|
||||
// replace this chunk with a marker
|
||||
p["type"] = "text";
|
||||
p["type"] = "media_marker";
|
||||
p["text"] = mtmd_default_marker();
|
||||
p.erase("input_audio");
|
||||
|
||||
|
||||
@@ -42,7 +42,13 @@
|
||||
useGlobalSelection = false
|
||||
}: Props = $props();
|
||||
|
||||
let options = $derived(modelOptions());
|
||||
let options = $derived(
|
||||
modelOptions().filter((option) => {
|
||||
const modelProps = modelsStore.getModelProps(option.model);
|
||||
|
||||
return modelProps?.webui !== false;
|
||||
})
|
||||
);
|
||||
let loading = $derived(modelsLoading());
|
||||
let updating = $derived(modelsUpdating());
|
||||
let activeId = $derived(selectedModelId());
|
||||
@@ -245,6 +251,9 @@
|
||||
return options.find((option) => option.id === activeId);
|
||||
}
|
||||
|
||||
if (options.length === 1) {
|
||||
return options[0];
|
||||
}
|
||||
// No selection - return undefined to show "Select model"
|
||||
return undefined;
|
||||
}
|
||||
|
||||
@@ -498,7 +498,8 @@ class ChatStore {
|
||||
MessageRole.USER,
|
||||
content,
|
||||
MessageType.TEXT,
|
||||
parentIdForUserMessage ?? '-1'
|
||||
parentIdForUserMessage ?? '-1',
|
||||
extras
|
||||
);
|
||||
if (isNewConversation && content)
|
||||
await conversationsStore.updateConversationName(currentConv.id, content.trim());
|
||||
|
||||
Reference in New Issue
Block a user