Compare commits


1 Commit

Author: Ruben Ortlam · SHA1: d5344395d0 · Message: benchmark · Date: 2026-04-08 18:26:50 +02:00
211 changed files with 12462 additions and 5573 deletions

.github/labeler.yml vendored

@@ -73,18 +73,10 @@ android:
- changed-files:
- any-glob-to-any-file:
- examples/llama.android/**
server/webui:
- changed-files:
- any-glob-to-any-file:
- tools/server/webui/**
- tools/server/public/**
server:
- changed-files:
- any-glob-to-any-file:
- tools/server/**
ggml:
- changed-files:
- any-glob-to-any-file:


@@ -2348,21 +2348,19 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_env("LLAMA_ARG_N_GPU_LAYERS"));
add_opt(common_arg(
{"-sm", "--split-mode"}, "{none,layer,row,tensor}",
{"-sm", "--split-mode"}, "{none,layer,row}",
"how to split the model across multiple GPUs, one of:\n"
"- none: use one GPU only\n"
"- layer (default): split layers and KV across GPUs (pipelined)\n"
"- row: split weight across GPUs by rows (parallelized)\n"
"- tensor: split weights and KV across GPUs (parallelized)",
"- layer (default): split layers and KV across GPUs\n"
"- row: split rows across GPUs",
[](common_params & params, const std::string & value) {
if (value == "none") {
std::string arg_next = value;
if (arg_next == "none") {
params.split_mode = LLAMA_SPLIT_MODE_NONE;
} else if (value == "layer") {
} else if (arg_next == "layer") {
params.split_mode = LLAMA_SPLIT_MODE_LAYER;
} else if (value == "row") {
} else if (arg_next == "row") {
params.split_mode = LLAMA_SPLIT_MODE_ROW;
} else if (value == "tensor") {
params.split_mode = LLAMA_SPLIT_MODE_TENSOR;
} else {
throw std::invalid_argument("invalid value");
}

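For reference, a minimal sketch of the string-to-enum mapping the handler above performs once `tensor` is gone (the enum and function names below are placeholders, not llama.cpp API):

```cpp
#include <stdexcept>
#include <string>

enum split_mode_sketch { SPLIT_NONE, SPLIT_LAYER, SPLIT_ROW };

// unknown values (including the dropped "tensor") throw, as in the handler above
static split_mode_sketch parse_split_mode(const std::string & value) {
    if (value == "none")  { return SPLIT_NONE;  }  // use one GPU only
    if (value == "layer") { return SPLIT_LAYER; }  // default: split layers and KV across GPUs
    if (value == "row")   { return SPLIT_ROW;   }  // split rows across GPUs
    throw std::invalid_argument("invalid value");
}
```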

@@ -332,36 +332,58 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
const auto & inputs = ctx.inputs;
bool force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
auto until_suffix = p.rule("until-suffix", p.until(arguments.value_suffix));
common_peg_parser tool_choice = p.choice();
foreach_function(inputs.tools, [&](const json & tool) {
const auto & func = tool.at("function");
std::string name = func.at("name");
auto params = func.contains("parameters") ? func.at("parameters") : json::object();
const auto & params = func.contains("parameters") ? func.at("parameters") : json::object();
const auto & properties = params.contains("properties") ? params.at("properties") : json::object();
std::set<std::string> required;
if (params.contains("required")) {
params.at("required").get_to(required);
}
auto schema_info = common_schema_info();
schema_info.resolve_refs(params);
// Build parser for each argument, separating required and optional
std::vector<common_peg_parser> required_parsers;
std::vector<common_peg_parser> optional_parsers;
for (const auto & [param_name, param_schema] : properties.items()) {
bool is_required = required.find(param_name) != required.end();
bool is_required = required.find(param_name) != required.end();
std::string type = "object";
if (param_schema.contains("type")) {
const auto & type_obj = param_schema.at("type");
if (type_obj.is_string()) {
type_obj.get_to(type);
} else if (type_obj.is_array()) {
// Handle nullable types like ["string", "null"]
for (const auto & t : type_obj) {
if (t.is_string() && t.get<std::string>() != "null") {
type = t.get<std::string>();
break;
}
}
} else if (type_obj.is_object()) {
if (type_obj.contains("type") && type_obj.at("type").is_string()) {
type_obj.at("type").get_to(type);
}
}
}
// Infer string type from enum values when type is unspecified
if (type == "object" && param_schema.contains("enum")) {
const auto & enum_vals = param_schema.at("enum");
if (enum_vals.is_array()) {
for (const auto & v : enum_vals) {
if (v.is_string()) {
type = "string";
break;
}
}
}
}
auto arg =
p.tool_arg(p.tool_arg_open(arguments.name_prefix + p.tool_arg_name(p.literal(param_name)) +
arguments.name_suffix) +
arguments.value_prefix +
(schema_info.resolves_to_string(param_schema) ?
p.tool_arg_string_value(p.schema(until_suffix,
(type == "string" ?
p.tool_arg_string_value(p.schema(p.until(arguments.value_suffix),
"tool-" + name + "-arg-" + param_name + "-schema",
param_schema, true)) :
p.tool_arg_json_value(p.schema(
@@ -392,7 +414,7 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
for (const auto & opt : optional_parsers) {
any_opt |= opt;
}
args_seq = args_seq + p.repeat(p.space() + any_opt, 0, -1);
args_seq = args_seq + p.repeat(p.space() + any_opt, 0, (int) optional_parsers.size());
}
if (!arguments.start.empty()) {

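A self-contained sketch of the JSON-schema type-inference rules in this hunk (the helper name is hypothetical; llama.cpp aliases nlohmann::json as `json`): nullable unions like `["string", "null"]` collapse to their non-null member, and an untyped property with a string enum is treated as a string:

```cpp
#include <nlohmann/json.hpp>
#include <string>

using json = nlohmann::json;

static std::string infer_param_type(const json & param_schema) {
    std::string type = "object";
    if (param_schema.contains("type")) {
        const auto & t = param_schema.at("type");
        if (t.is_string()) {
            t.get_to(type);
        } else if (t.is_array()) {
            // nullable unions like ["string", "null"] resolve to the non-null member
            for (const auto & v : t) {
                if (v.is_string() && v.get<std::string>() != "null") {
                    type = v.get<std::string>();
                    break;
                }
            }
        }
    }
    if (type == "object" && param_schema.contains("enum") && param_schema.at("enum").is_array()) {
        // a string enum with no explicit type is treated as a string
        for (const auto & v : param_schema.at("enum")) {
            if (v.is_string()) { type = "string"; break; }
        }
    }
    return type;
}
```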

@@ -1124,7 +1124,7 @@ static common_chat_params common_chat_params_init_gemma4(const common_chat_templ
p.rule("gemma4-bool", p.json_bool());
p.rule("gemma4-null", p.json_null());
p.rule("gemma4-number", p.json_number());
p.rule("gemma4-dict-key", p.rule("gemma4-dict-key-name", p.chars("[^:}]", 1, -1)) + p.literal(":"));
p.rule("gemma4-dict-key", p.rule("gemma4-dict-key-name", p.until(":")) + p.literal(":"));
p.rule("gemma4-dict-kv", p.ref("gemma4-dict-key") + p.space() + p.ref("gemma4-value"));
p.rule("gemma4-dict", [&]() {
auto ws = p.space();
@@ -1963,7 +1963,7 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
params.add_generation_prompt = true;
std::string gen_prompt = common_chat_template_direct_apply_impl(tmpl, params);
auto diff = calculate_diff_split(no_gen_prompt, gen_prompt);
params.generation_prompt = diff.right + diff.suffix;
params.generation_prompt = diff.right;
params.add_generation_prompt = inputs.add_generation_prompt;

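The second hunk changes the generation prompt from `diff.right + diff.suffix` to `diff.right`. For context, a rough sketch of the idea behind a diff-based split (illustrative only, not the actual `calculate_diff_split`): render the template with and without the generation prompt and keep what only the second rendering adds:

```cpp
#include <string>

static std::string generation_prompt_suffix(const std::string & without_gen,
                                            const std::string & with_gen) {
    // longest common prefix of the two renderings
    size_t i = 0;
    while (i < without_gen.size() && i < with_gen.size() && without_gen[i] == with_gen[i]) {
        ++i;
    }
    return with_gen.substr(i);  // everything that only the generation prompt added
}
```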

@@ -591,10 +591,6 @@ static hf_cache::hf_file find_best_model(const hf_cache::hf_files & files,
for (const auto & f : files) {
if (gguf_filename_is_model(f.path) &&
std::regex_search(f.path, pattern)) {
auto split = get_gguf_split_info(f.path);
if (split.count > 1 && split.index != 1) {
continue;
}
return f;
}
}
@@ -604,10 +600,6 @@ static hf_cache::hf_file find_best_model(const hf_cache::hf_files & files,
if (tag.empty()) {
for (const auto & f : files) {
if (gguf_filename_is_model(f.path)) {
auto split = get_gguf_split_info(f.path);
if (split.count > 1 && split.index != 1) {
continue;
}
return f;
}
}
@@ -626,7 +618,6 @@ static void list_available_gguf_files(const hf_cache::hf_files & files) {
}
struct hf_plan {
hf_cache::hf_file primary;
hf_cache::hf_files model_files;
hf_cache::hf_file mmproj;
};
@@ -672,7 +663,6 @@ static hf_plan get_hf_plan(const common_params_model & model,
}
}
plan.primary = primary;
plan.model_files = get_split_files(all, primary);
if (opts.download_mmproj) {
@@ -759,7 +749,7 @@ common_download_model_result common_download_model(const common_params_model
for (const auto & f : hf.model_files) {
hf_cache::finalize_file(f);
}
result.model_path = hf.primary.final_path;
result.model_path = hf.model_files[0].final_path;
if (!hf.mmproj.path.empty()) {
result.mmproj_path = hf_cache::finalize_file(hf.mmproj);

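Multi-part GGUF files conventionally carry a `-%05d-of-%05d` shard suffix (e.g. `model-00001-of-00003.gguf`); the checks removed above skipped every shard but the first, a job that now falls to `model_files`. A sketch of parsing that suffix (assumed to be roughly what `get_gguf_split_info` does; the helper below is illustrative):

```cpp
#include <cstdio>
#include <string>

struct split_info { int index = 0; int count = 0; };

// illustrative parser for the "-%05d-of-%05d" shard suffix:
// "model-00001-of-00003.gguf" -> {1, 3}; anything else -> {0, 0}
static split_info parse_split_suffix(const std::string & path) {
    split_info info;
    const size_t pos = path.rfind("-of-");  // the index digits sit right before this marker
    if (pos != std::string::npos && pos >= 5) {
        if (std::sscanf(path.c_str() + pos - 5, "%5d-of-%5d", &info.index, &info.count) != 2) {
            info = split_info{};  // suffix present but malformed
        }
    }
    return info;
}
```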

@@ -251,23 +251,6 @@ value binary_expression::execute_impl(context & ctx) {
return res;
}
// Python-style string repetition
// TODO: support array/tuple repetition (e.g., [1, 2] * 3 → [1, 2, 1, 2, 1, 2])
if (op.value == "*" &&
((is_val<value_string>(left_val) && is_val<value_int>(right_val)) ||
(is_val<value_int>(left_val) && is_val<value_string>(right_val)))) {
const auto & str = is_val<value_string>(left_val) ? left_val->as_string() : right_val->as_string();
const int64_t repeat = is_val<value_int>(right_val) ? right_val->as_int() : left_val->as_int();
auto res = mk_val<value_string>();
if (repeat <= 0) {
return res;
}
for (int64_t i = 0; i < repeat; ++i) {
res->val_str = res->val_str.append(str);
}
return res;
}
// String membership
if (is_val<value_string>(left_val) && is_val<value_string>(right_val)) {
// case: "a" in "abc"

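This hunk covers the Python-style string repetition branch (`"ab" * 3` → `"ababab"`, with either operand order, and non-positive counts yielding an empty string). A standalone sketch of those semantics:

```cpp
#include <cassert>
#include <cstdint>
#include <string>

static std::string repeat_string(const std::string & s, int64_t n) {
    std::string out;
    for (int64_t i = 0; i < n; ++i) {  // n <= 0 falls through to an empty result
        out += s;
    }
    return out;
}

int main() {
    assert(repeat_string("ab", 3) == "ababab");
    assert(repeat_string("ab", 0).empty());
    assert(repeat_string("ab", -1).empty());
}
```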

@@ -1,5 +1,4 @@
#include "runtime.h"
#include "unicode.h"
#include "value.h"
// for converting from JSON to jinja values
@@ -155,83 +154,6 @@ static value test_compare_fn(const func_args & args) {
return mk_val<value_bool>(value_compare(args.get_pos(0), args.get_pos(1), op));
}
static void append_codepoint_as_ascii_json_escape(std::string & out, uint32_t codepoint) {
auto append_u16 = [&out](uint32_t value) {
char buf[8];
snprintf(buf, sizeof(buf), "\\u%04x", static_cast<unsigned int>(value));
out += buf;
};
if (codepoint <= 0xFFFF) {
append_u16(codepoint);
return;
}
codepoint -= 0x10000;
append_u16(0xD800 + ((codepoint >> 10) & 0x3FF));
append_u16(0xDC00 + (codepoint & 0x3FF));
}
static std::string json_ensure_ascii_preserving_format(const std::string & json_str) {
std::string output;
output.reserve(json_str.size());
bool in_string = false;
bool escaped = false;
for (size_t pos = 0; pos < json_str.size();) {
const char ch = json_str[pos];
if (!in_string) {
output.push_back(ch);
if (ch == '"') {
in_string = true;
}
++pos;
continue;
}
if (escaped) {
output.push_back(ch);
escaped = false;
++pos;
continue;
}
if (ch == '\\') {
output.push_back(ch);
escaped = true;
++pos;
continue;
}
if (ch == '"') {
output.push_back(ch);
in_string = false;
++pos;
continue;
}
const unsigned char uch = static_cast<unsigned char>(ch);
if (uch < 0x80) {
output.push_back(ch);
++pos;
continue;
}
auto parsed = common_parse_utf8_codepoint(json_str, pos);
if (parsed.status != utf8_parse_result::SUCCESS) {
output += "\\ufffd";
++pos;
continue;
}
append_codepoint_as_ascii_json_escape(output, parsed.codepoint);
pos += parsed.bytes_consumed;
}
return output;
}
static value tojson(const func_args & args) {
args.ensure_count(1, 5);
value val_ascii = args.get_kwarg_or_pos("ensure_ascii", 1);
@@ -247,17 +169,16 @@ static value tojson(const func_args & args) {
if (is_val<value_int>(val_indent)) {
indent = static_cast<int>(val_indent->as_int());
}
if (val_ascii->as_bool()) { // undefined == false
throw not_implemented_exception("tojson ensure_ascii=true not implemented");
}
if (val_sort->as_bool()) { // undefined == false
throw not_implemented_exception("tojson sort_keys=true not implemented");
}
const bool ensure_ascii = val_ascii->as_bool(); // undefined == false
auto separators = (is_val<value_array>(val_separators) ? val_separators : mk_val<value_array>())->as_array();
std::string item_sep = separators.size() > 0 ? separators[0]->as_string().str() : (indent < 0 ? ", " : ",");
std::string key_sep = separators.size() > 1 ? separators[1]->as_string().str() : ": ";
std::string json_str = value_to_json(args.get_pos(0), indent, item_sep, key_sep);
if (ensure_ascii) {
json_str = json_ensure_ascii_preserving_format(json_str);
}
return mk_val<value_string>(json_str);
}
@@ -539,10 +460,6 @@ const func_builtins & value_int_t::get_builtins() const {
int64_t val = args.get_pos(0)->as_int();
return mk_val<value_int>(val < 0 ? -val : val);
}},
{"int", [](const func_args & args) -> value {
args.ensure_vals<value_int>();
return mk_val<value_int>(args.get_pos(0)->as_int());
}},
{"float", [](const func_args & args) -> value {
args.ensure_vals<value_int>();
double val = static_cast<double>(args.get_pos(0)->as_int());
@@ -569,10 +486,6 @@ const func_builtins & value_float_t::get_builtins() const {
int64_t val = static_cast<int64_t>(args.get_pos(0)->as_float());
return mk_val<value_int>(val);
}},
{"float", [](const func_args & args) -> value {
args.ensure_vals<value_float>();
return mk_val<value_float>(args.get_pos(0)->as_float());
}},
{"safe", tojson},
{"string", tojson},
{"tojson", tojson},

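The `append_codepoint_as_ascii_json_escape` helper in this hunk encodes codepoints above U+FFFF as UTF-16 surrogate pairs. A self-contained check of that arithmetic, using the same formulas as above:

```cpp
#include <cassert>
#include <cstdint>

int main() {
    uint32_t cp = 0x1D11E;                     // U+1D11E MUSICAL SYMBOL G CLEF
    cp -= 0x10000;
    const uint32_t hi = 0xD800 + ((cp >> 10) & 0x3FF);
    const uint32_t lo = 0xDC00 + (cp & 0x3FF);
    assert(hi == 0xD834 && lo == 0xDD1E);      // escapes as \ud834\udd1e
}
```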

@@ -1229,15 +1229,15 @@ class TextModel(ModelBase):
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab)) # ty: ignore[unresolved-attribute]
assert max(tokenizer.vocab.values()) < vocab_size # ty: ignore[unresolved-attribute]
vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
assert max(tokenizer.vocab.values()) < vocab_size
tokpre = self.get_vocab_base_pre(tokenizer)
reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} # ty: ignore[unresolved-attribute]
added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute]
reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
added_vocab = tokenizer.get_added_vocab()
added_tokens_decoder = tokenizer.added_tokens_decoder # ty: ignore[unresolved-attribute]
added_tokens_decoder = tokenizer.added_tokens_decoder
for i in range(vocab_size):
if i not in reverse_vocab:
@@ -1250,7 +1250,7 @@ class TextModel(ModelBase):
# To avoid unexpected issues - we make sure to normalize non-normalized tokens
if not added_tokens_decoder[i].normalized:
previous_token = token
token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) # ty: ignore[unresolved-attribute, invalid-assignment]
token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
if previous_token != token:
logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
@@ -1583,13 +1583,13 @@ class TextModel(ModelBase):
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
vocab_size = hparams["vocab_size"]
assert max(tokenizer.get_vocab().values()) < vocab_size # ty: ignore[unresolved-attribute]
assert max(tokenizer.get_vocab().values()) < vocab_size
tokpre = self.get_vocab_base_pre(tokenizer)
merges = []
vocab = {}
mergeable_ranks = tokenizer.mergeable_ranks # ty: ignore[unresolved-attribute]
mergeable_ranks = tokenizer.mergeable_ranks
for token, rank in mergeable_ranks.items():
vocab[QwenModel.token_bytes_to_string(token)] = rank
if len(token) == 1:
@@ -1599,7 +1599,7 @@ class TextModel(ModelBase):
merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
# for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
added_vocab = tokenizer.special_tokens # ty: ignore[unresolved-attribute]
added_vocab = tokenizer.special_tokens
reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()}
for i in range(vocab_size):
@@ -1622,10 +1622,10 @@ class TextModel(ModelBase):
special_vocab.merges = merges
# only add special tokens when they were not already loaded from config.json
if len(special_vocab.special_token_ids) == 0:
special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"])
special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"])
# this one is usually not in config.json anyway
special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
special_vocab.add_to_gguf(self.gguf_writer)
def _set_vocab_sentencepiece(self, add_to_gguf=True):
@@ -1877,10 +1877,10 @@ class TextModel(ModelBase):
self.gguf_writer.add_tokenizer_pre(tokpre)
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)
special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # ty: ignore[unresolved-attribute]
special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"])
special_vocab.add_to_gguf(self.gguf_writer)
def _set_vocab_glm(self):
@@ -1894,10 +1894,10 @@ class TextModel(ModelBase):
self.gguf_writer.add_token_types(toktypes)
# Special tokens
# Note: Using <|endoftext|> (151329) for eot causes endless generation
special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["[gMASK]"]) # ty: ignore[unresolved-attribute] # 151331
special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # ty: ignore[unresolved-attribute] # 151336
special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute] # 151329
special_vocab._set_special_token("eom", tokenizer.get_added_vocab()["<|observation|>"]) # ty: ignore[unresolved-attribute] # 151338
special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["[gMASK]"]) # 151331
special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # 151336
special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # 151329
special_vocab._set_special_token("eom", tokenizer.get_added_vocab()["<|observation|>"]) # 151338
special_vocab.add_to_gguf(self.gguf_writer)
def _set_vocab_interns1(self):
@@ -1906,16 +1906,16 @@ class TextModel(ModelBase):
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab()) # ty: ignore[unresolved-attribute]
vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab())
vocab_size = self.hparams.get("vocab_size", len(vocab))
assert max(vocab.values()) < vocab_size
tokpre = self.get_vocab_base_pre(tokenizer)
reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()}
added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute]
added_vocab = tokenizer.get_added_vocab()
added_tokens_decoder = tokenizer.added_tokens_decoder # ty: ignore[unresolved-attribute]
added_tokens_decoder = tokenizer.added_tokens_decoder
for i in range(vocab_size):
if i not in reverse_vocab:
@@ -1928,7 +1928,7 @@ class TextModel(ModelBase):
# To avoid unexpected issues - we make sure to normalize non-normalized tokens
if not added_tokens_decoder[i].normalized:
previous_token = token
token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) # ty: ignore[unresolved-attribute, invalid-assignment]
token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
if previous_token != token:
logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
@@ -2219,10 +2219,10 @@ class MmprojModel(ModelBase):
self.image_size = self.find_vparam(["image_size"])
self.gguf_writer.add_vision_image_size(self.image_size)
self.gguf_writer.add_vision_patch_size(self.find_vparam(["patch_size"]))
self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size", "width", "vt_hidden_size"]))
self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size", "vt_hidden_size"]))
self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size", "vt_intermediate_size"]))
self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys))
self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads", "num_heads", "heads", "vt_num_attention_heads"]))
self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads", "num_heads", "vt_num_attention_heads"]))
# preprocessor config
image_mean = _MISTRAL_COMMON_DATASET_MEAN if self.is_mistral_format else self.preprocessor_config["image_mean"]
@@ -2516,15 +2516,15 @@ class XverseModel(TextModel):
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(dir_model)
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) # ty: ignore[unresolved-attribute]
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
# Since we are checking the maximum index, we need to ensure it's strictly less than vocab_size,
# because vocab_size is the count of items, and indexes start at 0.
max_vocab_index = max(tokenizer.get_vocab().values()) # ty: ignore[unresolved-attribute]
max_vocab_index = max(tokenizer.get_vocab().values())
if max_vocab_index >= vocab_size:
raise ValueError("Vocabulary size exceeds expected maximum size.")
reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} # ty: ignore[unresolved-attribute]
added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute]
reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
added_vocab = tokenizer.get_added_vocab()
for token_id in range(vocab_size):
token_text = reverse_vocab[token_id].encode('utf-8')
@@ -2535,7 +2535,7 @@ class XverseModel(TextModel):
elif re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
toktype = gguf.TokenType.BYTE # special
elif reverse_vocab[token_id] in added_vocab:
if tokenizer.added_tokens_decoder[token_id].special: # ty: ignore[unresolved-attribute]
if tokenizer.added_tokens_decoder[token_id].special:
toktype = gguf.TokenType.CONTROL
else:
toktype = gguf.TokenType.USER_DEFINED
@@ -3752,7 +3752,7 @@ class QwenModel(TextModel):
@staticmethod
def token_bytes_to_string(b):
from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode # ty: ignore[unresolved-import]
from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
byte_encoder = bytes_to_unicode()
return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
@@ -3777,14 +3777,7 @@ class QwenModel(TextModel):
self._set_vocab_qwen()
@ModelBase.register(
"Qwen2Model",
"Qwen2ForCausalLM",
"Qwen2AudioForConditionalGeneration",
"KORMoForCausalLM",
"AudioFlamingo3ForConditionalGeneration",
"DotsOCRForCausalLM",
)
@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM", "Qwen2AudioForConditionalGeneration", "KORMoForCausalLM", "AudioFlamingo3ForConditionalGeneration")
class Qwen2Model(TextModel):
model_arch = gguf.MODEL_ARCH.QWEN2
@@ -3805,8 +3798,7 @@ class Qwen2Model(TextModel):
name = name.replace("language_model.", "") # for InternVL
if name.startswith("mlp") or name.startswith("multi_modal_projector") \
or name.startswith("vision_model") or name.startswith("audio_tower") \
or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector") \
or name.startswith("vision_tower."):
or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector"):
# skip vision and audio tensors
return
yield from super().modify_tensors(data_torch, name, bid)
@@ -3823,14 +3815,14 @@ class DreamModel(TextModel):
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
vocab_dict = tokenizer.get_vocab() # ty: ignore[unresolved-attribute]
vocab_dict = tokenizer.get_vocab()
vocab_size = self.hparams.get("vocab_size", len(vocab_dict))
assert max(vocab_dict.values()) < vocab_size
tokpre = self.get_vocab_base_pre(tokenizer)
reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab_dict.items()}
added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute]
added_vocab = tokenizer.get_added_vocab()
for i in range(vocab_size):
if i not in reverse_vocab:
@@ -3888,14 +3880,14 @@ class LLaDAModel(TextModel):
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
vocab_dict = tokenizer.get_vocab() # ty: ignore[unresolved-attribute]
vocab_dict = tokenizer.get_vocab()
vocab_size = self.hparams.get("vocab_size", len(vocab_dict))
assert max(vocab_dict.values()) < vocab_size
tokpre = self.get_vocab_base_pre(tokenizer)
reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab_dict.items()}
added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute]
added_vocab = tokenizer.get_added_vocab()
for i in range(vocab_size):
if i not in reverse_vocab:
@@ -4673,9 +4665,9 @@ class Qwen3Model(Qwen2Model):
self.is_rerank = True
self.is_tied_embeddings = self.hparams.get("tie_word_embeddings", False)
self.token_false_id = tokenizer.convert_tokens_to_ids("no") # ty: ignore[unresolved-attribute, invalid-assignment]
self.token_true_id = tokenizer.convert_tokens_to_ids("yes") # ty: ignore[unresolved-attribute, invalid-assignment]
self.sep_token_id = tokenizer.convert_tokens_to_ids("|") # ty: ignore[unresolved-attribute]
self.token_false_id = tokenizer.convert_tokens_to_ids("no")
self.token_true_id = tokenizer.convert_tokens_to_ids("yes")
self.sep_token_id = tokenizer.convert_tokens_to_ids("|")
assert self.token_false_id is not None and self.token_true_id is not None
@@ -4957,73 +4949,6 @@ class Glm4VVisionModel(Qwen3VLVisionModel):
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("StepVLForConditionalGeneration")
class Step3VLVisionModel(MmprojModel):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
assert self.hparams_vision is not None
if not self.hparams_vision.get("intermediate_size"):
hidden_size = self.hparams_vision.get("hidden_size") or self.hparams_vision.get("width") or 0
assert hidden_size > 0
mlp_ratio = float(self.hparams_vision.get("mlp_ratio", 8960 / 1536))
self.hparams_vision["intermediate_size"] = int(round(hidden_size * mlp_ratio))
self.preprocessor_config.setdefault("image_mean", list(_MISTRAL_COMMON_DATASET_MEAN))
self.preprocessor_config.setdefault("image_std", list(_MISTRAL_COMMON_DATASET_STD))
def set_gguf_parameters(self):
super().set_gguf_parameters()
assert self.hparams_vision is not None
projector_stride = int(self.global_config.get("understand_projector_stride", -1))
hidden_size = int(self.hparams_vision.get("hidden_size", self.hparams_vision.get("width", -1)))
num_layers = int(self.hparams_vision.get("num_hidden_layers", self.hparams_vision.get("layers", -1)))
assert (projector_stride, int(self.hparams_vision.get("image_size", -1)), hidden_size, num_layers) == (2, 728, 1536, 47), (
"current Step3-VL conversion path is only validated for Step3-VL-10B"
)
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.STEP3VL)
self.gguf_writer.add_vision_attention_layernorm_eps(float(self.hparams_vision.get("layer_norm_eps", 1e-5)))
self.gguf_writer.add_vision_projector_scale_factor(projector_stride ** 2)
# 3024 max resize comes from step3-vl-10b processing_step3.py.
self.gguf_writer.add_vision_preproc_image_size(3024)
def tensor_force_quant(self, name, new_name, bid, n_dims):
if ".position_embd." in new_name:
return gguf.GGMLQuantizationType.F32
return super().tensor_force_quant(name, new_name, bid, n_dims)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if name.startswith("model.") or name.startswith("lm_head."):
return
if name.startswith("vision_model.vit_downsampler"):
match = re.match(r"vision_model\.vit_downsampler(\d+)\.(weight|bias)", name)
if match is None:
raise ValueError(f"Unexpected Step3-VL projector tensor {name!r}")
proj_id = int(match.group(1)) - 1
suffix = f".{match.group(2)}"
yield (self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, proj_id, suffix=suffix), data_torch)
return
if name == "vit_large_projector.weight":
yield (self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ_FC), data_torch)
return
if name.startswith("vision_model."):
if name == "vision_model.positional_embedding":
name += ".weight"
elif name.endswith(".gamma") and ".ls_" in name:
name = name.removesuffix(".gamma") + ".weight"
name = name.replace("attn.in_proj_weight", "attn.in_proj.weight")
name = name.replace("attn.in_proj_bias", "attn.in_proj.bias")
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("Qwen3VLForConditionalGeneration")
class Qwen3VLTextModel(Qwen3Model):
model_arch = gguf.MODEL_ARCH.QWEN3VL
@@ -5044,16 +4969,6 @@ class Qwen3VLTextModel(Qwen3Model):
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("StepVLForConditionalGeneration")
class Step3VLTextModel(Qwen3Model):
model_arch = gguf.MODEL_ARCH.QWEN3
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if name.startswith("vision_model.") or name.startswith("model.vision_model.") or name.startswith("vit_large_projector."):
return
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("Qwen3VLMoeForConditionalGeneration")
class Qwen3VLMoeTextModel(Qwen3MoeModel):
model_arch = gguf.MODEL_ARCH.QWEN3VLMOE
@@ -5944,7 +5859,7 @@ class KimiLinearModel(TextModel):
# Build merges list using the approach similar to HunYuanMoE
merges = []
vocab = {}
mergeable_ranks = tokenizer.model._mergeable_ranks # ty: ignore[unresolved-attribute]
mergeable_ranks = tokenizer.model._mergeable_ranks
for token, rank in mergeable_ranks.items():
vocab[QwenModel.token_bytes_to_string(token)] = rank
if len(token) == 1:
@@ -5954,7 +5869,7 @@ class KimiLinearModel(TextModel):
merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
# Build token list
vocab_size = self.hparams["vocab_size"]
special_tokens = tokenizer.special_tokens # ty: ignore[unresolved-attribute]
special_tokens = tokenizer.special_tokens
reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
tokens: list[str] = []
toktypes: list[int] = []
@@ -5980,7 +5895,7 @@ class KimiLinearModel(TextModel):
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
special_vocab.add_to_gguf(self.gguf_writer)
# override eos id in config.json with tiktoken eos id
self.gguf_writer.add_eos_token_id(tokenizer.eos_id) # ty: ignore[unresolved-attribute]
self.gguf_writer.add_eos_token_id(tokenizer.eos_id)
else:
raise NotImplementedError(f"Deepseek pre-tokenizer {tokpre!r} is not supported yet!")
@@ -6474,11 +6389,11 @@ class BertModel(TextModel):
with open(tokenizer_config_path, "r", encoding="utf-8") as fp:
tokenizer_config_json = json.load(fp)
add_prefix = tokenizer.add_prefix_space # ty: ignore[unresolved-attribute]
remove_whitespaces = tokenizer.clean_up_tokenization_spaces # ty: ignore[unresolved-attribute]
add_prefix = tokenizer.add_prefix_space
remove_whitespaces = tokenizer.clean_up_tokenization_spaces
precompiled_charsmap = b64decode(tokenizer_json["normalizer"]["precompiled_charsmap"])
vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size) # ty: ignore[unresolved-attribute]
vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size)
else:
sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
@@ -6495,7 +6410,7 @@ class BertModel(TextModel):
tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
scores: list[float] = [-10000.0] * vocab_size
toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size # ty: ignore[invalid-assignment]
toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
if isinstance(tokenizer, SentencePieceProcessor):
for token_id in range(tokenizer.vocab_size()):
@@ -6517,20 +6432,20 @@ class BertModel(TextModel):
scores[token_id] = score
toktypes[token_id] = toktype
else:
added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute]
added_vocab = tokenizer.get_added_vocab()
unk_token = tokenizer_config_json.get("unk_token")
unk_token_id = added_vocab.get(unk_token, tokenizer_json["model"].get("unk_id", 3)) # ty: ignore[no-matching-overload]
unk_token_id = added_vocab.get(unk_token, tokenizer_json["model"].get("unk_id", 3))
for token_id in range(tokenizer.vocab_size): # ty: ignore[unresolved-attribute]
piece = tokenizer._convert_id_to_token(token_id) # ty: ignore[unresolved-attribute]
if (piece := tokenizer._convert_id_to_token(token_id)) is not None: # ty: ignore[unresolved-attribute]
for token_id in range(tokenizer.vocab_size):
piece = tokenizer._convert_id_to_token(token_id)
if (piece := tokenizer._convert_id_to_token(token_id)) is not None:
text = piece.encode("utf-8")
score = tokenizer_json["model"]["vocab"][token_id][1]
toktype = SentencePieceTokenTypes.NORMAL
if token_id == unk_token_id:
toktype = SentencePieceTokenTypes.UNKNOWN
elif token_id in tokenizer.all_special_ids: # ty: ignore[unresolved-attribute]
elif token_id in tokenizer.all_special_ids:
toktype = SentencePieceTokenTypes.CONTROL
elif token_id in added_vocab.values():
toktype = SentencePieceTokenTypes.USER_DEFINED
@@ -8839,7 +8754,7 @@ class DeepseekV2Model(TextModel):
# Build merges list using the approach similar to HunYuanMoE
merges = []
vocab = {}
mergeable_ranks = tokenizer.model._mergeable_ranks # ty: ignore[unresolved-attribute]
mergeable_ranks = tokenizer.model._mergeable_ranks
for token, rank in mergeable_ranks.items():
vocab[QwenModel.token_bytes_to_string(token)] = rank
if len(token) == 1:
@@ -8850,7 +8765,7 @@ class DeepseekV2Model(TextModel):
# Build token list
vocab_size = self.hparams["vocab_size"]
special_tokens = tokenizer.special_tokens # ty: ignore[unresolved-attribute]
special_tokens = tokenizer.special_tokens
reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
tokens: list[str] = []
toktypes: list[int] = []
@@ -9821,10 +9736,10 @@ class Glm4Model(TextModel):
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # ty: ignore[unresolved-attribute]
special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"])
special_vocab.add_to_gguf(self.gguf_writer)
def set_gguf_parameters(self):
@@ -10052,12 +9967,12 @@ class ChatGLMModel(TextModel):
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
vocab_size = hparams.get("padded_vocab_size", len(tokenizer.get_vocab())) # ty: ignore[unresolved-attribute]
assert max(tokenizer.get_vocab().values()) < vocab_size # ty: ignore[unresolved-attribute]
vocab_size = hparams.get("padded_vocab_size", len(tokenizer.get_vocab()))
assert max(tokenizer.get_vocab().values()) < vocab_size
role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"]
special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens
for token_id in range(vocab_size):
piece = tokenizer._convert_id_to_token(token_id) # ty: ignore[unresolved-attribute]
piece = tokenizer._convert_id_to_token(token_id)
if token_id == 0:
piece = "<unk>"
elif token_id == 1:
@@ -10065,17 +9980,17 @@ class ChatGLMModel(TextModel):
elif token_id == 2:
piece = "<eos>"
text = piece.encode("utf-8") # ty: ignore[unresolved-attribute]
text = piece.encode("utf-8")
score = 0.0
# Referencing the tokenizer Python implementation(https://huggingface.co/THUDM/chatglm3-6b/blob/main/tokenization_chatglm.py),
# it is only valid if it is less than tokenizer.tokenizer.sp_model.vocab_size()
if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size(): # ty: ignore[unresolved-attribute, invalid-argument-type]
score = tokenizer.tokenizer.sp_model.get_score(token_id) # ty: ignore[unresolved-attribute]
if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size():
score = tokenizer.tokenizer.sp_model.get_score(token_id)
if token_id >= tokenizer.tokenizer.sp_model.vocab_size(): # ty: ignore[unresolved-attribute]
if token_id >= tokenizer.tokenizer.sp_model.vocab_size():
if piece in special_tokens:
toktype = SentencePieceTokenTypes.CONTROL
elif len(piece) == 0: # ty: ignore[invalid-argument-type]
elif len(piece) == 0:
text = f"[PAD{token_id}]".encode("utf-8")
toktype = SentencePieceTokenTypes.UNUSED
else:
@@ -10086,13 +10001,13 @@ class ChatGLMModel(TextModel):
continue
toktype = SentencePieceTokenTypes.NORMAL
if tokenizer.tokenizer.sp_model.is_unknown(token_id): # ty: ignore[unresolved-attribute]
if tokenizer.tokenizer.sp_model.is_unknown(token_id):
toktype = SentencePieceTokenTypes.UNKNOWN
elif tokenizer.tokenizer.sp_model.is_control(token_id): # ty: ignore[unresolved-attribute]
elif tokenizer.tokenizer.sp_model.is_control(token_id):
toktype = SentencePieceTokenTypes.CONTROL
elif tokenizer.tokenizer.sp_model.is_unused(token_id): # ty: ignore[unresolved-attribute]
elif tokenizer.tokenizer.sp_model.is_unused(token_id):
toktype = SentencePieceTokenTypes.UNUSED
elif tokenizer.tokenizer.sp_model.is_byte(token_id): # ty: ignore[unresolved-attribute]
elif tokenizer.tokenizer.sp_model.is_byte(token_id):
toktype = SentencePieceTokenTypes.BYTE
tokens.append(text)
@@ -10112,7 +10027,7 @@ class ChatGLMModel(TextModel):
@staticmethod
def token_bytes_to_string(b):
from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode # ty: ignore[unresolved-import]
from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
byte_encoder = bytes_to_unicode()
return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
@@ -10146,7 +10061,7 @@ class ChatGLMModel(TextModel):
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
vocab_size = hparams.get("padded_vocab_size",hparams["vocab_size"])
assert max(tokenizer.get_vocab().values()) < vocab_size # ty: ignore[unresolved-attribute]
assert max(tokenizer.get_vocab().values()) < vocab_size
tokens, toktypes, tokpre = self.get_vocab_base()
self.gguf_writer.add_tokenizer_model("gpt2")
@@ -10155,10 +10070,10 @@ class ChatGLMModel(TextModel):
self.gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
# only add special tokens when they were not already loaded from config.json
special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # ty: ignore[unresolved-attribute]
special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
# this one is usually not in config.json anyway
special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
special_vocab.add_to_gguf(self.gguf_writer)
def set_gguf_parameters(self):
@@ -11424,7 +11339,7 @@ class HunYuanMoEModel(TextModel):
# 2. Reverse-engineer the merges list from mergeable_ranks
merges = []
vocab = {}
mergeable_ranks = tokenizer.mergeable_ranks # ty: ignore[unresolved-attribute]
mergeable_ranks = tokenizer.mergeable_ranks
for token, rank in mergeable_ranks.items():
vocab[QwenModel.token_bytes_to_string(token)] = rank
if len(token) == 1:
@@ -11435,8 +11350,8 @@ class HunYuanMoEModel(TextModel):
# 3. Generate the tokens and toktypes lists
vocab_size = self.hparams["vocab_size"]
assert tokenizer.vocab_size == vocab_size # ty: ignore[unresolved-attribute]
special_tokens = tokenizer.special_tokens # ty: ignore[unresolved-attribute]
assert tokenizer.vocab_size == vocab_size
special_tokens = tokenizer.special_tokens
reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
tokens: list[str] = []
toktypes: list[int] = []
@@ -11660,7 +11575,7 @@ class HunYuanModel(TextModel):
# 2. Reverse-engineer the merges list from mergeable_ranks
merges = []
vocab = {}
mergeable_ranks = tokenizer.mergeable_ranks # ty: ignore[unresolved-attribute]
mergeable_ranks = tokenizer.mergeable_ranks
for token, rank in mergeable_ranks.items():
vocab[QwenModel.token_bytes_to_string(token)] = rank
if len(token) == 1:
@@ -11671,8 +11586,8 @@ class HunYuanModel(TextModel):
# 3. Generate the tokens and toktypes lists
vocab_size = self.hparams["vocab_size"]
assert tokenizer.vocab_size == vocab_size # ty: ignore[unresolved-attribute]
special_tokens = tokenizer.special_tokens # ty: ignore[unresolved-attribute]
assert tokenizer.vocab_size == vocab_size
special_tokens = tokenizer.special_tokens
reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
tokens: list[str] = []
toktypes: list[int] = []
@@ -12820,44 +12735,13 @@ class SolarOpenModel(Glm4MoeModel):
self.gguf_writer.add_tokenizer_pre(tokpre)
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)
special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<unk>"]) # ty: ignore[unresolved-attribute]
special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|startoftext|>"]) # ty: ignore[unresolved-attribute]
special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|endoftext|>"])
special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<unk>"])
special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|startoftext|>"])
special_vocab.add_to_gguf(self.gguf_writer)
@ModelBase.register("DotsOCRForCausalLM")
class DotsOCRVisionModel(MmprojModel):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
assert self.hparams_vision is not None
self.hparams_vision["image_size"] = 0 # dynamic resolution
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.DOTSOCR)
self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"])
self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"])
self.gguf_writer.add_vision_attention_layernorm_eps(self.find_vparam(["rms_norm_eps"]))
self.gguf_writer.add_vision_projector_scale_factor(self.find_vparam(["spatial_merge_size"]))
self.gguf_writer.add_vision_use_silu(True)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if name.startswith("vision_tower."):
if "vision_tower.blocks." in name and ".mlp." in name:
# note: to avoid naming conflicts in tensor_mapping.py, we need to handle FFN renaming here
# x = F.silu(self.fc1(x)) * self.fc3(x)
# x = self.fc2(x)
# fc1 -> gate, fc2 -> down, fc3 -> up
# mapping original names to Qwen2.5 naming scheme
name = name.replace("vision_tower.blocks.", "visual.blocks.")
name = name.replace(".fc1", ".gate_proj")
name = name.replace(".fc2", ".down_proj")
name = name.replace(".fc3", ".up_proj")
yield from super().modify_tensors(data_torch, name, bid)
###### CONVERSION LOGIC ######
@@ -13110,12 +12994,6 @@ def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> st
# For non-hf Mamba and Mamba2 models
arch = hparams["ssm_cfg"].get("layer", "Mamba") + "ForCausalLM"
# Step3-VL keeps text config under text_config but uses a custom top-level architecture.
# For text conversion we route to a dedicated text-only class.
# TODO: refactor this later to avoid adding exception here
if model_type == ModelType.TEXT and arch == "StepVLForConditionalGeneration":
return arch
# if "architectures" is found in the sub-config, use that instead
if model_type == ModelType.TEXT and text_config.get("architectures") is not None:
arch = text_config["architectures"][0]

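Several converters in this file lean on `QwenModel.token_bytes_to_string`, which wraps GPT-2's `bytes_to_unicode`. For reference, a C++ rendering of that mapping (a sketch; the authoritative version lives in `transformers`): visible ASCII and most printable Latin-1 bytes map to themselves, and every remaining byte is shifted to codepoint 256 and up so the vocab stays free of control characters:

```cpp
#include <map>

// byte value -> unicode codepoint, GPT-2 style
static std::map<int, int> bytes_to_unicode() {
    std::map<int, int> m;
    int n = 0;
    for (int b = 0; b < 256; ++b) {
        const bool printable = (b >= 0x21 && b <= 0x7E) ||   // '!'..'~'
                               (b >= 0xA1 && b <= 0xAC) ||   // inverted-! .. not-sign
                               (b >= 0xAE && b <= 0xFF);     // registered .. y-umlaut
        m[b] = printable ? b : 256 + n++;
    }
    return m;
}
```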

@@ -296,7 +296,7 @@ for model in [*pre_computed_hashes, *all_models]:
except Exception as e:
raise OSError(f"Error loading tokenizer for model {name}.") from e
chktok = tokenizer.encode(CHK_TXT) # ty: ignore[unresolved-attribute]
chktok = tokenizer.encode(CHK_TXT)
chkhsh = sha256(str(chktok).encode()).hexdigest()
logger.info(f"model: {name}")
@@ -468,7 +468,7 @@ for model in models:
with open(f"models/ggml-vocab-{name}.gguf.out", "w") as f:
for text in tests:
res = tokenizer.encode(text, add_special_tokens=False) # ty: ignore[unresolved-attribute]
res = tokenizer.encode(text, add_special_tokens=False)
for r in res:
f.write(f" {r}")
f.write("\n")


@@ -402,7 +402,7 @@ if __name__ == '__main__':
# the invocation string includes the "<|start_of_turn|>"
# token, but the adapters themselves were trained to
# activate _after_ that first token, so we drop it here.
alora_invocation_tokens = tokenizer(invocation_string)["input_ids"][1:] # ty: ignore[call-non-callable]
alora_invocation_tokens = tokenizer(invocation_string)["input_ids"][1:]
if alora_invocation_tokens:
logger.debug("GGUF KV: %s = %s", gguf.Keys.Adapter.ALORA_INVOCATION_TOKENS, alora_invocation_tokens)
self.gguf_writer.add_key_value(


@@ -37,7 +37,6 @@ llama-server -hf ggml-org/gemma-3-4b-it-GGUF --no-mmproj-offload
> - PaddleOCR-VL: https://github.com/ggml-org/llama.cpp/pull/18825
> - GLM-OCR: https://github.com/ggml-org/llama.cpp/pull/19677
> - Deepseek-OCR: https://github.com/ggml-org/llama.cpp/pull/17400
> - Dots.OCR: https://github.com/ggml-org/llama.cpp/pull/17575
> - HunyuanOCR: https://github.com/ggml-org/llama.cpp/pull/21395
## Pre-quantized models


@@ -9,7 +9,6 @@
#include <vector>
#include <filesystem>
#include <fstream>
#include <optional>
#include <regex>
static void print_usage(int /*argc*/, char ** argv) {
@@ -223,10 +222,7 @@ int main(int argc, char ** argv) {
llama_backend_init();
llama_numa_init(params.numa);
std::optional<base_callback_data> cb_data;
if (!params.save_logits) {
cb_data.emplace(params, params.tensor_filter);
}
base_callback_data cb_data(params, params.tensor_filter);
auto llama_init = common_init_from_params(params);

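The change above swaps conditional construction of `cb_data` through `std::optional` for a plain unconditional object. A minimal sketch of the two patterns (types and values illustrative):

```cpp
#include <optional>

struct callback_data {
    explicit callback_data(int /*cfg*/) {}
};

int main() {
    const bool save_logits = false;

    // before: defer construction until the condition is known
    std::optional<callback_data> maybe;
    if (!save_logits) {
        maybe.emplace(42);
    }

    // after: construct unconditionally
    callback_data always(42);
}
```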

@@ -53,10 +53,10 @@ model_name = os.path.basename(model_path)
print(f"Model name: {model_name}")
prompt = "Hello world today"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids # ty: ignore[call-non-callable]
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
print(f"Input tokens: {input_ids}")
print(f"Input text: {repr(prompt)}")
print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}") # ty: ignore[unresolved-attribute]
print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}")
with torch.no_grad():
outputs = model(input_ids, output_hidden_states=True)
@@ -92,7 +92,7 @@ with torch.no_grad():
# Print embeddings per token in the requested format
print("\nToken embeddings:")
tokens = tokenizer.convert_ids_to_tokens(input_ids[0]) # ty: ignore[unresolved-attribute]
tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
for i, embedding in enumerate(token_embeddings):
# Format: show first few values, ..., then last few values
if len(embedding) > 10:


@@ -207,8 +207,8 @@ def main():
else:
model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True)
encoded = tokenizer(prompt, return_tensors="pt") # ty: ignore[call-non-callable]
tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0]) # ty: ignore[unresolved-attribute]
encoded = tokenizer(prompt, return_tensors="pt")
tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])
n_tokens = len(tokens)
print(f"n_tokens: {n_tokens}");
print(f"hidden_size: {model.config.hidden_size}")


@@ -7,8 +7,6 @@ set(GGML_VERSION_MINOR 9)
set(GGML_VERSION_PATCH 11)
set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)
if(GIT_EXE)
# Get current git commit hash
@@ -206,14 +204,12 @@ option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM"
option(GGML_CUDA_FA "ggml: compile ggml FlashAttention CUDA kernels" ON)
option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
option(GGML_CUDA_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" ${GGML_CUDA_GRAPHS_DEFAULT})
option(GGML_CUDA_NCCL "ggml: use NVIDIA Collective Comm. Library" ON)
set (GGML_CUDA_COMPRESSION_MODE "size" CACHE STRING
"ggml: cuda link binary compression mode; requires cuda 12.8+")
set_property(CACHE GGML_CUDA_COMPRESSION_MODE PROPERTY STRINGS "none;speed;balance;size")
option(GGML_HIP "ggml: use HIP" OFF)
option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF)
option(GGML_HIP_RCCL "ggml: use ROCm Collective Comm. Library" OFF)
option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
option(GGML_HIP_MMQ_MFMA "ggml: enable MFMA MMA for CDNA in MMQ" ON)
@@ -227,6 +223,7 @@ option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug ou
option(GGML_VULKAN_SHADER_DEBUG_INFO "ggml: enable Vulkan shader debug info" OFF)
option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF)
option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
option(GGML_VULKAN_COPY_TESTS "ggml: run Vulkan cross-device copy benchmarks" OFF)
option(GGML_WEBGPU "ggml: use WebGPU" OFF)
option(GGML_WEBGPU_DEBUG "ggml: enable WebGPU debug output" OFF)
option(GGML_WEBGPU_CPU_PROFILE "ggml: enable WebGPU profiling (CPU)" OFF)


@@ -1,36 +0,0 @@
# cmake/FindNCCL.cmake
# NVIDIA does not distribute CMake files with NCCL, therefore use this file to find it instead.
find_path(NCCL_INCLUDE_DIR
NAMES nccl.h
HINTS ${NCCL_ROOT} $ENV{NCCL_ROOT} $ENV{CUDA_HOME} /usr/local/cuda
PATH_SUFFIXES include
)
find_library(NCCL_LIBRARY
NAMES nccl
HINTS ${NCCL_ROOT} $ENV{NCCL_ROOT} $ENV{CUDA_HOME} /usr/local/cuda
PATH_SUFFIXES lib lib64
)
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(NCCL
DEFAULT_MSG
NCCL_LIBRARY NCCL_INCLUDE_DIR
)
if(NCCL_FOUND)
set(NCCL_LIBRARIES ${NCCL_LIBRARY})
set(NCCL_INCLUDE_DIRS ${NCCL_INCLUDE_DIR})
if(NOT TARGET NCCL::NCCL)
add_library(NCCL::NCCL UNKNOWN IMPORTED)
set_target_properties(NCCL::NCCL PROPERTIES
IMPORTED_LOCATION "${NCCL_LIBRARY}"
INTERFACE_INCLUDE_DIRECTORIES "${NCCL_INCLUDE_DIR}"
)
endif()
endif()
mark_as_advanced(NCCL_INCLUDE_DIR NCCL_LIBRARY)

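This comparison drops `FindNCCL.cmake` together with the `GGML_CUDA_NCCL` option and the `ggml_backend_cuda_allreduce_tensor` entry point. For context, a sketch of the kind of all-reduce such an NCCL path delegates to (plain NCCL/CUDA API usage, not llama.cpp code):

```cpp
#include <cstddef>
#include <cuda_runtime.h>
#include <nccl.h>

// in-place sum of a float buffer across every rank in the communicator
static void allreduce_f32(float * dev_buf, size_t count,
                          ncclComm_t comm, cudaStream_t stream) {
    ncclAllReduce(dev_buf, dev_buf, count, ncclFloat, ncclSum, comm, stream);
    cudaStreamSynchronize(stream);  // wait for the collective to finish
}
```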

@@ -68,7 +68,7 @@ extern "C" {
GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
// tensor copy between different backends
GGML_API void ggml_backend_tensor_copy(const struct ggml_tensor * src, struct ggml_tensor * dst);
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
//
// Backend (stream)
@@ -83,17 +83,13 @@ extern "C" {
GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend);
GGML_API size_t ggml_backend_get_max_size(ggml_backend_t backend);
GGML_API void ggml_backend_tensor_set_async (ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get_async (ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_set_2d_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
GGML_API void ggml_backend_tensor_get_2d_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
// "offset" refers to the offset in tensor->data for setting/getting data
GGML_API void ggml_backend_tensor_set ( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get (const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_set_2d( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
GGML_API void ggml_backend_tensor_get_2d(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
GGML_API void ggml_backend_tensor_memset( struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_memset( struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
@@ -113,7 +109,7 @@ extern "C" {
// the copy is performed after all the currently queued operations in backend_src
// backend_dst will wait for the copy to complete before performing other operations
// automatic fallback to sync copy if async is not supported
GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);
GGML_API ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend);
@@ -139,9 +135,7 @@ extern "C" {
// integrated GPU device using host memory
GGML_BACKEND_DEVICE_TYPE_IGPU,
// accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
GGML_BACKEND_DEVICE_TYPE_ACCEL,
// "meta" device wrapping multiple other devices for tensor parallelism
GGML_BACKEND_DEVICE_TYPE_META,
GGML_BACKEND_DEVICE_TYPE_ACCEL
};
// functionality supported by the device
@@ -202,9 +196,7 @@ extern "C" {
// Common functions that may be obtained using ggml_backend_reg_get_proc_address
// AllReduce operation for tensor parallelism (meta backend)
typedef bool (*ggml_backend_allreduce_tensor_t)(ggml_backend_t * backends, struct ggml_tensor ** tensors, size_t n_backends);
// Split buffer type for tensor parallelism (old)
// Split buffer type for tensor parallelism
typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
// Set the number of threads for the backend
typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);

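The `*_2d` tensor-access functions leaving this header expressed a strided batch of copies; the truncated hunk at the end of this comparison shows the element-wise fallback. A buffer-level sketch of that decomposition (names illustrative):

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>

// one contiguous copy into the destination buffer
static void set_1d(uint8_t * dst_base, const void * src, size_t offset, size_t size) {
    std::memcpy(dst_base + offset, src, size);
}

// the 2-D form is n_copies strided 1-D copies: the source advances by
// stride_src and the destination offset by stride_dst on each iteration
static void set_2d(uint8_t * dst_base, const void * src, size_t offset, size_t size,
                   size_t n_copies, size_t stride_dst, size_t stride_src) {
    for (size_t i = 0; i < n_copies; i++) {
        set_1d(dst_base, (const uint8_t *) src + i * stride_src, offset + i * stride_dst, size);
    }
}
```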

@@ -27,9 +27,6 @@ GGML_BACKEND_API bool ggml_backend_is_cuda(ggml_backend_t backend);
// device buffer
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
// conduct allreduce operation between devices
GGML_BACKEND_API bool ggml_backend_cuda_allreduce_tensor(ggml_backend_t * backends, struct ggml_tensor ** tensors, size_t n_backends);
// split tensor buffer that splits matrices by rows across multiple devices
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split);


@@ -200,7 +200,6 @@ add_library(ggml-base
ggml.cpp
ggml-alloc.c
ggml-backend.cpp
ggml-backend-meta.cpp
ggml-opt.cpp
ggml-threading.cpp
ggml-threading.h


@@ -1236,9 +1236,6 @@ size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx,
ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
size_t nbytes_total = 0;
if (ggml_backend_buft_is_meta(buft)) {
return ggml_backend_meta_alloc_ctx_tensors_from_buft(ctx, buft);
}
return ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*no_alloc =*/ false);
}


@@ -49,10 +49,6 @@ extern "C" {
void (*memset_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
// (optional) 2d data copies
void (*set_tensor_2d)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
void (*get_tensor_2d)(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
// (optional) tensor copy: dst is in the buffer, src may be in any buffer, including buffers from a different backend (return false if not supported)
bool (*cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst);
// clear the entire buffer
@@ -84,20 +80,6 @@ extern "C" {
GGML_API bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
//
// Backend (meta)
//
GGML_API bool ggml_backend_is_meta (ggml_backend_t backend);
GGML_API bool ggml_backend_buffer_is_meta(ggml_backend_buffer_t buf);
GGML_API bool ggml_backend_buft_is_meta (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_meta_n_backends (ggml_backend_t meta_backend);
GGML_API ggml_backend_t ggml_backend_meta_simple_backend(ggml_backend_t meta_backend, size_t index);
// temporary workaround to statically allocate tensors from a context in a deduplicated way:
GGML_API struct ggml_backend_buffer * ggml_backend_meta_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
//
// Backend (stream)
//
@@ -108,10 +90,8 @@ extern "C" {
void (*free)(ggml_backend_t backend);
// (optional) asynchronous tensor data access
void (*set_tensor_async) (ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*get_tensor_async) (ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
void (*set_tensor_2d_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
void (*get_tensor_2d_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
// (optional) complete all pending operations (required if the backend supports async operations)
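When a backend leaves the optional 2d hooks NULL, the generic layer in ggml-backend.cpp below degrades to n_copies strided 1d transfers. A sketch of that equivalence, using only the public 1d setter; it mirrors the fallback loop in ggml_backend_tensor_set_2d:

```cpp
// Equivalent semantics of the optional 2d write: n_copies writes of `size`
// bytes each, with independent strides on the tensor and host sides.
static void tensor_set_2d_fallback(struct ggml_tensor * tensor, const void * data,
        size_t offset, size_t size, size_t n_copies,
        size_t stride_tensor, size_t stride_data) {
    for (size_t i = 0; i < n_copies; i++) {
        ggml_backend_tensor_set(tensor, (const char *) data + i*stride_data,
                                offset + i*stride_tensor, size);
    }
}
```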

File diff suppressed because it is too large.


@@ -123,7 +123,7 @@ size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
GGML_ASSERT(buffer);
// get_base is optional if the buffer is zero-sized
if (!ggml_backend_buffer_is_meta(buffer) && buffer->size == 0) {
if (buffer->size == 0) {
return NULL;
}
@@ -279,57 +279,15 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten
}
}
void ggml_backend_tensor_set_2d_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size,
size_t n_copies, size_t stride_tensor, size_t stride_data) {
GGML_ASSERT(backend);
GGML_ASSERT(tensor);
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
if (n_copies <= 1 || backend->iface.set_tensor_2d_async == NULL) {
for (size_t i = 0; i < n_copies; i++) {
ggml_backend_tensor_set_async(backend, tensor, (const char *) data + i*stride_data, offset + i*stride_tensor, size);
}
return;
}
if (size == 0) {
return;
}
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(offset + (n_copies-1)*stride_tensor + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
backend->iface.set_tensor_2d_async(backend, tensor, data, offset, size, n_copies, stride_tensor, stride_data);
}
void ggml_backend_tensor_get_2d_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size,
size_t n_copies, size_t stride_tensor, size_t stride_data) {
GGML_ASSERT(backend);
GGML_ASSERT(tensor);
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
if (n_copies <= 1 || backend->iface.get_tensor_2d_async == NULL) {
for (size_t i = 0; i < n_copies; i++) {
ggml_backend_tensor_get_async(backend, tensor, (char *) data + i*stride_data, offset + i*stride_tensor, size);
}
return;
}
if (size == 0) {
return;
}
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(offset + (n_copies-1)*stride_tensor + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
backend->iface.get_tensor_2d_async(backend, tensor, data, offset, size, n_copies, stride_tensor, stride_data);
}
void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
GGML_ASSERT(tensor);
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
GGML_ASSERT(buf != NULL && "tensor buffer not set");
if (size == 0) {
return;
}
GGML_ASSERT(buf != NULL && "tensor buffer not set");
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
@@ -339,62 +297,18 @@ void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, siz
void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
GGML_ASSERT(tensor);
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
GGML_ASSERT(buf != NULL && "tensor buffer not set");
if (size == 0) {
return;
}
GGML_ASSERT(buf != NULL && "tensor buffer not set");
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
buf->iface.get_tensor(buf, tensor, data, offset, size);
}
void ggml_backend_tensor_set_2d(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size,
size_t n_copies, size_t stride_tensor, size_t stride_data) {
GGML_ASSERT(tensor);
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
GGML_ASSERT(buf != NULL && "tensor buffer not set");
if (n_copies <= 1 || buf->iface.set_tensor_2d == NULL) {
for (size_t i = 0; i < n_copies; i++) {
ggml_backend_tensor_set(tensor, (const char *) data + i*stride_data, offset + i*stride_tensor, size);
}
return;
}
if (size == 0) {
return;
}
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(offset + (n_copies-1)*stride_tensor + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
buf->iface.set_tensor_2d(buf, tensor, data, offset, size, n_copies, stride_tensor, stride_data);
}
void ggml_backend_tensor_get_2d(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size,
size_t n_copies, size_t stride_tensor, size_t stride_data) {
GGML_ASSERT(tensor);
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
GGML_ASSERT(buf != NULL && "tensor buffer not set");
if (n_copies <= 1 || buf->iface.get_tensor_2d == NULL) {
for (size_t i = 0; i < n_copies; i++) {
ggml_backend_tensor_get(tensor, (char *) data + i*stride_data, offset + i*stride_tensor, size);
}
return;
}
if (size == 0) {
return;
}
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(offset + (n_copies-1)*stride_tensor + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
buf->iface.get_tensor_2d(buf, tensor, data, offset, size, n_copies, stride_tensor, stride_data);
}
void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
GGML_ASSERT(tensor);
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
@@ -474,7 +388,7 @@ ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {
// backend copy
void ggml_backend_tensor_copy(const struct ggml_tensor * src, struct ggml_tensor * dst) {
void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
if (src == dst) {
@@ -488,7 +402,7 @@ void ggml_backend_tensor_copy(const struct ggml_tensor * src, struct ggml_tensor
} else if (!ggml_backend_buffer_copy_tensor(src, dst)) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
#endif // NDEBUG
#endif
size_t nbytes = ggml_nbytes(src);
void * data = malloc(nbytes);
ggml_backend_tensor_get(src, data, 0, nbytes);
@@ -497,7 +411,7 @@ void ggml_backend_tensor_copy(const struct ggml_tensor * src, struct ggml_tensor
}
}
void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst) {
void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst) {
GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
if (src == dst) {
@@ -586,7 +500,6 @@ enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device) {
}
void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props) {
GGML_ASSERT(device);
memset(props, 0, sizeof(*props));
device->iface.get_props(device, props);
}
@@ -697,8 +610,6 @@ static const struct ggml_backend_buffer_i ggml_backend_multi_buffer_i = {
/* .memset_tensor = */ NULL,
/* .set_tensor = */ NULL,
/* .get_tensor = */ NULL,
/* .set_tensor_2d = */ NULL,
/* .get_tensor_2d = */ NULL,
/* .cpy_tensor = */ NULL,
/* .clear = */ ggml_backend_multi_buffer_clear,
/* .reset = */ NULL,
@@ -1988,9 +1899,8 @@ enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct
GGML_ASSERT(tensor->data == NULL);
GGML_ASSERT(tensor->view_src == NULL);
GGML_ASSERT(addr >= ggml_backend_buffer_get_base(buffer));
GGML_ASSERT(ggml_backend_buffer_is_meta(buffer) ||
(char *) addr + ggml_backend_buffer_get_alloc_size(buffer, tensor) <=
(char *) ggml_backend_buffer_get_base(buffer) + ggml_backend_buffer_get_size(buffer));
GGML_ASSERT((char *)addr + ggml_backend_buffer_get_alloc_size(buffer, tensor) <=
(char *)ggml_backend_buffer_get_base(buffer) + ggml_backend_buffer_get_size(buffer));
tensor->buffer = buffer;
tensor->data = addr;
@@ -2264,8 +2174,6 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
/* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
/* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
/* .set_tensor_2d = */ NULL,
/* .get_tensor_2d = */ NULL,
/* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
/* .clear = */ ggml_backend_cpu_buffer_clear,
/* .reset = */ NULL,
@@ -2278,8 +2186,6 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
/* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
/* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
/* .set_tensor_2d = */ NULL,
/* .get_tensor_2d = */ NULL,
/* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
/* .clear = */ ggml_backend_cpu_buffer_clear,
/* .reset = */ NULL,


@@ -262,8 +262,6 @@ static struct ggml_backend_i blas_backend_i = {
/* .get_name = */ ggml_backend_blas_get_name,
/* .free = */ ggml_backend_blas_free,
/* .set_tensor_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .get_tensor_2d_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .synchronize = */ NULL,


@@ -1457,8 +1457,6 @@ static const ggml_backend_buffer_i ggml_backend_cann_buffer_interface = {
/* .memset_tensor = */ NULL,
/* .set_tensor = */ ggml_backend_cann_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_cann_buffer_get_tensor,
/* .set_tensor_2d = */ NULL,
/* .get_tensor_2d = */ NULL,
/* .cpy_tensor = */ ggml_backend_cann_buffer_cpy_tensor,
/* .clear = */ ggml_backend_cann_buffer_clear,
/* .reset = */ NULL,
@@ -2700,8 +2698,6 @@ static const ggml_backend_i ggml_backend_cann_interface = {
/* .free = */ ggml_backend_cann_free,
/* .set_tensor_async = */ ggml_backend_cann_set_tensor_async,
/* .get_tensor_async = */ ggml_backend_cann_get_tensor_async,
/* .set_tensor_2d_async = */ NULL,
/* .get_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ ggml_backend_cann_cpy_tensor_async,
/* .synchronize = */ ggml_backend_cann_synchronize,
/* .graph_plan_create = */ NULL,


@@ -111,8 +111,6 @@ static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
/* .memset_tensor = */ ggml_backend_amx_buffer_memset_tensor,
/* .set_tensor = */ ggml_backend_amx_buffer_set_tensor,
/* .get_tensor = */ nullptr,
/* .set_tensor_2d = */ nullptr,
/* .get_tensor_2d = */ nullptr,
/* .cpy_tensor = */ nullptr,
/* .clear = */ ggml_backend_amx_buffer_clear,
/* .reset = */ nullptr,


@@ -195,8 +195,6 @@ static const struct ggml_backend_i ggml_backend_cpu_i = {
/* .free = */ ggml_backend_cpu_free,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .get_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,


@@ -181,16 +181,6 @@ if (CUDAToolkit_FOUND)
target_link_libraries(ggml-cuda PRIVATE CUDA::cuda_driver)
endif()
if (GGML_CUDA_NCCL)
find_package(NCCL)
if (NCCL_FOUND)
add_compile_definitions(GGML_USE_NCCL)
target_link_libraries(ggml-cuda PRIVATE NCCL::NCCL)
else()
message(STATUS "Warning: NCCL not found, performance for multiple CUDA GPUs will be suboptimal")
endif()
endif()
set(CUDA_CXX_FLAGS "")
set(CUDA_FLAGS -use_fast_math -extended-lambda)


@@ -60,24 +60,24 @@ void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
if (order == GGML_SORT_ORDER_ASC) {
if (nrows == 1) {
CUDA_CHECK(DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, temp_keys, temp_keys, // keys (in-place)
DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, temp_keys, temp_keys, // keys (in-place)
temp_indices, dst, // values (indices)
ncols, 0, sizeof(float) * 8, stream));
ncols, 0, sizeof(float) * 8, stream);
} else {
CUDA_CHECK(DeviceSegmentedSort::SortPairs(nullptr, temp_storage_bytes, temp_keys, temp_keys, // keys (in-place)
DeviceSegmentedSort::SortPairs(nullptr, temp_storage_bytes, temp_keys, temp_keys, // keys (in-place)
temp_indices, dst, // values (indices)
ncols * nrows, nrows, // num items, num segments
offset_iterator, offset_iterator + 1, stream));
offset_iterator, offset_iterator + 1, stream);
}
} else {
if (nrows == 1) {
CUDA_CHECK(DeviceRadixSort::SortPairsDescending(nullptr, temp_storage_bytes, temp_keys, temp_keys, // keys (in-place)
DeviceRadixSort::SortPairsDescending(nullptr, temp_storage_bytes, temp_keys, temp_keys, // keys (in-place)
temp_indices, dst, // values (indices)
ncols, 0, sizeof(float) * 8, stream));
ncols, 0, sizeof(float) * 8, stream);
} else {
CUDA_CHECK(DeviceSegmentedSort::SortPairsDescending(nullptr, temp_storage_bytes, temp_keys, temp_keys, temp_indices,
DeviceSegmentedSort::SortPairsDescending(nullptr, temp_storage_bytes, temp_keys, temp_keys, temp_indices,
dst, ncols * nrows, nrows, offset_iterator, offset_iterator + 1,
stream));
stream);
}
}
@@ -86,22 +86,22 @@ void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
if (order == GGML_SORT_ORDER_ASC) {
if (nrows == 1) {
CUDA_CHECK(DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys, // keys (in-place)
DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys, // keys (in-place)
temp_indices, dst, // values (indices)
ncols, 0, sizeof(float) * 8, stream));
ncols, 0, sizeof(float) * 8, stream);
} else {
CUDA_CHECK(DeviceSegmentedSort::SortPairs(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys, temp_indices, dst,
ncols * nrows, nrows, offset_iterator, offset_iterator + 1, stream));
DeviceSegmentedSort::SortPairs(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys, temp_indices, dst,
ncols * nrows, nrows, offset_iterator, offset_iterator + 1, stream);
}
} else {
if (nrows == 1) {
CUDA_CHECK(DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys, // keys (in-place)
DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys, // keys (in-place)
temp_indices, dst, // values (indices)
ncols, 0, sizeof(float) * 8, stream));
ncols, 0, sizeof(float) * 8, stream);
} else {
CUDA_CHECK(DeviceSegmentedSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys,
DeviceSegmentedSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys,
temp_indices, dst, ncols * nrows, nrows, offset_iterator,
offset_iterator + 1, stream));
offset_iterator + 1, stream);
}
}
}
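All of the calls above follow CUB's two-phase dispatch convention: a first call with a null temp-storage pointer only computes temp_storage_bytes, and a second call with the allocated buffer does the actual work. A standalone sketch of the pattern with a plain ascending sort (names illustrative, error handling elided):

```cpp
#include <cub/cub.cuh>

// Two-phase CUB dispatch: query the temp storage size, allocate, then run.
static void radix_sort_pairs(const float * keys_in, float * keys_out,
                             const int * vals_in, int * vals_out,
                             int n, cudaStream_t stream) {
    size_t temp_bytes = 0;
    cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, keys_in, keys_out,
                                    vals_in, vals_out, n, 0, sizeof(float)*8, stream);
    void * d_temp = nullptr;
    cudaMalloc(&d_temp, temp_bytes);
    cub::DeviceRadixSort::SortPairs(d_temp, temp_bytes, keys_in, keys_out,
                                    vals_in, vals_out, n, 0, sizeof(float)*8, stream);
    cudaFree(d_temp);
}
```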


@@ -67,7 +67,6 @@
#define GGML_CUDA_CC_CDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x908) // MI100, minimum for MFMA, acc registers
#define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x90a) // MI210 (gfx90a), minimum acc register renaming
#define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x942) // MI300
#define GGML_CUDA_CC_CDNA4 (GGML_CUDA_CC_OFFSET_AMD + 0x950) // MI350X/MI355X
// RDNA removes MFMA, dp4a, xnack, acc registers, wave size is 32
#define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000
@@ -88,8 +87,7 @@
#define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_RDNA1)
#define GGML_CUDA_CC_IS_CDNA1(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_CDNA2)
#define GGML_CUDA_CC_IS_CDNA2(cc) (cc >= GGML_CUDA_CC_CDNA2 && cc < GGML_CUDA_CC_CDNA3)
#define GGML_CUDA_CC_IS_CDNA3(cc) (cc >= GGML_CUDA_CC_CDNA3 && cc < GGML_CUDA_CC_CDNA4)
#define GGML_CUDA_CC_IS_CDNA4(cc) (cc >= GGML_CUDA_CC_CDNA4 && cc < GGML_CUDA_CC_RDNA1)
#define GGML_CUDA_CC_IS_CDNA3(cc) (cc >= GGML_CUDA_CC_CDNA3 && cc < GGML_CUDA_CC_RDNA1)
// Moore Threads
#define MUSART_HMASK 40300 // MUSA rc4.3, min. ver. for half2 -> uint mask comparisons
@@ -188,10 +186,6 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)
#ifdef GGML_USE_NCCL
#define NCCL_CHECK(err) CUDA_CHECK_GEN(err, ncclSuccess, ncclGetErrorString)
#endif // GGML_USE_NCCL
#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
static const char * cu_get_error_str(CUresult err) {
const char * err_str;
@@ -1092,10 +1086,6 @@ struct ggml_cuda_device_info {
cuda_device_info devices[GGML_CUDA_MAX_DEVICES] = {};
std::array<float, GGML_CUDA_MAX_DEVICES> default_tensor_split = {};
#ifdef GGML_USE_NCCL
ncclComm_t comms[GGML_CUDA_MAX_DEVICES];
#endif // GGML_USE_NCCL
};
const ggml_cuda_device_info & ggml_cuda_info();
@@ -1183,11 +1173,7 @@ struct ggml_cuda_graph {
std::vector<cudaGraphNode_t> nodes;
bool disable_due_to_gpu_arch = false;
bool warmup_complete = false;
struct node_properties {
ggml_tensor node;
void * node_src_data_ptrs[GGML_MAX_SRC];
};
std::vector<node_properties> node_props;
std::vector<ggml_tensor> nodes_copy;
bool is_enabled() const {
static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);


@@ -324,28 +324,6 @@ static ggml_cuda_device_info ggml_cuda_init() {
// configure logging to stdout
// CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
for (int id = 0; id < info.device_count; ++id) {
ggml_cuda_set_device(id);
for (int id_other = 0; id_other < info.device_count; ++id_other) {
if (id == id_other) {
continue;
}
int can_access_peer;
CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
if (can_access_peer) {
CUDA_CHECK(cudaDeviceEnablePeerAccess(id_other, 0));
}
}
}
#ifdef GGML_USE_NCCL
int dev_ids[GGML_CUDA_MAX_DEVICES];
for (int id = 0; id < info.device_count; ++id) {
dev_ids[id] = id;
}
NCCL_CHECK(ncclCommInitAll(info.comms, info.device_count, dev_ids));
#endif // GGML_USE_NCCL
return info;
}
@@ -654,46 +632,26 @@ static enum ggml_status ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer
}
static void ggml_backend_cuda_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *) buffer->context;
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
ggml_cuda_set_device(ctx->device);
CUDA_CHECK(cudaMemsetAsync((char *) tensor->data + offset, value, size, cudaStreamPerThread));
CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + offset, value, size, cudaStreamPerThread));
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
}
static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *) buffer->context;
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
ggml_cuda_set_device(ctx->device);
CUDA_CHECK(cudaMemcpyAsync((char *) tensor->data + offset, data, size, cudaMemcpyHostToDevice, cudaStreamPerThread));
CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cudaStreamPerThread));
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
}
static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *) buffer->context;
ggml_cuda_set_device(ctx->device);
CUDA_CHECK(cudaMemcpyAsync(data, (const char *) tensor->data + offset, size, cudaMemcpyDeviceToHost, cudaStreamPerThread));
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
}
static void ggml_backend_cuda_buffer_set_tensor_2d(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data,
size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *) buffer->context;
ggml_cuda_set_device(ctx->device);
CUDA_CHECK(cudaMemcpy2DAsync(
(char *) tensor->data + offset, stride_tensor, data, stride_data, size, n_copies, cudaMemcpyHostToDevice, cudaStreamPerThread));
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
}
static void ggml_backend_cuda_buffer_get_tensor_2d(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data,
size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
ggml_cuda_set_device(ctx->device);
CUDA_CHECK(cudaMemcpy2DAsync(
data, stride_data, (const char *) tensor->data + offset, stride_tensor, size, n_copies, cudaMemcpyDeviceToHost, cudaStreamPerThread));
CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cudaStreamPerThread));
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
}
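The removed 2d paths above map the generic arguments directly onto cudaMemcpy2DAsync: size becomes the row width, n_copies the row count, and the two strides become the destination and source pitches. A sketch for the host-to-device direction, with dev_base standing in for the tensor's device allocation:

```cpp
#include <cuda_runtime.h>

// Sketch of the (size, n_copies, stride_tensor, stride_data) -> 2d-copy mapping.
static void copy_2d_h2d(void * dev_base, const void * host_data,
        size_t offset, size_t size, size_t n_copies,
        size_t stride_tensor, size_t stride_data, cudaStream_t stream) {
    cudaMemcpy2DAsync((char *) dev_base + offset, /*dpitch =*/ stride_tensor,
                      host_data,                  /*spitch =*/ stride_data,
                      /*width  =*/ size,          /*height =*/ n_copies,
                      cudaMemcpyHostToDevice, stream);
}
```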
@@ -733,8 +691,6 @@ static const ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
/* .memset_tensor = */ ggml_backend_cuda_buffer_memset_tensor,
/* .set_tensor = */ ggml_backend_cuda_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor,
/* .set_tensor_2d = */ ggml_backend_cuda_buffer_set_tensor_2d,
/* .get_tensor_2d = */ ggml_backend_cuda_buffer_get_tensor_2d,
/* .cpy_tensor = */ ggml_backend_cuda_buffer_cpy_tensor,
/* .clear = */ ggml_backend_cuda_buffer_clear,
/* .reset = */ NULL,
@@ -1047,8 +1003,6 @@ static const ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
/* .memset_tensor = */ NULL,
/* .set_tensor = */ ggml_backend_cuda_split_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_cuda_split_buffer_get_tensor,
/* .set_tensor_2d = */ NULL,
/* .get_tensor_2d = */ NULL,
/* .cpy_tensor = */ NULL,
/* .clear = */ ggml_backend_cuda_split_buffer_clear,
/* .reset = */ NULL,
@@ -1125,83 +1079,6 @@ static const ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_inte
/* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
};
bool ggml_backend_cuda_allreduce_tensor(ggml_backend_t * backends, struct ggml_tensor ** tensors, size_t n_backends) {
#ifdef GGML_USE_NCCL
const int64_t ne = ggml_nelements(tensors[0]);
// FIXME the input of llm_graph_context::build_in_out_ids can produce a tensor with 0 elements if n_outputs == 0
// This then causes a crash in this function
if (ne == 0) {
return true;
}
for (size_t i = 0; i < n_backends; ++i) {
GGML_ASSERT(tensors[i] != nullptr);
GGML_ASSERT(ggml_nelements(tensors[i]) == ne);
GGML_ASSERT(ggml_is_contiguously_allocated(tensors[i]));
}
const ggml_cuda_device_info info = ggml_cuda_info();
// For small tensors, simply reduce them as FP32.
// The following heuristic for how "small" a tensor should be is based on RTX 4090s connected via 16x PCIe 4.0.
if ((n_backends <= 2 && ne < 32768) || (n_backends == 3 && ne < 131072) || (n_backends >= 4 && ne < 262144)) {
NCCL_CHECK(ncclGroupStart());
for (size_t i = 0; i < n_backends; ++i) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backends[i]->context;
NCCL_CHECK(ncclAllReduce(tensors[i]->data, tensors[i]->data, ne, ncclFloat, ncclSum, info.comms[cuda_ctx->device], cuda_ctx->stream()));
}
NCCL_CHECK(ncclGroupEnd());
return true;
}
// For large tensors it's faster to compress them to BF16 for the reduction:
to_bf16_cuda_t to_bf16 = ggml_get_to_bf16_cuda(GGML_TYPE_F32);
to_fp32_cuda_t to_fp32 = ggml_get_to_fp32_cuda(GGML_TYPE_BF16);
ggml_cuda_pool_alloc<nv_bfloat16> tmp[GGML_CUDA_MAX_DEVICES];
for (size_t i = 0; i < n_backends; ++i) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backends[i]->context;
tmp[i].pool = &cuda_ctx->pool();
tmp[i].alloc(ne);
ggml_cuda_set_device(i);
to_bf16(tensors[i]->data, tmp[i].get(), ne, cuda_ctx->stream());
CUDA_CHECK(cudaGetLastError());
}
NCCL_CHECK(ncclGroupStart());
for (size_t i = 0; i < n_backends; ++i) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backends[i]->context;
NCCL_CHECK(ncclAllReduce(tmp[i].get(), tmp[i].get(), ne, ncclBfloat16, ncclSum, info.comms[cuda_ctx->device], cuda_ctx->stream()));
}
NCCL_CHECK(ncclGroupEnd());
for (size_t i = 0; i < n_backends; ++i) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backends[i]->context;
ggml_cuda_set_device(i);
to_fp32(tmp[i].get(), (float *) tensors[i]->data, ne, cuda_ctx->stream());
CUDA_CHECK(cudaGetLastError());
}
return true;
#else
// If NCCL is installed it is used by default for optimal performance.
// However, NVIDIA does not distribute NCCL with CUDA so users may be unwittingly missing this package.
// RCCL is disabled by default, users are explicitly opting in.
// Therefore print no warning for RCCL.
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
static bool warning_printed = false;
if (!warning_printed) {
GGML_LOG_WARN("%s: NVIDIA Collective Communications Library (NCCL) is unavailable, multi-GPU performance will be suboptimal\n", __func__);
warning_printed = true;
}
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
GGML_UNUSED_VARS(backends, tensors, n_backends);
return false;
#endif // GGML_USE_NCCL
}
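The core of the small-tensor path above is NCCL's grouped launch: every per-device call issued between ncclGroupStart and ncclGroupEnd is submitted as a single collective. A minimal in-place FP32 sketch, assuming one communicator, stream and buffer per device (setup code omitted):

```cpp
#include <cstdint>
#include <cuda_runtime.h>
#include <nccl.h>

// In-place sum all-reduce over n_devices buffers of ne floats each (sketch).
static void allreduce_f32(float ** bufs, int64_t ne, int n_devices,
                          ncclComm_t * comms, cudaStream_t * streams) {
    ncclGroupStart();
    for (int i = 0; i < n_devices; ++i) {
        cudaSetDevice(i);
        ncclAllReduce(bufs[i], bufs[i], ne, ncclFloat, ncclSum, comms[i], streams[i]);
    }
    ncclGroupEnd();
}
```

The BF16 branch wraps the same grouped call with a down-convert before and an up-convert after the reduction, trading two element-wise kernels for half the interconnect traffic.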
ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split) {
static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);
@@ -1548,6 +1425,64 @@ static void ggml_cuda_op_mul_mat_cublas(
GGML_UNUSED_VARS(dst, src1_ddq_i, src1_padded_row_size);
}
static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
static bool peer_access_enabled = false;
const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE;
if (peer_access_enabled == enable_peer_access) {
return;
}
#ifdef NDEBUG
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
ggml_cuda_set_device(id);
CUDA_CHECK(cudaDeviceSynchronize());
}
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
ggml_cuda_set_device(id);
for (int id_other = 0; id_other < ggml_backend_cuda_get_device_count(); ++id_other) {
if (id == id_other) {
continue;
}
if (id != main_device && id_other != main_device) {
continue;
}
int can_access_peer;
CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
if (can_access_peer) {
if (enable_peer_access) {
cudaError_t err = cudaDeviceEnablePeerAccess(id_other, 0);
if (err != cudaErrorPeerAccessAlreadyEnabled) {
CUDA_CHECK(err);
} else {
// reset the error
(void)cudaGetLastError();
}
} else {
cudaError_t err = cudaDeviceDisablePeerAccess(id_other);
if (err != cudaErrorPeerAccessNotEnabled) {
CUDA_CHECK(err);
} else {
// reset the error
(void)cudaGetLastError();
}
}
}
}
}
ggml_cuda_set_device(main_device);
#endif // NDEBUG
peer_access_enabled = enable_peer_access;
GGML_UNUSED(main_device);
}
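One subtlety in the enable branch above: cudaDeviceEnablePeerAccess operates on the current device and returns cudaErrorPeerAccessAlreadyEnabled on repeat calls, and that error must be consumed so it does not poison later checks. Reduced to its core (a sketch; CUDA_CHECK is this file's own macro):

```cpp
// Enable access from the current device to `peer`, tolerating repeat calls.
static void enable_peer_access(int peer) {
    int can_access = 0;
    int device     = 0;
    CUDA_CHECK(cudaGetDevice(&device));
    CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access, device, peer));
    if (!can_access) {
        return;
    }
    cudaError_t err = cudaDeviceEnablePeerAccess(peer, 0);
    if (err == cudaErrorPeerAccessAlreadyEnabled) {
        (void) cudaGetLastError(); // clear the sticky error, not a real failure
    } else {
        CUDA_CHECK(err);
    }
}
```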
static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
void * dst, int dstDevice, size_t dpitch, void * src, int srcDevice, size_t spitch, size_t width, size_t height, cudaStream_t stream) {
@@ -2548,6 +2483,11 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
}
static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) {
// why is this here instead of mul_mat?
if (dst->src[0] != nullptr && ggml_backend_buft_is_cuda_split(dst->src[0]->buffer->buft)) {
ggml_cuda_set_peer_access(dst->src[1]->ne[1], ctx.device);
}
switch (dst->op) {
case GGML_OP_ARGMAX:
ggml_cuda_argmax(ctx, dst);
@@ -2905,43 +2845,21 @@ static void ggml_backend_cuda_free(ggml_backend_t backend) {
}
static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
CUDA_CHECK(cudaMemcpyAsync((char *) tensor->data + offset, data, size, cudaMemcpyHostToDevice, cuda_ctx->stream()));
CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cuda_ctx->stream()));
}
static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
CUDA_CHECK(cudaMemcpyAsync(data, (const char *) tensor->data + offset, size, cudaMemcpyDeviceToHost, cuda_ctx->stream()));
}
static void ggml_backend_cuda_set_tensor_2d_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data,
size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
CUDA_CHECK(cudaMemcpy2DAsync(
(char *) tensor->data + offset, stride_tensor, data, stride_data, size, n_copies, cudaMemcpyHostToDevice, cuda_ctx->stream()));
}
static void ggml_backend_cuda_get_tensor_2d_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data,
size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
CUDA_CHECK(cudaMemcpy2DAsync(
data, stride_data, (const char *) tensor->data + offset, stride_tensor, size, n_copies, cudaMemcpyDeviceToHost, cuda_ctx->stream()));
CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cuda_ctx->stream()));
}
static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
@@ -2952,21 +2870,21 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
return false;
}
if (!ggml_backend_buffer_is_cuda(buf_src) || !ggml_backend_buffer_is_cuda(buf_dst)) {
if (!ggml_backend_buffer_is_cuda(src->buffer) || !ggml_backend_buffer_is_cuda(dst->buffer)) {
return false;
}
// device -> device copy
ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *) backend_src->context;
ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *) backend_dst->context;
ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *)backend_src->context;
ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context;
ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *) buf_src->context;
ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *) buf_dst->context;
ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;
if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: backend and buffer devices do not match\n", __func__);
#endif // NDEBUG
#endif
return false;
}
@@ -2979,7 +2897,7 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
return false;
#else
CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream()));
#endif // GGML_CUDA_NO_PEER_COPY
#endif
}
// record event on src stream after the copy
@@ -3061,25 +2979,18 @@ static bool ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx
ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key);
// Check if the graph size has changed
if ((int)graph->node_props.size() != cgraph->n_nodes) {
if ((int)graph->nodes_copy.size() != cgraph->n_nodes) {
res = true;
graph->node_props.resize(cgraph->n_nodes);
graph->nodes_copy.resize(cgraph->n_nodes);
}
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_cuda_graph::node_properties prop = {};
memcpy(&prop.node, cgraph->nodes[i], sizeof(ggml_tensor));
// if the backend scheduler is making copies of CPU tensors, the src pointers can be the same but with different data, see:
// https://github.com/ggml-org/llama.cpp/pull/21472#discussion_r3052235188
for (int j = 0; j < GGML_MAX_SRC; ++j) {
prop.node_src_data_ptrs[j] = cgraph->nodes[i]->src[j] ? cgraph->nodes[i]->src[j]->data : nullptr;
if (!res) {
if (memcmp(&graph->nodes_copy[i], cgraph->nodes[i], sizeof(ggml_tensor)) != 0) {
res = true;
}
}
if (!res && memcmp(&graph->node_props[i], &prop, sizeof(prop)) != 0) {
res = true;
}
graph->node_props[i] = prop;
memcpy(&graph->nodes_copy[i], cgraph->nodes[i], sizeof(ggml_tensor));
}
return res;
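Two change-detection strategies meet in this hunk: one side snapshots each node's ggml_tensor and compares it bytewise, the other additionally captures the data pointers of every source, because scheduler-made copies of CPU tensors can reuse the same src pointer with different contents (see the linked discussion). A sketch of the pointer-augmented comparison (names illustrative):

```cpp
#include <cstring>
#include "ggml.h"

// A node is unchanged only if its tensor bytes AND the captured data pointers
// of its sources both match the previous launch.
struct node_snapshot {
    ggml_tensor node;
    void *      src_data[GGML_MAX_SRC];
};

static bool node_changed(const node_snapshot & prev, const ggml_tensor * cur) {
    if (memcmp(&prev.node, cur, sizeof(ggml_tensor)) != 0) {
        return true;
    }
    for (int j = 0; j < GGML_MAX_SRC; ++j) {
        void * p = cur->src[j] ? cur->src[j]->data : nullptr;
        if (prev.src_data[j] != p) {
            return true; // same op and shape, different source buffer
        }
    }
    return false;
}
```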
@@ -4425,8 +4336,6 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
/* .free = */ ggml_backend_cuda_free,
/* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
/* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
/* .set_tensor_2d_async = */ ggml_backend_cuda_set_tensor_2d_async,
/* .get_tensor_2d_async = */ ggml_backend_cuda_get_tensor_2d_async,
/* .cpy_tensor_async = */ ggml_backend_cuda_cpy_tensor_async,
/* .synchronize = */ ggml_backend_cuda_synchronize,
/* .graph_plan_create = */ NULL,
@@ -5214,9 +5123,6 @@ static ggml_backend_feature * ggml_backend_cuda_get_features(ggml_backend_reg_t
static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
GGML_UNUSED(reg);
if (strcmp(name, "ggml_backend_allreduce_tensor") == 0) {
return (void *)ggml_backend_cuda_allreduce_tensor;
}
if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
return (void *)ggml_backend_cuda_split_buffer_type;
}


@@ -1025,8 +1025,7 @@ namespace ggml_cuda_mma {
const floatx2_t& a_frag = reinterpret_cast<const floatx2_t&>(A.x[0]);
const floatx2_t& b_frag = reinterpret_cast<const floatx2_t&>(B.x[0]);
acc_frag = __builtin_amdgcn_mfma_f32_16x16x8_xf32(a_frag, b_frag, acc_frag, 0, 0, 0);
#elif defined(CDNA4) || defined(CDNA2) || defined(CDNA1)
// CDNA4 (gfx950) does not support xf32 MFMA, use f32 path like CDNA2/CDNA1
#elif defined(CDNA2) || defined(CDNA1)
#pragma unroll
for (int i = 0; i < 2; ++i) {
acc_frag = __builtin_amdgcn_mfma_f32_16x16x4f32(A.x[i], B.x[i], acc_frag, 0, 0, 0);
@@ -1188,7 +1187,7 @@ namespace ggml_cuda_mma {
#elif defined(AMD_MFMA_AVAILABLE)
using floatx4_t = __attribute__((ext_vector_type(4))) float;
floatx4_t& acc_frag = reinterpret_cast<floatx4_t&>(D.x[0]);
#if defined(CDNA4) || defined(CDNA3) || defined(CDNA2)
#if defined(CDNA3) || defined(CDNA2)
using bf16x4_t = __attribute__((ext_vector_type(4))) __bf16;
const bf16x4_t& a_frag = reinterpret_cast<const bf16x4_t&>(A.x[0]);
const bf16x4_t& b_frag = reinterpret_cast<const bf16x4_t&>(B.x[0]);
@@ -1217,12 +1216,12 @@ namespace ggml_cuda_mma {
#if defined(AMD_MFMA_AVAILABLE)
using int32x4_t = __attribute__((__vector_size__(4 * sizeof(int)))) int;
int32x4_t * acc = (int32x4_t *) D.x;
#if defined(CDNA4) || defined(CDNA3)
#if defined(CDNA3)
acc[0] = __builtin_amdgcn_mfma_i32_16x16x32_i8(((int64_t *) A.x)[0],
((int64_t *) B.x)[0],
acc[0],
0, 0, 0);
#elif defined(CDNA2) || defined(CDNA1)
#elif defined(CDNA2) || defined(CDNA)
acc[0] = __builtin_amdgcn_mfma_i32_16x16x16i8(A.x[0],
B.x[0],
acc[0],
@@ -1231,7 +1230,7 @@ namespace ggml_cuda_mma {
B.x[1],
acc[0],
0, 0, 0);
#endif // defined(CDNA4) || defined(CDNA3)
#endif // defined(CDNA3)
#elif defined(AMD_WMMA_AVAILABLE)
@@ -1296,12 +1295,12 @@ namespace ggml_cuda_mma {
#if defined(AMD_MFMA_AVAILABLE)
using int32x16_t = __attribute__((__vector_size__(16 * sizeof(int)))) int;
int32x16_t * acc = (int32x16_t *) D.x;
#if defined(CDNA4) || defined(CDNA3)
#if defined(CDNA3)
acc[0] = __builtin_amdgcn_mfma_i32_32x32x16_i8(((int64_t *) A.x)[0],
((int64_t *) B.x)[0],
acc[0],
0, 0, 0);
#elif defined(CDNA2) || defined(CDNA1)
#elif defined(CDNA2) || defined(CDNA)
acc[0] = __builtin_amdgcn_mfma_i32_32x32x8i8(A.x[0],
B.x[0],
acc[0],
@@ -1310,7 +1309,7 @@ namespace ggml_cuda_mma {
B.x[1],
acc[0],
0, 0, 0);
#endif // defined(CDNA4) || defined(CDNA3)
#endif // defined(CDNA3)
#else
GGML_UNUSED_VARS(D, A, B);


@@ -3645,7 +3645,7 @@ static __global__ void mul_mat_q(
tile_x_max_i, tile_y_max_j, 0, ncols_x/qk);
return;
}
#endif // (defined(GGML_USE_HIP) && !defined(CDNA4) && !defined(CDNA3)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
#endif // (defined(GGML_USE_HIP) && !defined(CDNA3)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
constexpr int ITER_K = get_iter_k(type);


@@ -25,14 +25,14 @@ static void top_k_cub(ggml_cuda_pool & pool,
auto indexes_in = cuda::make_counting_iterator(0);
size_t temp_storage_bytes = 0;
CUDA_CHECK(DeviceTopK::MaxPairs(nullptr, temp_storage_bytes, src, cuda::discard_iterator(), indexes_in, dst, ncols, k,
env));
DeviceTopK::MaxPairs(nullptr, temp_storage_bytes, src, cuda::discard_iterator(), indexes_in, dst, ncols, k,
env);
ggml_cuda_pool_alloc<uint8_t> temp_storage_alloc(pool, temp_storage_bytes);
void * d_temp_storage = temp_storage_alloc.get();
CUDA_CHECK(DeviceTopK::MaxPairs(d_temp_storage, temp_storage_bytes, src, cuda::discard_iterator(), indexes_in, dst,
ncols, k, env));
DeviceTopK::MaxPairs(d_temp_storage, temp_storage_bytes, src, cuda::discard_iterator(), indexes_in, dst,
ncols, k, env);
}
#elif defined(GGML_CUDA_USE_CUB) // CUB_TOP_K_AVAILABLE


@@ -6,10 +6,6 @@
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#ifdef GGML_USE_NCCL
#include <nccl.h>
#endif // GGML_USE_NCCL
#if CUDART_VERSION >= 11080
#include <cuda_fp8.h>
#define FP8_AVAILABLE


@@ -10,11 +10,6 @@
#include <rocwmma/rocwmma-version.hpp>
#endif // defined(GGML_HIP_ROCWMMA_FATTN)
#ifdef GGML_USE_NCCL
#include <rccl/rccl.h>
#endif // GGML_USE_NCCL
#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
#define CUBLAS_OP_N HIPBLAS_OP_N
@@ -33,7 +28,6 @@
#define CU_MEM_LOCATION_TYPE_DEVICE hipMemLocationTypeDevice
#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE hipMemAccessFlagsProtReadWrite
#define CU_CHECK(fn) {hipError_t err = fn; if(err != hipSuccess) { GGML_ABORT("HipVMM Failure: %s\n", hipGetErrorString(err)); }}
#define NCCL_CHECK(fn) {ncclResult_t err = fn; if(err != ncclSuccess) { GGML_ABORT("RCCL failure: RCCL returned %i\n", err); }}
#define __shfl_sync(mask, var, laneMask, width) __shfl(var, laneMask, width)
#define __shfl_up_sync(mask, var, laneMask, width) __shfl_up(var, laneMask, width)
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
@@ -189,10 +183,6 @@
#define GCN
#endif // defined(GCN5) || defined(GCN4)
#if defined(__gfx950__)
#define CDNA4
#endif // defined(__gfx950__)
#if defined(__gfx942__)
#define CDNA3
#endif // defined(__gfx942__)
@@ -205,9 +195,9 @@
#define CDNA1
#endif // defined(__gfx908__)
#if defined(CDNA4) || defined(CDNA3) || defined(CDNA2) || defined(CDNA1)
#if defined(CDNA3) || defined(CDNA2) || defined(CDNA1)
#define CDNA // For the entire family
#endif // defined(CDNA4) || defined(CDNA3) || defined(CDNA2) || defined(CDNA1)
#endif // defined(CDNA3) || defined(CDNA2) || defined(CDNA1)
#if defined(__GFX12__)
#define RDNA4


@@ -1,56 +0,0 @@
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
// This is a "staging" header for new ggml API
// It is not publicly available and it should not be used by 3rd party projects
//
// When the API matures enough, it will be moved to the official public API
//
// Meta backend
//
#define GGML_BACKEND_META_MAX_DEVICES 16
enum ggml_backend_meta_split_axis {
// tensor split by tensor dimensions:
GGML_BACKEND_SPLIT_AXIS_0 = 0,
GGML_BACKEND_SPLIT_AXIS_1 = 1,
GGML_BACKEND_SPLIT_AXIS_2 = 2,
GGML_BACKEND_SPLIT_AXIS_3 = 3,
GGML_BACKEND_SPLIT_AXIS_MIRRORED = 10, // all values on all backends
GGML_BACKEND_SPLIT_AXIS_PARTIAL = 11, // each backend has a partial sum
// for internal bookkeeping only:
GGML_BACKEND_SPLIT_AXIS_NONE = 98,
GGML_BACKEND_SPLIT_AXIS_UNKNOWN = 99,
};
GGML_API const char * ggml_backend_meta_split_axis_name(enum ggml_backend_meta_split_axis split_axis);
struct ggml_backend_meta_split_state {
enum ggml_backend_meta_split_axis axis;
// for tensors with axis >= 0 && axis < GGML_MAX_DIMS:
// - each device has a slice of the tensor along the split axis
// - most tensors have n_segments == 1 and a contiguous slice of the tensor data
// - some tensors have an inhomogeneous data layout along the split axis,
// those tensors are divided into segments which are each individually split across devices
// - ne has one entry per segment and device that add up to ggml_tensor::ne for that axis,
// the outer/inner loops are over segments/devices like [seg0_dev0, seg0_dev1, seg1_dev0, seg1_dev1],
// - for example, a transformer may have a fused QKV matrix rather than 3 matrices, those would be 3 separate segments
// that each need to be split individually across devices so that each device gets a slice of Q, K, and V
int64_t ne[16*GGML_BACKEND_META_MAX_DEVICES];
uint32_t n_segments;
};
// function to assign split states for statically allocated tensors; split states for compute tensors will be assigned to be compatible:
typedef struct ggml_backend_meta_split_state(*ggml_backend_meta_get_split_state_t)(const struct ggml_tensor * tensor, void * userdata);
// create a new meta device from "simple" devices, meta buffer type/buffer/backend is then derived from this:
// TODO: this looks a bit strange - a backend API creates a device. I think we should try
// to express this as a backend registry functionality instead
GGML_API ggml_backend_dev_t ggml_backend_meta_device(
ggml_backend_dev_t * devs, size_t n_devs, ggml_backend_meta_get_split_state_t get_split_state, void * get_split_state_ud);
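To make the segment layout concrete: a fused QKV weight split column-wise across two devices would use three segments, with the per-segment, per-device ne entries interleaved exactly as the comment above describes. An illustrative fill, assuming the staging header above and entirely hypothetical sizes:

```cpp
// Hypothetical: split a fused QKV matrix (3 segments) across 2 devices along
// axis 1; ne is laid out [seg0_dev0, seg0_dev1, seg1_dev0, seg1_dev1, ...].
static struct ggml_backend_meta_split_state make_qkv_split_state(void) {
    struct ggml_backend_meta_split_state st = {};
    st.axis       = GGML_BACKEND_SPLIT_AXIS_1;
    st.n_segments = 3;                            // Q, K and V slices
    const int     n_devs    = 2;
    const int64_t seg_ne[3] = {4096, 1024, 1024}; // assumed Q/K/V widths
    for (uint32_t s = 0; s < st.n_segments; ++s) {
        st.ne[s*n_devs + 0] = seg_ne[s]/2;             // device 0 slice
        st.ne[s*n_devs + 1] = seg_ne[s] - seg_ne[s]/2; // device 1 slice
    }
    return st;
}
```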


@@ -1491,8 +1491,6 @@ static ggml_backend_buffer_i ggml_backend_hexagon_buffer_interface = {
/* .memset_tensor = */ NULL,
/* .set_tensor = */ ggml_backend_hexagon_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_hexagon_buffer_get_tensor,
/* .set_tensor_2d = */ NULL,
/* .get_tensor_2d = */ NULL,
/* .cpy_tensor = */ ggml_backend_hexagon_buffer_cpy_tensor,
/* .clear = */ ggml_backend_hexagon_buffer_clear,
/* .reset = */ NULL,
@@ -3004,8 +3002,6 @@ static struct ggml_backend_i hexagon_backend_i = {
/* .free = */ ggml_backend_hexagon_free,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .get_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .synchronize = */ ggml_backend_hexagon_synchronize,
/* .graph_plan_create = */ NULL,


@@ -47,10 +47,6 @@ find_package(hip REQUIRED)
find_package(hipblas REQUIRED)
find_package(rocblas REQUIRED)
if (GGML_HIP_RCCL)
find_package(rccl REQUIRED)
endif()
if (${hip_VERSION} VERSION_LESS 6.1)
message(FATAL_ERROR "At least ROCM/HIP V6.1 is required")
endif()
@@ -122,10 +118,6 @@ if (NOT GGML_HIP_MMQ_MFMA)
add_compile_definitions(GGML_HIP_NO_MMQ_MFMA)
endif()
if (GGML_HIP_RCCL)
add_compile_definitions(GGML_USE_NCCL) # RCCL has the same interface as NCCL.
endif()
if (GGML_HIP_EXPORT_METRICS)
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Rpass-analysis=kernel-resource-usage --save-temps")
endif()
@@ -150,8 +142,4 @@ if (GGML_STATIC)
message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
endif()
if (GGML_HIP_RCCL)
target_link_libraries(ggml-hip PRIVATE ggml-base roc::rccl)
endif()
target_link_libraries(ggml-hip PRIVATE ggml-base hip::host roc::rocblas roc::hipblas)


@@ -736,11 +736,6 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv(ggml_meta
suffix = ne00 % 4 == 0 ? "_4" : "";
}
} break;
case GGML_TYPE_Q1_0:
{
nsg = N_SG_Q1_0;
nr0 = N_R0_Q1_0;
} break;
case GGML_TYPE_Q4_0:
{
nsg = N_SG_Q4_0;
@@ -953,11 +948,6 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv_id(ggml_m
smem = 32*sizeof(float)*nr0;
suffix = ne00 % 4 == 0 ? "_4" : "";
} break;
case GGML_TYPE_Q1_0:
{
nsg = N_SG_Q1_0;
nr0 = N_R0_Q1_0;
} break;
case GGML_TYPE_Q4_0:
{
nsg = N_SG_Q4_0;

View File

@@ -1184,7 +1184,6 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
case GGML_TYPE_F16:
case GGML_TYPE_BF16:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q1_0:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
@@ -1211,7 +1210,6 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
default:
return false;
}
case GGML_TYPE_Q1_0:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:


@@ -8,9 +8,6 @@
//
// TODO: for optimal performance, become function of the device and work size
#define N_R0_Q1_0 8
#define N_SG_Q1_0 2
#define N_R0_Q4_0 4
#define N_SG_Q4_0 2


@@ -2047,7 +2047,6 @@ int ggml_metal_op_mul_mat(ggml_metal_op_t ctx, int idx) {
op->src[0]->type == GGML_TYPE_F32 || // TODO: helper function
op->src[0]->type == GGML_TYPE_F16 ||
op->src[0]->type == GGML_TYPE_BF16 ||
op->src[0]->type == GGML_TYPE_Q1_0 ||
op->src[0]->type == GGML_TYPE_Q4_0 ||
op->src[0]->type == GGML_TYPE_Q4_1 ||
op->src[0]->type == GGML_TYPE_Q5_0 ||


@@ -90,8 +90,6 @@ static ggml_backend_buffer_i ggml_backend_metal_buffer_shared_i = {
/* .memset_tensor = */ ggml_backend_metal_buffer_shared_memset_tensor,
/* .set_tensor = */ ggml_backend_metal_buffer_shared_set_tensor,
/* .get_tensor = */ ggml_backend_metal_buffer_shared_get_tensor,
/* .set_tensor_2d = */ NULL,
/* .get_tensor_2d = */ NULL,
/* .cpy_tensor = */ ggml_backend_metal_buffer_shared_cpy_tensor,
/* .clear = */ ggml_backend_metal_buffer_shared_clear,
/* .reset = */ NULL,
@@ -160,17 +158,15 @@ static void ggml_backend_metal_buffer_private_clear(ggml_backend_buffer_t buffer
}
static ggml_backend_buffer_i ggml_backend_metal_buffer_private_i = {
/* .free_buffer = */ ggml_backend_metal_buffer_private_free_buffer,
/* .get_base = */ ggml_backend_metal_buffer_private_get_base,
/* .init_tensor = */ NULL,
/* .memset_tensor = */ ggml_backend_metal_buffer_private_memset_tensor,
/* .set_tensor = */ ggml_backend_metal_buffer_private_set_tensor,
/* .get_tensor = */ ggml_backend_metal_buffer_private_get_tensor,
/* .set_tensor_2d = */ NULL,
/* .get_tensor_2d = */ NULL,
/* .cpy_tensor = */ ggml_backend_metal_buffer_private_cpy_tensor,
/* .clear = */ ggml_backend_metal_buffer_private_clear,
/* .reset = */ NULL,
/* .free_buffer = */ ggml_backend_metal_buffer_private_free_buffer,
/* .get_base = */ ggml_backend_metal_buffer_private_get_base,
/* .init_tensor = */ NULL,
/* .memset_tensor = */ ggml_backend_metal_buffer_private_memset_tensor,
/* .set_tensor = */ ggml_backend_metal_buffer_private_set_tensor,
/* .get_tensor = */ ggml_backend_metal_buffer_private_get_tensor,
/* .cpy_tensor = */ ggml_backend_metal_buffer_private_cpy_tensor,
/* .clear = */ ggml_backend_metal_buffer_private_clear,
/* .reset = */ NULL,
};
static bool ggml_backend_buffer_is_metal(ggml_backend_buffer_t buffer) {
@@ -567,8 +563,6 @@ static ggml_backend_i ggml_backend_metal_i = {
/* .free = */ ggml_backend_metal_free,
/* .set_tensor_async = */ ggml_backend_metal_set_tensor_async,
/* .get_tensor_async = */ ggml_backend_metal_get_tensor_async,
/* .set_tensor_2d_async = */ NULL,
/* .get_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ ggml_backend_metal_cpy_tensor_async, // only needed for multi-GPU setups
/* .synchronize = */ ggml_backend_metal_synchronize,
/* .graph_plan_create = */ NULL,


@@ -118,56 +118,6 @@ void dequantize_bf16_t4(device const bfloat4 * src, short il, thread type4 & reg
}
#endif
template <typename type4x4>
void dequantize_q1_0(device const block_q1_0 * xb, short il, thread type4x4 & reg) {
device const uint8_t * qs = xb->qs;
const float d = xb->d;
const float neg_d = -d;
const int byte_offset = il * 2; // il*16 bits = il*2 bytes
const uint8_t b0 = qs[byte_offset];
const uint8_t b1 = qs[byte_offset + 1];
float4x4 reg_f;
reg_f[0][0] = select(neg_d, d, bool(b0 & 0x01));
reg_f[0][1] = select(neg_d, d, bool(b0 & 0x02));
reg_f[0][2] = select(neg_d, d, bool(b0 & 0x04));
reg_f[0][3] = select(neg_d, d, bool(b0 & 0x08));
reg_f[1][0] = select(neg_d, d, bool(b0 & 0x10));
reg_f[1][1] = select(neg_d, d, bool(b0 & 0x20));
reg_f[1][2] = select(neg_d, d, bool(b0 & 0x40));
reg_f[1][3] = select(neg_d, d, bool(b0 & 0x80));
reg_f[2][0] = select(neg_d, d, bool(b1 & 0x01));
reg_f[2][1] = select(neg_d, d, bool(b1 & 0x02));
reg_f[2][2] = select(neg_d, d, bool(b1 & 0x04));
reg_f[2][3] = select(neg_d, d, bool(b1 & 0x08));
reg_f[3][0] = select(neg_d, d, bool(b1 & 0x10));
reg_f[3][1] = select(neg_d, d, bool(b1 & 0x20));
reg_f[3][2] = select(neg_d, d, bool(b1 & 0x40));
reg_f[3][3] = select(neg_d, d, bool(b1 & 0x80));
reg = (type4x4) reg_f;
}
template <typename type4>
void dequantize_q1_0_t4(device const block_q1_0 * xb, short il, thread type4 & reg) {
const float d = xb->d;
const float neg_d = -d;
const int base = il * 4;
const uint8_t byte = xb->qs[base / 8];
const int s = base % 8;
float4 reg_f;
reg_f[0] = select(neg_d, d, bool((byte >> (s )) & 1));
reg_f[1] = select(neg_d, d, bool((byte >> (s + 1)) & 1));
reg_f[2] = select(neg_d, d, bool((byte >> (s + 2)) & 1));
reg_f[3] = select(neg_d, d, bool((byte >> (s + 3)) & 1));
reg = (type4) reg_f;
}
template <typename type4x4>
void dequantize_q4_0(device const block_q4_0 * xb, short il, thread type4x4 & reg) {
device const uint16_t * qs = ((device const uint16_t *)xb + 1);
@@ -202,23 +152,6 @@ void dequantize_q4_0_t4(device const block_q4_0 * xb, short il, thread type4 & r
}
}
void quantize_q1_0(device const float * src, device block_q1_0 & dst) {
float sum_abs = 0.0f;
for (int j = 0; j < QK1_0; j++) {
sum_abs += fabs(src[j]);
}
dst.d = sum_abs / QK1_0;
for (int j = 0; j < QK1_0 / 8; j++) {
dst.qs[j] = 0;
}
for (int j = 0; j < QK1_0; j++) {
if (src[j] >= 0.0f) {
dst.qs[j / 8] |= (1 << (j % 8));
}
}
}
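For readers unfamiliar with the (removed) Q1_0 format, a plain C++ restatement may help: each block stores one sign bit per value plus a single scale d equal to the block's mean absolute value, so every dequantized weight is exactly +d or -d. A reference version of the quantize/dequantize pair above; the block size QK1_0 = 128 is inferred from the kernel instantiations below, and d is kept as float for clarity:

```cpp
#include <cmath>
#include <cstdint>

constexpr int QK1_0 = 128;  // inferred block size: 128 weights -> 16 bytes of bits
struct block_q1_0_ref { float d; uint8_t qs[QK1_0/8]; };

// quantize: d = mean(|x|); bit j is set when src[j] >= 0 (mirrors quantize_q1_0)
static void quantize_q1_0_ref(const float * src, block_q1_0_ref & dst) {
    float sum_abs = 0.0f;
    for (int j = 0; j < QK1_0; j++) sum_abs += std::fabs(src[j]);
    dst.d = sum_abs / QK1_0;
    for (int j = 0; j < QK1_0/8; j++) dst.qs[j] = 0;
    for (int j = 0; j < QK1_0; j++)
        if (src[j] >= 0.0f) dst.qs[j/8] |= (uint8_t) (1u << (j % 8));
}

// dequantize one element: +d for a set bit, -d for a cleared one
static float dequantize_q1_0_ref(const block_q1_0_ref & b, int j) {
    return ((b.qs[j/8] >> (j % 8)) & 1) ? b.d : -b.d;
}
```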
void quantize_q4_0(device const float * src, device block_q4_0 & dst) {
#pragma METAL fp math_mode(safe)
float amax = 0.0f; // absolute max
@@ -3183,35 +3116,6 @@ kernel void kernel_group_norm_f32(
}
}
// Q1_0 dot product: dot = d * (2 * Σ(yl[i] where bit=1) - sumy)
inline float block_q_n_dot_y(device const block_q1_0 * qb_curr, float sumy, thread float * yl, int il) {
device const uint8_t * qs = qb_curr->qs + il / 8;
const uint8_t b0 = qs[0];
const uint8_t b1 = qs[1];
float acc = 0.0f;
acc += select(0.0f, yl[ 0], bool(b0 & 0x01));
acc += select(0.0f, yl[ 1], bool(b0 & 0x02));
acc += select(0.0f, yl[ 2], bool(b0 & 0x04));
acc += select(0.0f, yl[ 3], bool(b0 & 0x08));
acc += select(0.0f, yl[ 4], bool(b0 & 0x10));
acc += select(0.0f, yl[ 5], bool(b0 & 0x20));
acc += select(0.0f, yl[ 6], bool(b0 & 0x40));
acc += select(0.0f, yl[ 7], bool(b0 & 0x80));
acc += select(0.0f, yl[ 8], bool(b1 & 0x01));
acc += select(0.0f, yl[ 9], bool(b1 & 0x02));
acc += select(0.0f, yl[10], bool(b1 & 0x04));
acc += select(0.0f, yl[11], bool(b1 & 0x08));
acc += select(0.0f, yl[12], bool(b1 & 0x10));
acc += select(0.0f, yl[13], bool(b1 & 0x20));
acc += select(0.0f, yl[14], bool(b1 & 0x40));
acc += select(0.0f, yl[15], bool(b1 & 0x80));
return qb_curr->d * (2.0f * acc - sumy);
}
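The identity in the comment above holds because every Q1_0 weight is +d or -d: dot = d * (sum of y over set bits) - d * (sum of y over cleared bits), and since sumy is the total over both groups, this simplifies to d * (2 * (sum over set bits) - sumy). The kernel therefore only accumulates the y values whose bit is set and recovers the rest from sumy.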
// function to calculate the inner product between half a q4_0 block and 16 floats (yl), sumy is SUM(yl[i])
// il indicates where the q4 quants begin (0 or QK4_0/4)
// we assume that the yl's have been multiplied with the appropriate scale factor
@@ -3433,85 +3337,6 @@ void mul_vec_q_n_f32_impl(
}
}
template<int nr0, typename args_t>
void kernel_mul_mv_q1_0_f32_impl(
args_t args,
device const char * src0,
device const char * src1,
device char * dst,
threadgroup char * shmem,
uint3 tgpig,
ushort tiisg,
ushort sgitg) {
const short NSG = FC_mul_mv_nsg;
const int nb = args.ne00/QK1_0;
const int r0 = tgpig.x;
const int r1 = tgpig.y;
const int im = tgpig.z;
const int first_row = (r0 * NSG + sgitg) * nr0;
const uint i12 = im%args.ne12;
const uint i13 = im/args.ne12;
const uint64_t offset1 = r1*args.nb11 + (i12)*args.nb12 + (i13)*args.nb13;
device const float * y = (device const float *) (src1 + offset1);
device const block_q1_0 * ax[nr0];
for (int row = 0; row < nr0; ++row) {
const uint64_t offset0 = (first_row + row)*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
ax[row] = (device const block_q1_0 *) ((device char *) src0 + offset0);
}
float yl[16];
float sumf[nr0] = {0.f};
const short ix = (tiisg/8);
const short il = (tiisg%8)*16;
device const float * yb = y + ix*QK1_0 + il;
for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/8) {
float sumy = 0.f;
FOR_UNROLL (short i = 0; i < 16; i++) {
yl[i] = yb[i];
sumy += yb[i];
}
FOR_UNROLL (short row = 0; row < nr0; row++) {
sumf[row] += block_q_n_dot_y(ax[row] + ib, sumy, yl, il);
}
yb += QK1_0 * (N_SIMDWIDTH/8);
}
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
for (int row = 0; row < nr0; ++row) {
const float tot = simd_sum(sumf[row]);
if (tiisg == 0 && first_row + row < args.ne01) {
dst_f32[first_row + row] = tot;
}
}
}
[[host_name("kernel_mul_mv_q1_0_f32")]]
kernel void kernel_mul_mv_q1_0_f32(
constant ggml_metal_kargs_mul_mv & args,
device const char * src0,
device const char * src1,
device char * dst,
uint3 tgpig[[threadgroup_position_in_grid]],
ushort tiisg[[thread_index_in_simdgroup]],
ushort sgitg[[simdgroup_index_in_threadgroup]]) {
kernel_mul_mv_q1_0_f32_impl<N_R0_Q1_0, constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg);
}
kernel void kernel_mul_mv_q4_0_f32(
constant ggml_metal_kargs_mul_mv & args,
device const char * src0,
@@ -3904,11 +3729,6 @@ template [[host_name("kernel_mul_mv_ext_bf16_f32_r1_4")]] kernel mul_mv_ext_q4
template [[host_name("kernel_mul_mv_ext_bf16_f32_r1_5")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, bfloat4, 4, dequantize_bf16_t4>;
#endif
template [[host_name("kernel_mul_mv_ext_q1_0_f32_r1_2")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, block_q1_0, 128, dequantize_q1_0_t4>;
template [[host_name("kernel_mul_mv_ext_q1_0_f32_r1_3")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, block_q1_0, 128, dequantize_q1_0_t4>;
template [[host_name("kernel_mul_mv_ext_q1_0_f32_r1_4")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, block_q1_0, 128, dequantize_q1_0_t4>;
template [[host_name("kernel_mul_mv_ext_q1_0_f32_r1_5")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, block_q1_0, 128, dequantize_q1_0_t4>;
template [[host_name("kernel_mul_mv_ext_q4_0_f32_r1_2")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, block_q4_0, 32, dequantize_q4_0_t4>;
template [[host_name("kernel_mul_mv_ext_q4_0_f32_r1_3")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, block_q4_0, 32, dequantize_q4_0_t4>;
template [[host_name("kernel_mul_mv_ext_q4_0_f32_r1_4")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, block_q4_0, 32, dequantize_q4_0_t4>;
@@ -7313,7 +7133,6 @@ kernel void kernel_cpy_f32_q(
typedef decltype(kernel_cpy_f32_q<QK8_0, block_q8_0, quantize_q8_0>) cpy_f_q_t;
template [[host_name("kernel_cpy_f32_q8_0")]] kernel cpy_f_q_t kernel_cpy_f32_q<QK8_0, block_q8_0, quantize_q8_0>;
template [[host_name("kernel_cpy_f32_q1_0")]] kernel cpy_f_q_t kernel_cpy_f32_q<QK1_0, block_q1_0, quantize_q1_0>;
template [[host_name("kernel_cpy_f32_q4_0")]] kernel cpy_f_q_t kernel_cpy_f32_q<QK4_0, block_q4_0, quantize_q4_0>;
template [[host_name("kernel_cpy_f32_q4_1")]] kernel cpy_f_q_t kernel_cpy_f32_q<QK4_1, block_q4_1, quantize_q4_1>;
template [[host_name("kernel_cpy_f32_q5_0")]] kernel cpy_f_q_t kernel_cpy_f32_q<QK5_0, block_q5_0, quantize_q5_0>;
@@ -7354,14 +7173,12 @@ kernel void kernel_cpy_q_f32(
typedef decltype(kernel_cpy_q_f32<float4x4, block_q4_0, 2, dequantize_q4_0>) cpy_q_f_t;
template [[host_name("kernel_cpy_q1_0_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32<float4x4, block_q1_0, 8, dequantize_q1_0>;
template [[host_name("kernel_cpy_q4_0_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32<float4x4, block_q4_0, 2, dequantize_q4_0>;
template [[host_name("kernel_cpy_q4_1_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32<float4x4, block_q4_1, 2, dequantize_q4_1>;
template [[host_name("kernel_cpy_q5_0_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32<float4x4, block_q5_0, 2, dequantize_q5_0>;
template [[host_name("kernel_cpy_q5_1_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32<float4x4, block_q5_1, 2, dequantize_q5_1>;
template [[host_name("kernel_cpy_q8_0_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32<float4x4, block_q8_0, 2, dequantize_q8_0>;
template [[host_name("kernel_cpy_q1_0_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<half4x4, block_q1_0, 8, dequantize_q1_0>;
template [[host_name("kernel_cpy_q4_0_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<half4x4, block_q4_0, 2, dequantize_q4_0>;
template [[host_name("kernel_cpy_q4_1_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<half4x4, block_q4_1, 2, dequantize_q4_1>;
template [[host_name("kernel_cpy_q5_0_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<half4x4, block_q5_0, 2, dequantize_q5_0>;
@@ -9959,7 +9776,6 @@ template [[host_name("kernel_get_rows_bf16")]] kernel get_rows_f_t kernel_get_ro
typedef decltype(kernel_get_rows_q<block_q4_0, 2, dequantize_q4_0>) get_rows_q_t;
template [[host_name("kernel_get_rows_q1_0")]] kernel get_rows_q_t kernel_get_rows_q<block_q1_0, 8, dequantize_q1_0>;
template [[host_name("kernel_get_rows_q4_0")]] kernel get_rows_q_t kernel_get_rows_q<block_q4_0, 2, dequantize_q4_0>;
template [[host_name("kernel_get_rows_q4_1")]] kernel get_rows_q_t kernel_get_rows_q<block_q4_1, 2, dequantize_q4_1>;
template [[host_name("kernel_get_rows_q5_0")]] kernel get_rows_q_t kernel_get_rows_q<block_q5_0, 2, dequantize_q5_0>;
@@ -10022,7 +9838,6 @@ template [[host_name("kernel_mul_mm_f16_f32")]] kernel mul_mm_t kernel_mul_m
#if defined(GGML_METAL_HAS_BF16)
template [[host_name("kernel_mul_mm_bf16_f32")]] kernel mul_mm_t kernel_mul_mm<bfloat, bfloat4x4, simdgroup_bfloat8x8, bfloat, bfloat2x4, simdgroup_bfloat8x8, bfloat4x4, 1, dequantize_bf16, bfloat, bfloat4x4, float, float2x4>;
#endif
template [[host_name("kernel_mul_mm_q1_0_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q1_0, 8, dequantize_q1_0, float, float4x4, float, float2x4>;
template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q4_0, 2, dequantize_q4_0, float, float4x4, float, float2x4>;
template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q4_1, 2, dequantize_q4_1, float, float4x4, float, float2x4>;
template [[host_name("kernel_mul_mm_q5_0_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q5_0, 2, dequantize_q5_0, float, float4x4, float, float2x4>;
@@ -10046,7 +9861,6 @@ template [[host_name("kernel_mul_mm_iq4_xs_f32")]] kernel mul_mm_t kernel_mul_m
template [[host_name("kernel_mul_mm_f32_f16")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, float4x4, 1, dequantize_f32, float, float4x4, half, half2x4>;
template [[host_name("kernel_mul_mm_f16_f16")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, half4x4, 1, dequantize_f16, half, half4x4, half, half2x4>;
template [[host_name("kernel_mul_mm_q1_0_f16")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q1_0, 8, dequantize_q1_0, float, float4x4, half, half2x4>;
template [[host_name("kernel_mul_mm_q4_0_f16")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q4_0, 2, dequantize_q4_0, float, float4x4, half, half2x4>;
template [[host_name("kernel_mul_mm_q4_1_f16")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q4_1, 2, dequantize_q4_1, float, float4x4, half, half2x4>;
template [[host_name("kernel_mul_mm_q5_0_f16")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q5_0, 2, dequantize_q5_0, float, float4x4, half, half2x4>;
@@ -10079,7 +9893,6 @@ template [[host_name("kernel_mul_mm_id_f16_f32")]] kernel mul_mm_id kernel_m
#if defined(GGML_METAL_HAS_BF16)
template [[host_name("kernel_mul_mm_id_bf16_f32")]] kernel mul_mm_id kernel_mul_mm_id<bfloat, bfloat4x4, simdgroup_bfloat8x8, bfloat, bfloat2x4, simdgroup_bfloat8x8, bfloat4x4, 1, dequantize_bf16, bfloat, bfloat4x4, float, float2x4>;
#endif
template [[host_name("kernel_mul_mm_id_q1_0_f32")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q1_0, 8, dequantize_q1_0, float, float4x4, float, float2x4>;
template [[host_name("kernel_mul_mm_id_q4_0_f32")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q4_0, 2, dequantize_q4_0, float, float4x4, float, float2x4>;
template [[host_name("kernel_mul_mm_id_q4_1_f32")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q4_1, 2, dequantize_q4_1, float, float4x4, float, float2x4>;
template [[host_name("kernel_mul_mm_id_q5_0_f32")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q5_0, 2, dequantize_q5_0, float, float4x4, float, float2x4>;
@@ -10103,7 +9916,6 @@ template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]] kernel mul_mm_id kernel_m
template [[host_name("kernel_mul_mm_id_f32_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, float4x4, 1, dequantize_f32, float, float4x4, half, half2x4>;
template [[host_name("kernel_mul_mm_id_f16_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, half4x4, 1, dequantize_f16, half, half4x4, half, half2x4>;
template [[host_name("kernel_mul_mm_id_q1_0_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q1_0, 8, dequantize_q1_0, float, float4x4, half, half2x4>;
template [[host_name("kernel_mul_mm_id_q4_0_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q4_0, 2, dequantize_q4_0, float, float4x4, half, half2x4>;
template [[host_name("kernel_mul_mm_id_q4_1_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q4_1, 2, dequantize_q4_1, float, float4x4, half, half2x4>;
template [[host_name("kernel_mul_mm_id_q5_0_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q5_0, 2, dequantize_q5_0, float, float4x4, half, half2x4>;
@@ -10258,7 +10070,6 @@ template [[host_name("kernel_mul_mv_id_bf16_f32_4")]] kernel kernel_mul_mv_id_4
template [[host_name("kernel_mul_mv_id_q8_0_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_q8_0_f32_impl<N_R0_Q8_0>>>;
template [[host_name("kernel_mul_mv_id_q1_0_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_q1_0_f32_impl<N_R0_Q1_0>>>;
template [[host_name("kernel_mul_mv_id_q4_0_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<mul_vec_q_n_f32_impl<block_q4_0, N_R0_Q4_0>>>;
template [[host_name("kernel_mul_mv_id_q4_1_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<mul_vec_q_n_f32_impl<block_q4_1, N_R0_Q4_1>>>;
template [[host_name("kernel_mul_mv_id_q5_0_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<mul_vec_q_n_f32_impl<block_q5_0, N_R0_Q5_0>>>;

View File

@@ -4063,8 +4063,6 @@ static ggml_backend_i ggml_backend_opencl_i = {
/* .set_tensor_async = */ NULL, /* ggml_backend_opencl_set_tensor_async */
/* .get_tensor_async = */ NULL, /* ggml_backend_opencl_get_tensor_async */
/* .cpy_tensor_async = */ NULL, /* ggml_backend_opencl_cpy_tensor_async */
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .synchronize = */ ggml_backend_opencl_synchronize,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,
@@ -5780,8 +5778,6 @@ static ggml_backend_buffer_i ggml_backend_opencl_buffer_interface = {
/* .memset_tensor = */ NULL,
/* .set_tensor = */ ggml_backend_opencl_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_opencl_buffer_get_tensor,
/* .set_tensor_2d = */ NULL,
/* .get_tensor_2d = */ NULL,
/* .cpy_tensor = */ NULL,
/* .clear = */ ggml_backend_opencl_buffer_clear,
/* .reset = */ ggml_backend_opencl_buffer_reset,

View File

@@ -412,8 +412,6 @@ static const ggml_backend_buffer_i ggml_backend_openvino_buffer_interface = {
/* .memset_tensor = */ ggml_backend_openvino_buffer_memset_tensor,
/* .set_tensor = */ ggml_backend_openvino_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_openvino_buffer_get_tensor,
/* .set_tensor_2d = */ NULL,
/* .get_tensor_2d = */ NULL,
/* .cpy_tensor = */ ggml_backend_openvino_buffer_cpy_tensor,
/* .clear = */ ggml_backend_openvino_buffer_clear,
/* .reset = */ NULL,
@@ -619,8 +617,6 @@ static const ggml_backend_i ggml_backend_openvino_interface = {
/* .free = */ ggml_backend_openvino_free,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .get_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ NULL,

View File

@@ -589,7 +589,6 @@ void ggml_opt_free(ggml_opt_context_t opt_ctx) {
ggml_backend_buffer_free(opt_ctx->buf_cpu);
ggml_free(opt_ctx->ctx_static);
ggml_free(opt_ctx->ctx_cpu);
ggml_free(opt_ctx->ctx_copy);
delete opt_ctx;
}

View File

@@ -706,8 +706,6 @@ static ggml_backend_buffer_i ggml_backend_rpc_buffer_interface = {
/* .memset_tensor = */ NULL,
/* .set_tensor = */ ggml_backend_rpc_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_rpc_buffer_get_tensor,
/* .set_tensor_2d = */ NULL,
/* .get_tensor_2d = */ NULL,
/* .cpy_tensor = */ ggml_backend_rpc_buffer_cpy_tensor,
/* .clear = */ ggml_backend_rpc_buffer_clear,
/* .reset = */ NULL,
@@ -896,8 +894,6 @@ static ggml_backend_i ggml_backend_rpc_interface = {
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .synchronize = */ ggml_backend_rpc_synchronize,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,

View File

@@ -44,10 +44,6 @@ void ggml_sycl_flash_attn_ext_tile(ggml_backend_sycl_context & ctx, ggml_tensor
GGML_ASSERT(V->ne[0] == K->ne[0]);
ggml_sycl_flash_attn_ext_tile_case<256, 256>(ctx, dst);
} break;
case 512: {
GGML_ASSERT(V->ne[0] == K->ne[0]);
ggml_sycl_flash_attn_ext_tile_case<512, 512>(ctx, dst);
} break;
case 576: {
GGML_ASSERT(V->ne[0] == 512);
ggml_sycl_flash_attn_ext_tile_case<576, 512>(ctx, dst);

View File

@@ -67,12 +67,6 @@ static constexpr uint32_t ggml_sycl_fattn_tile_get_config_fp16(const int DKQ, co
GGML_SYCL_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(512, 512, 2, 64, 2, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(512, 512, 4, 128, 2, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(512, 512, 8, 256, 2, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(512, 512, 16, 256, 2, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(512, 512, 32, 256, 2, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(576, 512, 4, 128, 2, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(576, 512, 8, 256, 2, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2, 64, 64)
@@ -130,12 +124,6 @@ static constexpr uint32_t ggml_sycl_fattn_tile_get_config_fp32(const int DKQ, co
GGML_SYCL_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2, 32, 128)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2, 32, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(512, 512, 2, 128, 2, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(512, 512, 4, 128, 2, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(512, 512, 8, 256, 2, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(512, 512, 16, 256, 2, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(512, 512, 32, 256, 2, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(576, 512, 4, 128, 2, 32, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(576, 512, 8, 256, 2, 32, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2, 32, 64)
@@ -143,6 +131,134 @@ static constexpr uint32_t ggml_sycl_fattn_tile_get_config_fp32(const int DKQ, co
return 0;
}
static constexpr uint32_t ggml_sycl_fattn_tile_get_config_amd(const int DKQ, const int DV, const int ncols) {
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 40, 40, 2, 64, 2, 32, 40)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 40, 40, 4, 128, 2, 32, 40)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 40, 40, 8, 256, 2, 32, 40)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 40, 40, 16, 256, 2, 32, 40)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 40, 40, 32, 256, 2, 32, 40)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 40, 40, 64, 256, 2, 32, 40)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 64, 64, 2, 64, 3, 32, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 64, 64, 4, 128, 3, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 64, 64, 8, 128, 2, 32, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 64, 64, 16, 256, 2, 128, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 64, 64, 32, 256, 2, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 64, 64, 64, 256, 2, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 72, 72, 2, 64, 2, 32, 72)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 72, 72, 4, 128, 2, 32, 72)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 72, 72, 8, 256, 2, 32, 72)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 72, 72, 16, 256, 2, 32, 72)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 72, 72, 32, 256, 2, 32, 72)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 72, 72, 64, 256, 2, 32, 72)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 80, 80, 2, 64, 2, 32, 40)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 80, 80, 4, 128, 2, 32, 40)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 80, 80, 8, 256, 2, 32, 40)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 80, 80, 16, 256, 2, 32, 40)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 80, 80, 32, 256, 2, 32, 40)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 80, 80, 64, 256, 2, 32, 40)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 96, 96, 2, 64, 2, 32, 48)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 96, 96, 4, 128, 2, 32, 48)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 96, 96, 8, 256, 2, 32, 48)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 96, 96, 16, 256, 2, 32, 48)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 96, 96, 32, 256, 2, 32, 48)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 96, 96, 64, 256, 2, 32, 48)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(112, 112, 2, 64, 2, 32, 56)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(112, 112, 4, 128, 2, 32, 56)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(112, 112, 8, 256, 2, 32, 56)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(112, 112, 16, 256, 2, 32, 56)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(112, 112, 32, 256, 2, 32, 56)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(112, 112, 64, 256, 2, 32, 56)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(128, 128, 2, 256, 2, 128, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(128, 128, 4, 128, 2, 64, 128)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(128, 128, 8, 256, 2, 64, 128)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(128, 128, 16, 256, 2, 64, 128)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(128, 128, 32, 256, 2, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(128, 128, 64, 256, 2, 64, 32)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(256, 256, 2, 256, 2, 128, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(256, 256, 4, 256, 2, 64, 128)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(256, 256, 8, 256, 2, 64, 128)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2, 32, 128)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2, 32, 128)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(576, 512, 4, 128, 2, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(576, 512, 8, 256, 2, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(576, 512, 32, 512, 1, 128, 64)
return 0;
}
static constexpr uint32_t ggml_sycl_fattn_tile_get_config_amd_rdna(const int DKQ, const int DV, const int ncols) {
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 40, 40, 2, 64, 2, 32, 40)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 40, 40, 4, 128, 2, 32, 40)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 40, 40, 8, 256, 2, 32, 40)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 40, 40, 16, 256, 2, 32, 40)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 40, 40, 32, 256, 2, 32, 40)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 40, 40, 64, 256, 2, 32, 40)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 64, 64, 2, 64, 8, 32, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 64, 64, 4, 64, 8, 32, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 64, 64, 8, 128, 5, 128, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 64, 64, 16, 128, 5, 128, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 64, 64, 32, 128, 4, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 64, 64, 64, 128, 5, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 72, 72, 2, 64, 2, 32, 72)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 72, 72, 4, 128, 2, 32, 72)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 72, 72, 8, 256, 2, 32, 72)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 72, 72, 16, 256, 2, 32, 72)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 72, 72, 32, 256, 2, 32, 72)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 72, 72, 64, 256, 2, 32, 72)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 80, 80, 2, 64, 2, 32, 40)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 80, 80, 4, 128, 2, 32, 40)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 80, 80, 8, 256, 2, 32, 40)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 80, 80, 16, 256, 2, 32, 40)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 80, 80, 32, 256, 2, 32, 40)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 80, 80, 64, 256, 2, 32, 40)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 96, 96, 2, 64, 2, 32, 48)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 96, 96, 4, 128, 2, 32, 48)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 96, 96, 8, 256, 2, 32, 48)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 96, 96, 16, 256, 2, 32, 48)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 96, 96, 32, 256, 2, 32, 48)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 96, 96, 64, 256, 2, 32, 48)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(112, 112, 2, 64, 2, 32, 56)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(112, 112, 4, 128, 2, 32, 56)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(112, 112, 8, 256, 2, 32, 56)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(112, 112, 16, 256, 2, 32, 56)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(112, 112, 32, 256, 2, 32, 56)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(112, 112, 64, 256, 2, 32, 56)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(128, 128, 2, 64, 8, 32, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(128, 128, 4, 128, 8, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(128, 128, 8, 128, 8, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(128, 128, 16, 256, 3, 128, 128)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(128, 128, 32, 256, 3, 128, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(128, 128, 64, 256, 3, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(256, 256, 2, 64, 8, 32, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(256, 256, 4, 128, 6, 32, 256)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(256, 256, 8, 128, 6, 32, 256)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 5, 32, 256)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 3, 64, 128)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(576, 512, 4, 128, 2, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(576, 512, 8, 256, 2, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 4, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(576, 512, 32, 256, 2, 128, 64)
return 0;
}
static constexpr uint32_t ggml_sycl_fattn_tile_get_config(const int DKQ, const int DV, const int ncols, const int cc) {
if (fast_fp16_available(cc))
return ggml_sycl_fattn_tile_get_config_fp16(DKQ, DV, ncols);
@@ -1177,16 +1293,6 @@ static void launch_fattn_tile_switch_ncols2(ggml_backend_sycl_context & ctx, ggm
launch_fattn_tile_switch_ncols1<DKQ, DV, 4, use_logit_softcap>(ctx, dst);
return;
}
// ncols2=2 and ncols2=1 fallbacks only for cases where ncols=2 config exists (DKQ == DV).
// For DKQ == 576, DV == 512 only GQA-optimized variants are implemented.
if constexpr (DKQ == DV) {
if (use_gqa_opt && gqa_ratio % 2 == 0) {
launch_fattn_tile_switch_ncols1<DKQ, DV, 2, use_logit_softcap>(ctx, dst);
return;
}
launch_fattn_tile_switch_ncols1<DKQ, DV, 1, use_logit_softcap>(ctx, dst);
return;
}
}
if constexpr (DV <= 256) {
@@ -1241,6 +1347,5 @@ extern DECL_FATTN_TILE_CASE( 96, 96);
extern DECL_FATTN_TILE_CASE(112, 112);
extern DECL_FATTN_TILE_CASE(128, 128);
extern DECL_FATTN_TILE_CASE(256, 256);
extern DECL_FATTN_TILE_CASE(512, 512);
extern DECL_FATTN_TILE_CASE(576, 512);

View File

@@ -664,11 +664,4 @@ EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q5_0)
EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q5_1)
EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q8_0)
EXTERN_DECL_FATTN_VEC_CASES(512, GGML_TYPE_F16)
EXTERN_DECL_FATTN_VEC_CASES(512, GGML_TYPE_Q4_0)
EXTERN_DECL_FATTN_VEC_CASES(512, GGML_TYPE_Q4_1)
EXTERN_DECL_FATTN_VEC_CASES(512, GGML_TYPE_Q5_0)
EXTERN_DECL_FATTN_VEC_CASES(512, GGML_TYPE_Q5_1)
EXTERN_DECL_FATTN_VEC_CASES(512, GGML_TYPE_Q8_0)
#endif // GGML_SYCL_FATTN_VEC_HPP

View File

@@ -34,7 +34,6 @@
FATTN_VEC_CASE( 64, type_K, type_V) \
FATTN_VEC_CASE(128, type_K, type_V) \
FATTN_VEC_CASE(256, type_K, type_V) \
FATTN_VEC_CASE(512, type_K, type_V) \
static void ggml_sycl_flash_attn_ext_vec(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
ggml_tensor * Q = dst->src[0];
@@ -142,7 +141,6 @@ static best_fattn_kernel ggml_sycl_get_best_fattn_kernel(const int device, const
case 128:
case 112:
case 256:
case 512:
if (V->ne[0] != K->ne[0]) {
return BEST_FATTN_KERNEL_NONE;
}
@@ -187,7 +185,7 @@ static best_fattn_kernel ggml_sycl_get_best_fattn_kernel(const int device, const
}
// For small batch sizes the vector kernel may be preferable over the kernels optimized for large batch sizes:
const bool can_use_vector_kernel = Q->ne[0] <= 512 && Q->ne[0] % 64 == 0 && K->ne[1] % FATTN_KQ_STRIDE == 0;
const bool can_use_vector_kernel = Q->ne[0] <= 256 && Q->ne[0] % 64 == 0 && K->ne[1] % FATTN_KQ_STRIDE == 0;
// Todo: Use the XMX kernel if possible:

View File

@@ -411,22 +411,11 @@ ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
assert(tensor->view_src->buffer->buft == buffer->buft);
return GGML_STATUS_SUCCESS;
}
if (!g_ggml_sycl_disable_optimize) {
// set reorder extra buffer based on supported type
switch (tensor->type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q6_K:{
ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
tensor->extra = extra;
ctx->tensor_extras.push_back(extra);
break;
}
default:
break;
}
if ((tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q8_0 || tensor->type == GGML_TYPE_Q4_K || tensor->type == GGML_TYPE_Q6_K) &&
!g_ggml_sycl_disable_optimize) {
ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
tensor->extra = extra;
ctx->tensor_extras.push_back(extra); // kept so it can be released when the ctx is destroyed
}
if (ggml_is_quantized(tensor->type)) {
@@ -638,8 +627,6 @@ static const ggml_backend_buffer_i ggml_backend_sycl_buffer_interface = {
/* .memset_tensor = */ ggml_backend_sycl_buffer_memset_tensor,
/* .set_tensor = */ ggml_backend_sycl_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_sycl_buffer_get_tensor,
/* .set_tensor_2d = */ NULL,
/* .get_tensor_2d = */ NULL,
/* .cpy_tensor = */ ggml_backend_sycl_buffer_cpy_tensor,
/* .clear = */ ggml_backend_sycl_buffer_clear,
/* .reset = */ ggml_backend_sycl_buffer_reset,
@@ -1086,8 +1073,6 @@ static struct ggml_backend_buffer_i ggml_backend_sycl_split_buffer_interface = {
/* .memset_tensor = */ NULL,
/* .set_tensor = */ ggml_backend_sycl_split_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_sycl_split_buffer_get_tensor,
/* .set_tensor_2d = */ NULL,
/* .get_tensor_2d = */ NULL,
/* .cpy_tensor = */ NULL,
/* .clear = */ ggml_backend_sycl_split_buffer_clear,
/* .reset = */ NULL,
@@ -4557,8 +4542,6 @@ static ggml_backend_i ggml_backend_sycl_interface = {
/* .free = */ ggml_backend_sycl_free,
/* .set_tensor_async = */ ggml_backend_sycl_set_tensor_async,
/* .get_tensor_async = */ ggml_backend_sycl_get_tensor_async,
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ NULL, // ggml_backend_sycl_cpy_tensor_async,
// TODO: update for the new interface

View File

@@ -1,6 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-tile.hpp"
DECL_FATTN_TILE_CASE(512, 512);

View File

@@ -5,4 +5,3 @@
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16);
DECL_FATTN_VEC_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16);
DECL_FATTN_VEC_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16);
DECL_FATTN_VEC_CASE(512, GGML_TYPE_F16, GGML_TYPE_F16);

View File

@@ -5,4 +5,3 @@
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_0);
DECL_FATTN_VEC_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0);
DECL_FATTN_VEC_CASE(256, GGML_TYPE_F16, GGML_TYPE_Q4_0);
DECL_FATTN_VEC_CASE(512, GGML_TYPE_F16, GGML_TYPE_Q4_0);

View File

@@ -5,4 +5,3 @@
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_1);
DECL_FATTN_VEC_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_1);
DECL_FATTN_VEC_CASE(256, GGML_TYPE_F16, GGML_TYPE_Q4_1);
DECL_FATTN_VEC_CASE(512, GGML_TYPE_F16, GGML_TYPE_Q4_1);

View File

@@ -5,4 +5,3 @@
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_0);
DECL_FATTN_VEC_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_0);
DECL_FATTN_VEC_CASE(256, GGML_TYPE_F16, GGML_TYPE_Q5_0);
DECL_FATTN_VEC_CASE(512, GGML_TYPE_F16, GGML_TYPE_Q5_0);

View File

@@ -5,4 +5,3 @@
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_1);
DECL_FATTN_VEC_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_1);
DECL_FATTN_VEC_CASE(256, GGML_TYPE_F16, GGML_TYPE_Q5_1);
DECL_FATTN_VEC_CASE(512, GGML_TYPE_F16, GGML_TYPE_Q5_1);

View File

@@ -5,4 +5,3 @@
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q8_0);
DECL_FATTN_VEC_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q8_0);
DECL_FATTN_VEC_CASE(256, GGML_TYPE_F16, GGML_TYPE_Q8_0);
DECL_FATTN_VEC_CASE(512, GGML_TYPE_F16, GGML_TYPE_Q8_0);

View File

@@ -5,4 +5,3 @@
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q4_0, GGML_TYPE_F16);
DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16);
DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q4_0, GGML_TYPE_F16);
DECL_FATTN_VEC_CASE(512, GGML_TYPE_Q4_0, GGML_TYPE_F16);

View File

@@ -5,4 +5,3 @@
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0);
DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0);
DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0);
DECL_FATTN_VEC_CASE(512, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0);

View File

@@ -5,4 +5,3 @@
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1);
DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1);
DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1);
DECL_FATTN_VEC_CASE(512, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1);

View File

@@ -5,4 +5,3 @@
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0);
DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0);
DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0);
DECL_FATTN_VEC_CASE(512, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0);

View File

@@ -5,4 +5,3 @@
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1);
DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1);
DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1);
DECL_FATTN_VEC_CASE(512, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1);

View File

@@ -5,4 +5,3 @@
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0);
DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0);
DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0);
DECL_FATTN_VEC_CASE(512, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0);

View File

@@ -5,4 +5,3 @@
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q4_1, GGML_TYPE_F16);
DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16);
DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q4_1, GGML_TYPE_F16);
DECL_FATTN_VEC_CASE(512, GGML_TYPE_Q4_1, GGML_TYPE_F16);

View File

@@ -5,4 +5,3 @@
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0);
DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0);
DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0);
DECL_FATTN_VEC_CASE(512, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0);

View File

@@ -5,4 +5,3 @@
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1);
DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1);
DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1);
DECL_FATTN_VEC_CASE(512, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1);

View File

@@ -5,4 +5,3 @@
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0);
DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0);
DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0);
DECL_FATTN_VEC_CASE(512, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0);

View File

@@ -5,4 +5,3 @@
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1);
DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1);
DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1);
DECL_FATTN_VEC_CASE(512, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1);

View File

@@ -5,4 +5,3 @@
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0);
DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0);
DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0);
DECL_FATTN_VEC_CASE(512, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0);

View File

@@ -5,4 +5,3 @@
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q5_0, GGML_TYPE_F16);
DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16);
DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q5_0, GGML_TYPE_F16);
DECL_FATTN_VEC_CASE(512, GGML_TYPE_Q5_0, GGML_TYPE_F16);

View File

@@ -5,4 +5,3 @@
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0);
DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0);
DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0);
DECL_FATTN_VEC_CASE(512, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0);

View File

@@ -5,4 +5,3 @@
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1);
DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1);
DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1);
DECL_FATTN_VEC_CASE(512, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1);

View File

@@ -5,4 +5,3 @@
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0);
DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0);
DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0);
DECL_FATTN_VEC_CASE(512, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0);

View File

@@ -5,4 +5,3 @@
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1);
DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1);
DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1);
DECL_FATTN_VEC_CASE(512, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1);

View File

@@ -5,4 +5,3 @@
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0);
DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0);
DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0);
DECL_FATTN_VEC_CASE(512, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0);

View File

@@ -5,4 +5,3 @@
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q5_1, GGML_TYPE_F16);
DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16);
DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q5_1, GGML_TYPE_F16);
DECL_FATTN_VEC_CASE(512, GGML_TYPE_Q5_1, GGML_TYPE_F16);

View File

@@ -5,4 +5,3 @@
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0);
DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0);
DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0);
DECL_FATTN_VEC_CASE(512, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0);

View File

@@ -5,4 +5,3 @@
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1);
DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1);
DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1);
DECL_FATTN_VEC_CASE(512, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1);

View File

@@ -5,4 +5,3 @@
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0);
DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0);
DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0);
DECL_FATTN_VEC_CASE(512, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0);

View File

@@ -5,4 +5,3 @@
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1);
DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1);
DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1);
DECL_FATTN_VEC_CASE(512, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1);

View File

@@ -5,4 +5,3 @@
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0);
DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0);
DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0);
DECL_FATTN_VEC_CASE(512, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0);

View File

@@ -5,4 +5,3 @@
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q8_0, GGML_TYPE_F16);
DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16);
DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q8_0, GGML_TYPE_F16);
DECL_FATTN_VEC_CASE(512, GGML_TYPE_Q8_0, GGML_TYPE_F16);

View File

@@ -5,4 +5,3 @@
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0);
DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0);
DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0);
DECL_FATTN_VEC_CASE(512, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0);

View File

@@ -5,4 +5,3 @@
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1);
DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1);
DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1);
DECL_FATTN_VEC_CASE(512, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1);

View File

@@ -5,4 +5,3 @@
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0);
DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0);
DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0);
DECL_FATTN_VEC_CASE(512, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0);

View File

@@ -5,4 +5,3 @@
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1);
DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1);
DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1);
DECL_FATTN_VEC_CASE(512, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1);

View File

@@ -5,4 +5,3 @@
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0);
DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0);
DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0);
DECL_FATTN_VEC_CASE(512, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0);

View File

@@ -101,8 +101,6 @@ const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface = {
/* .memset_tensor = */ NULL,
/* .set_tensor = */ ggml_backend_remoting_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_remoting_buffer_get_tensor,
/* .set_tensor_2d = */ NULL,
/* .get_tensor_2d = */ NULL,
/* .cpy_tensor = */ ggml_backend_remoting_buffer_cpy_tensor,
/* .clear = */ ggml_backend_remoting_buffer_clear,
/* .reset = */ NULL,
@@ -115,8 +113,6 @@ const ggml_backend_buffer_i ggml_backend_remoting_buffer_from_ptr_interface = {
/* .memset_tensor = */ NULL,
/* .set_tensor = */ ggml_backend_remoting_buffer_set_tensor_from_ptr,
/* .get_tensor = */ ggml_backend_remoting_buffer_get_tensor_from_ptr,
/* .set_tensor_2d = */ NULL,
/* .get_tensor_2d = */ NULL,
/* .cpy_tensor = */ ggml_backend_remoting_buffer_cpy_tensor,
/* .clear = */ ggml_backend_remoting_buffer_clear,
/* .reset = */ NULL,

View File

@@ -34,8 +34,6 @@ static ggml_backend_i ggml_backend_remoting_interface = {
/* .free = */ ggml_backend_remoting_free,
/* .set_tensor_async = */ NULL, // ggml_backend_remoting_set_tensor_async,
/* .get_tensor_async = */ NULL, // ggml_backend_remoting_get_tensor_async,
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ NULL, // ggml_backend_remoting_cpy_tensor_async,
/* .synchronize = */ NULL, // ggml_backend_remoting_synchronize,
/* .graph_plan_create = */ NULL,

View File

@@ -120,6 +120,10 @@ if (Vulkan_FOUND)
add_compile_definitions(GGML_VULKAN_RUN_TESTS)
endif()
if (GGML_VULKAN_COPY_TESTS)
add_compile_definitions(GGML_VULKAN_COPY_TESTS)
endif()
# Set up toolchain for host compilation whether cross-compiling or not
if (CMAKE_CROSSCOMPILING)
if (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN)

View File

@@ -1,9 +1,12 @@
#include "ggml-vulkan.h"
#include <vulkan/vulkan_core.h>
#if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_CHECK_RESULTS)
#if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_CHECK_RESULTS) || defined(GGML_VULKAN_COPY_TESTS)
#include <chrono>
#include "ggml-cpu.h"
#endif
#if defined(GGML_VULKAN_COPY_TESTS) && !defined(_WIN32)
#include <unistd.h>
#endif
// See https://github.com/KhronosGroup/Vulkan-Hpp?tab=readme-ov-file#extensions--per-device-function-pointers-
#define VULKAN_HPP_DISPATCH_LOADER_DYNAMIC 1
@@ -591,6 +594,7 @@ struct vk_device_struct {
uint64_t suballocation_block_size;
uint64_t min_imported_host_pointer_alignment;
bool external_memory_host {};
bool external_semaphore_fd {};
bool fp16;
bool bf16;
bool pipeline_robustness;
@@ -1659,6 +1663,7 @@ struct ggml_vk_garbage_collector {
static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx, vk_context subctx);
static void ggml_vk_load_shaders(vk_device& device);
static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx);
static vk_buffer ggml_vk_buffer_from_host_ptr(vk_device & device, void * ptr, size_t size);
static bool vk_memory_logger_enabled = false;
@@ -4882,6 +4887,8 @@ static vk_device ggml_vk_get_device(size_t idx) {
device->memory_priority = true;
} else if (strcmp("VK_EXT_external_memory_host", properties.extensionName) == 0) {
device->external_memory_host = true;
} else if (strcmp("VK_KHR_external_semaphore_fd", properties.extensionName) == 0) {
device->external_semaphore_fd = true;
#if defined(VK_EXT_shader_64bit_indexing)
} else if (strcmp("VK_EXT_shader_64bit_indexing", properties.extensionName) == 0) {
device->shader_64b_indexing = true;
@@ -5181,6 +5188,10 @@ static vk_device ggml_vk_get_device(size_t idx) {
device_extensions.push_back("VK_EXT_external_memory_host");
}
if (device->external_semaphore_fd) {
device_extensions.push_back("VK_KHR_external_semaphore_fd");
}
#if defined(VK_EXT_shader_64bit_indexing)
VkPhysicalDeviceShader64BitIndexingFeaturesEXT shader_64bit_indexing_features {};
shader_64bit_indexing_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_64_BIT_INDEXING_FEATURES_EXT;
@@ -12630,7 +12641,654 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
}
#endif
#ifdef GGML_VULKAN_COPY_TESTS
// Cross-device copy benchmark
// Tests different approaches to copying data between two Vulkan devices.
// Build with -DGGML_VULKAN_COPY_TESTS and run any llama.cpp command with >= 2 Vulkan devices.
// Helper: allocate shared staging buffer importable by both devices
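// Requires VK_EXT_external_memory_host on both devices: the same host allocation is
// imported into each device, removing the intermediate memcpy of the baseline path.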
struct vk_shared_staging {
void * host_ptr = nullptr;
vk_buffer buf_dev0;
vk_buffer buf_dev1;
size_t size = 0;
bool alloc(vk_device & dev0, vk_device & dev1, size_t sz) {
size_t align = std::max(dev0->min_imported_host_pointer_alignment,
dev1->min_imported_host_pointer_alignment);
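// round up to the stricter alignment; the mask trick is valid because Vulkan
// requires minImportedHostPointerAlignment to be a power of two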
size = (sz + align - 1) & ~(align - 1);
#ifdef _WIN32
host_ptr = _aligned_malloc(size, align);
#else
if (posix_memalign(&host_ptr, align, size) != 0) { host_ptr = nullptr; }
#endif
if (!host_ptr) return false;
buf_dev0 = ggml_vk_buffer_from_host_ptr(dev0, host_ptr, size);
buf_dev1 = ggml_vk_buffer_from_host_ptr(dev1, host_ptr, size);
return buf_dev0 && buf_dev1;
}
void free_resources() {
ggml_vk_destroy_buffer(buf_dev0);
ggml_vk_destroy_buffer(buf_dev1);
#ifdef _WIN32
_aligned_free(host_ptr);
#else
free(host_ptr);
#endif
host_ptr = nullptr;
}
};
// Helper: run a benchmark and print results
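// The median (rather than the mean) is reported so that one-off outliers, e.g.
// first-use driver overhead, do not skew the result; bandwidth is derived from it.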
static void vk_bench_print(const char * name, std::vector<double> & times, size_t size) {
std::sort(times.begin(), times.end());
double median = times[times.size() / 2];
double bw = (size / (1024.0 * 1024.0 * 1024.0)) / (median / 1000.0);
std::cerr << " " << std::left << std::setw(22) << name << " : "
<< std::fixed << std::setprecision(3) << median << " ms "
<< std::setprecision(2) << bw << " GB/s" << std::endl;
}
// Results stored per (method, size) for table output
struct vk_copy_result {
std::string method;
double ms;
double gbps;
};
static void ggml_vk_bench_pair(
vk_device & dev0, vk_device & dev1,
const std::vector<size_t> & test_sizes,
std::map<std::string, std::vector<vk_copy_result>> & results) {
const size_t num_it = 20;
const size_t warmup = 3;
const size_t max_size = test_sizes.back();
// Allocate buffers
vk_buffer buf_src = ggml_vk_create_buffer_check(dev0, max_size, {vk::MemoryPropertyFlagBits::eDeviceLocal});
vk_buffer buf_dst = ggml_vk_create_buffer_check(dev1, max_size, {vk::MemoryPropertyFlagBits::eDeviceLocal});
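// Host-visible staging on each device: HostCached is requested first (faster CPU
// access for the baseline memcpy hop), with the second flag set as the fallback.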
vk_buffer staging_src = ggml_vk_create_buffer_check(dev0, max_size,
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
vk_buffer staging_dst = ggml_vk_create_buffer_check(dev1, max_size,
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
// Fill source
{
vk_context subctx = ggml_vk_create_temporary_context(dev0->transfer_queue.cmd_pool);
ggml_vk_ctx_begin(dev0, subctx);
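// vkCmdFillBuffer repeats the 32-bit word 0xDEADBEEF across the whole buffer,
// giving the source a recognizable non-zero pattern.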
subctx->s->buffer->buf.fillBuffer(buf_src->buffer, 0, max_size, 0xDEADBEEF);
ggml_vk_ctx_end(subctx);
ggml_vk_submit(subctx, dev0->fence);
VK_CHECK(dev0->device.waitForFences({ dev0->fence }, true, UINT64_MAX), "fill");
dev0->device.resetFences({ dev0->fence });
}
bool has_shared_staging = dev0->external_memory_host && dev1->external_memory_host;
bool has_syncfd = false;
#ifndef _WIN32
if (dev0->external_semaphore_fd && dev1->external_semaphore_fd) {
vk::PhysicalDeviceExternalSemaphoreInfo query{};
query.handleType = vk::ExternalSemaphoreHandleTypeFlagBits::eSyncFd;
auto p0 = dev0->physical_device.getExternalSemaphoreProperties(query);
auto p1 = dev1->physical_device.getExternalSemaphoreProperties(query);
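// dev0 exports the fd and dev1 imports it, so check exportability on the
// source device and importability on the destination.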
has_syncfd =
(p0.externalSemaphoreFeatures & vk::ExternalSemaphoreFeatureFlagBits::eExportable) &&
(p0.compatibleHandleTypes & vk::ExternalSemaphoreHandleTypeFlagBits::eSyncFd) &&
(p1.externalSemaphoreFeatures & vk::ExternalSemaphoreFeatureFlagBits::eImportable) &&
(p1.compatibleHandleTypes & vk::ExternalSemaphoreHandleTypeFlagBits::eSyncFd);
}
#endif
// Helper to record a result
auto record = [&](const std::string & method, size_t size, std::vector<double> & times) {
std::sort(times.begin(), times.end());
double median = times[times.size() / 2];
double bw = (size / (1024.0 * 1024.0 * 1024.0)) / (median / 1000.0);
results[method].push_back({ method, median, bw });
};
// Helper to record a skipped size (sentinel: negative ms)
auto skip = [&](const std::string & method) {
results[method].push_back({ method, -1.0, -1.0 });
};
for (size_t size : test_sizes) {
// =================================================================
// 1. Baseline: current sync double-hop (separate staging buffers + memcpy)
// =================================================================
{
std::vector<double> times;
for (size_t i = 0; i < num_it + warmup; i++) {
auto begin = std::chrono::high_resolution_clock::now();
{
std::lock_guard<std::recursive_mutex> guard(dev0->mutex);
vk_context subctx = ggml_vk_create_temporary_context(dev0->transfer_queue.cmd_pool);
ggml_vk_ctx_begin(dev0, subctx);
ggml_vk_buffer_copy_async(subctx, staging_src, 0, buf_src, 0, size);
ggml_vk_ctx_end(subctx);
ggml_vk_submit(subctx, dev0->fence);
VK_CHECK(dev0->device.waitForFences({ dev0->fence }, true, UINT64_MAX), "baseline hop1");
dev0->device.resetFences({ dev0->fence });
}
memcpy(staging_dst->ptr, staging_src->ptr, size);
{
std::lock_guard<std::recursive_mutex> guard(dev1->mutex);
vk_context subctx = ggml_vk_create_temporary_context(dev1->transfer_queue.cmd_pool);
ggml_vk_ctx_begin(dev1, subctx);
ggml_vk_buffer_copy_async(subctx, buf_dst, 0, staging_dst, 0, size);
ggml_vk_ctx_end(subctx);
ggml_vk_submit(subctx, dev1->fence);
VK_CHECK(dev1->device.waitForFences({ dev1->fence }, true, UINT64_MAX), "baseline hop2");
dev1->device.resetFences({ dev1->fence });
}
auto end = std::chrono::high_resolution_clock::now();
if (i >= warmup) times.push_back(std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count() / 1000.0);
}
record("baseline", size, times);
}
// =================================================================
// 2. Diagnostics: individual hop timings
// =================================================================
{
std::vector<double> times;
for (size_t i = 0; i < num_it + warmup; i++) {
auto begin = std::chrono::high_resolution_clock::now();
{
std::lock_guard<std::recursive_mutex> guard(dev0->mutex);
vk_context subctx = ggml_vk_create_temporary_context(dev0->transfer_queue.cmd_pool);
ggml_vk_ctx_begin(dev0, subctx);
ggml_vk_buffer_copy_async(subctx, staging_src, 0, buf_src, 0, size);
ggml_vk_ctx_end(subctx);
ggml_vk_submit(subctx, dev0->fence);
VK_CHECK(dev0->device.waitForFences({ dev0->fence }, true, UINT64_MAX), "diag hop1");
dev0->device.resetFences({ dev0->fence });
}
auto end = std::chrono::high_resolution_clock::now();
if (i >= warmup) times.push_back(std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count() / 1000.0);
}
record("hop1_only", size, times);
}
{
std::vector<double> times;
for (size_t i = 0; i < num_it + warmup; i++) {
auto begin = std::chrono::high_resolution_clock::now();
{
std::lock_guard<std::recursive_mutex> guard(dev1->mutex);
vk_context subctx = ggml_vk_create_temporary_context(dev1->transfer_queue.cmd_pool);
ggml_vk_ctx_begin(dev1, subctx);
ggml_vk_buffer_copy_async(subctx, buf_dst, 0, staging_dst, 0, size);
ggml_vk_ctx_end(subctx);
ggml_vk_submit(subctx, dev1->fence);
VK_CHECK(dev1->device.waitForFences({ dev1->fence }, true, UINT64_MAX), "diag hop2");
dev1->device.resetFences({ dev1->fence });
}
auto end = std::chrono::high_resolution_clock::now();
if (i >= warmup) times.push_back(std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count() / 1000.0);
}
record("hop2_only", size, times);
}
// =================================================================
// 3. Shared staging: single host buffer imported into both devices
// =================================================================
if (has_shared_staging) {
vk_shared_staging stg;
if (stg.alloc(dev0, dev1, size)) {
std::vector<double> times;
for (size_t i = 0; i < num_it + warmup; i++) {
auto begin = std::chrono::high_resolution_clock::now();
{
std::lock_guard<std::recursive_mutex> guard(dev0->mutex);
vk_context subctx = ggml_vk_create_temporary_context(dev0->transfer_queue.cmd_pool);
ggml_vk_ctx_begin(dev0, subctx);
ggml_vk_buffer_copy_async(subctx, stg.buf_dev0, 0, buf_src, 0, size);
ggml_vk_ctx_end(subctx);
ggml_vk_submit(subctx, dev0->fence);
VK_CHECK(dev0->device.waitForFences({ dev0->fence }, true, UINT64_MAX), "shared hop1");
dev0->device.resetFences({ dev0->fence });
}
{
std::lock_guard<std::recursive_mutex> guard(dev1->mutex);
vk_context subctx = ggml_vk_create_temporary_context(dev1->transfer_queue.cmd_pool);
ggml_vk_ctx_begin(dev1, subctx);
ggml_vk_buffer_copy_async(subctx, buf_dst, 0, stg.buf_dev1, 0, size);
ggml_vk_ctx_end(subctx);
ggml_vk_submit(subctx, dev1->fence);
VK_CHECK(dev1->device.waitForFences({ dev1->fence }, true, UINT64_MAX), "shared hop2");
dev1->device.resetFences({ dev1->fence });
}
auto end = std::chrono::high_resolution_clock::now();
if (i >= warmup) times.push_back(std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count() / 1000.0);
}
record("shared_staging", size, times);
} else {
std::cerr << " shared_staging : SKIPPED (import failed)" << std::endl;
}
stg.free_resources();
}
// =================================================================
// 4. Chunked pipeline: split into N chunks, overlap hop1/hop2
// via full-duplex PCIe. Vary the chunk count to find the optimum.
// =================================================================
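// Expected schedule: once hop1 of chunk c signals, its hop2 runs on dev1 while
// hop1 of chunk c+1 is still in flight on dev0, keeping both transfer directions
// busy at the same time.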
if (has_shared_staging) {
for (size_t n_chunks : { 2, 4, 8 }) {
char cname[32];
snprintf(cname, sizeof(cname), "chunked_%zu", n_chunks);
if (size < n_chunks * 4096) { skip(cname); continue; }
size_t align = std::max(dev0->min_imported_host_pointer_alignment,
dev1->min_imported_host_pointer_alignment);
size_t chunk_data = size / n_chunks;
size_t chunk_aligned = (chunk_data + align - 1) & ~(align - 1);
vk_shared_staging stg;
if (!stg.alloc(dev0, dev1, chunk_aligned * n_chunks)) {
std::cerr << " chunked_" << n_chunks << " : SKIPPED (import failed)" << std::endl;
stg.free_resources();
continue;
}
// Per-chunk timeline semaphores
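// (timeline semaphores are reused across iterations by bumping the counter in
// sem_vals instead of recreating binary semaphores)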
std::vector<vk::Semaphore> chunk_sems(n_chunks);
std::vector<uint64_t> sem_vals(n_chunks, 0);
for (size_t c = 0; c < n_chunks; c++) {
vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eTimeline, 0 };
vk::SemaphoreCreateInfo sci{};
sci.setPNext(&tci);
chunk_sems[c] = dev0->device.createSemaphore(sci);
}
std::vector<double> times;
for (size_t iter = 0; iter < num_it + warmup; iter++) {
auto begin = std::chrono::high_resolution_clock::now();
// Submit all hop1s upfront
for (size_t c = 0; c < n_chunks; c++) {
size_t off_src = c * chunk_data;
size_t off_stg = c * chunk_aligned;
size_t csz = (c == n_chunks - 1) ? (size - c * chunk_data) : chunk_data;
vk_context subctx = ggml_vk_create_temporary_context(dev0->transfer_queue.cmd_pool);
ggml_vk_ctx_begin(dev0, subctx);
ggml_vk_buffer_copy_async(subctx, stg.buf_dev0, off_stg, buf_src, off_src, csz);
sem_vals[c]++;
subctx->s->signal_semaphores.push_back({ chunk_sems[c], sem_vals[c] });
ggml_vk_ctx_end(subctx);
ggml_vk_submit(subctx, {});
}
// Per-chunk: CPU wait hop1, submit hop2
for (size_t c = 0; c < n_chunks; c++) {
size_t off_dst = c * chunk_data;
size_t off_stg = c * chunk_aligned;
size_t csz = (c == n_chunks - 1) ? (size - c * chunk_data) : chunk_data;
vk::SemaphoreWaitInfo swi{vk::SemaphoreWaitFlags{}, chunk_sems[c], sem_vals[c]};
VK_CHECK(dev0->device.waitSemaphores(swi, UINT64_MAX), "chunked sem wait");
vk_context subctx = ggml_vk_create_temporary_context(dev1->transfer_queue.cmd_pool);
ggml_vk_ctx_begin(dev1, subctx);
ggml_vk_buffer_copy_async(subctx, buf_dst, off_dst, stg.buf_dev1, off_stg, csz);
ggml_vk_ctx_end(subctx);
ggml_vk_submit(subctx, (c == n_chunks - 1) ? dev1->fence : vk::Fence{});
}
VK_CHECK(dev1->device.waitForFences({ dev1->fence }, true, UINT64_MAX), "chunked final");
dev1->device.resetFences({ dev1->fence });
auto end = std::chrono::high_resolution_clock::now();
if (iter >= warmup) times.push_back(std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count() / 1000.0);
}
char name[32];
snprintf(name, sizeof(name), "chunked_%zu", n_chunks);
record(name, size, times);
for (size_t c = 0; c < n_chunks; c++) dev0->device.destroySemaphore(chunk_sems[c]);
stg.free_resources();
}
}
// =================================================================
// 5. sync_fd async: fully GPU-synchronised via Linux sync_file
// =================================================================
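// Instead of a CPU fence wait between hops, dev0 signals an exportable binary
// semaphore whose payload is exported as a sync_file fd and imported into a dev1
// semaphore; dev1's copy then waits on it without any host-side synchronisation.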
#ifndef _WIN32
if (has_shared_staging && has_syncfd) {
vk_shared_staging stg;
if (stg.alloc(dev0, dev1, size)) {
std::vector<double> times;
bool run_ok = true;
for (size_t i = 0; i < num_it + warmup && run_ok; i++) {
auto begin = std::chrono::high_resolution_clock::now();
vk::ExportSemaphoreCreateInfo esci{};
esci.handleTypes = vk::ExternalSemaphoreHandleTypeFlagBits::eSyncFd;
vk::SemaphoreCreateInfo sci{};
sci.setPNext(&esci);
vk::Semaphore sem_dev0 = dev0->device.createSemaphore(sci);
// Hop 1 + signal
{
vk_context subctx = ggml_vk_create_temporary_context(dev0->transfer_queue.cmd_pool);
ggml_vk_ctx_begin(dev0, subctx);
ggml_vk_buffer_copy_async(subctx, stg.buf_dev0, 0, buf_src, 0, size);
subctx->s->signal_semaphores.push_back({ sem_dev0, 0 });
ggml_vk_ctx_end(subctx);
ggml_vk_submit(subctx, {});
}
// Export + import sync_fd
int sync_fd = -1;
try {
vk::SemaphoreGetFdInfoKHR gi{};
gi.semaphore = sem_dev0;
gi.handleType = vk::ExternalSemaphoreHandleTypeFlagBits::eSyncFd;
sync_fd = dev0->device.getSemaphoreFdKHR(gi);
} catch (vk::SystemError& e) {
std::cerr << " syncfd_async : SKIPPED (export: " << e.what() << ")" << std::endl;
dev0->device.destroySemaphore(sem_dev0);
run_ok = false; break;
}
vk::Semaphore sem_dev1 = dev1->device.createSemaphore({});
try {
vk::ImportSemaphoreFdInfoKHR ii{};
ii.semaphore = sem_dev1;
ii.handleType = vk::ExternalSemaphoreHandleTypeFlagBits::eSyncFd;
ii.flags = vk::SemaphoreImportFlagBits::eTemporary;
ii.fd = sync_fd;
dev1->device.importSemaphoreFdKHR(ii);
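// a successful import transfers ownership of sync_fd to the driver, so the fd
// must not be closed here; it is only closed on the failed-import path below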
} catch (vk::SystemError& e) {
std::cerr << " syncfd_async : SKIPPED (import: " << e.what() << ")" << std::endl;
dev0->device.destroySemaphore(sem_dev0);
dev1->device.destroySemaphore(sem_dev1);
close(sync_fd);
run_ok = false; break;
}
// Hop 2 with GPU-side wait
{
vk_context subctx = ggml_vk_create_temporary_context(dev1->transfer_queue.cmd_pool);
ggml_vk_ctx_begin(dev1, subctx);
subctx->s->wait_semaphores.push_back({ sem_dev1, 0 });
ggml_vk_buffer_copy_async(subctx, buf_dst, 0, stg.buf_dev1, 0, size);
ggml_vk_ctx_end(subctx);
ggml_vk_submit(subctx, dev1->fence);
VK_CHECK(dev1->device.waitForFences({ dev1->fence }, true, UINT64_MAX), "syncfd final");
dev1->device.resetFences({ dev1->fence });
}
dev0->device.destroySemaphore(sem_dev0);
dev1->device.destroySemaphore(sem_dev1);
auto end = std::chrono::high_resolution_clock::now();
if (i >= warmup) times.push_back(std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count() / 1000.0);
}
if (run_ok) record("syncfd_async", size, times);
} else {
std::cerr << " syncfd_async : SKIPPED (import failed)" << std::endl;
}
stg.free_resources();
}
// =================================================================
// 6. sync_fd chunked: chunked pipeline with GPU-side sync_fd
// between hops (no CPU waits between chunks)
// =================================================================
if (has_shared_staging && has_syncfd) {
for (size_t n_chunks : { 2, 4, 8 }) {
char scname[48];
snprintf(scname, sizeof(scname), "syncfd_chunked_%zu", n_chunks);
if (size < n_chunks * 4096) { skip(scname); continue; }
size_t align = std::max(dev0->min_imported_host_pointer_alignment,
dev1->min_imported_host_pointer_alignment);
size_t chunk_data = size / n_chunks;
size_t chunk_aligned = (chunk_data + align - 1) & ~(align - 1);
vk_shared_staging stg;
if (!stg.alloc(dev0, dev1, chunk_aligned * n_chunks)) {
std::cerr << " syncfd_chunked_" << n_chunks << " : SKIPPED (import failed)" << std::endl;
stg.free_resources();
continue;
}
std::vector<double> times;
bool run_ok = true;
for (size_t iter = 0; iter < num_it + warmup && run_ok; iter++) {
auto begin = std::chrono::high_resolution_clock::now();
// Create per-chunk exportable semaphores
std::vector<vk::Semaphore> sems_dev0(n_chunks);
for (size_t c = 0; c < n_chunks; c++) {
vk::ExportSemaphoreCreateInfo esci{};
esci.handleTypes = vk::ExternalSemaphoreHandleTypeFlagBits::eSyncFd;
vk::SemaphoreCreateInfo sci{};
sci.setPNext(&esci);
sems_dev0[c] = dev0->device.createSemaphore(sci);
}
// Submit all hop1s with per-chunk signal
for (size_t c = 0; c < n_chunks; c++) {
size_t off_src = c * chunk_data;
size_t off_stg = c * chunk_aligned;
size_t csz = (c == n_chunks - 1) ? (size - c * chunk_data) : chunk_data;
vk_context subctx = ggml_vk_create_temporary_context(dev0->transfer_queue.cmd_pool);
ggml_vk_ctx_begin(dev0, subctx);
ggml_vk_buffer_copy_async(subctx, stg.buf_dev0, off_stg, buf_src, off_src, csz);
subctx->s->signal_semaphores.push_back({ sems_dev0[c], 0 });
ggml_vk_ctx_end(subctx);
ggml_vk_submit(subctx, {});
}
// Export all sync_fds and import on dev1, submit hop2s
for (size_t c = 0; c < n_chunks && run_ok; c++) {
size_t off_dst = c * chunk_data;
size_t off_stg = c * chunk_aligned;
size_t csz = (c == n_chunks - 1) ? (size - c * chunk_data) : chunk_data;
int sync_fd = -1;
try {
vk::SemaphoreGetFdInfoKHR gi{};
gi.semaphore = sems_dev0[c];
gi.handleType = vk::ExternalSemaphoreHandleTypeFlagBits::eSyncFd;
sync_fd = dev0->device.getSemaphoreFdKHR(gi);
} catch (vk::SystemError& e) {
std::cerr << " " << scname << " : SKIPPED (export: " << e.what() << ")" << std::endl;
run_ok = false; break;
}
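// fresh binary semaphore on dev1; the temporary import below replaces its
// payload with the sync_fd payload for a single wait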
vk::Semaphore sem_dev1 = dev1->device.createSemaphore({});
try {
vk::ImportSemaphoreFdInfoKHR ii{};
ii.semaphore = sem_dev1;
ii.handleType = vk::ExternalSemaphoreHandleTypeFlagBits::eSyncFd;
ii.flags = vk::SemaphoreImportFlagBits::eTemporary;
ii.fd = sync_fd;
dev1->device.importSemaphoreFdKHR(ii);
} catch (vk::SystemError& e) {
std::cerr << " " << scname << " : SKIPPED (import: " << e.what() << ")" << std::endl;
dev1->device.destroySemaphore(sem_dev1);
close(sync_fd);
run_ok = false; break;
}
vk_context subctx = ggml_vk_create_temporary_context(dev1->transfer_queue.cmd_pool);
ggml_vk_ctx_begin(dev1, subctx);
subctx->s->wait_semaphores.push_back({ sem_dev1, 0 });
ggml_vk_buffer_copy_async(subctx, buf_dst, off_dst, stg.buf_dev1, off_stg, csz);
ggml_vk_ctx_end(subctx);
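// fence only on the final chunk; its signal also covers the earlier
// submissions on the same queue, so one CPU wait drains the whole pipeline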
ggml_vk_submit(subctx, (c == n_chunks - 1) ? dev1->fence : vk::Fence{});
dev1->device.destroySemaphore(sem_dev1);
}
if (run_ok) {
VK_CHECK(dev1->device.waitForFences({ dev1->fence }, true, UINT64_MAX), "syncfd_chunked final");
dev1->device.resetFences({ dev1->fence });
}
for (size_t c = 0; c < n_chunks; c++) dev0->device.destroySemaphore(sems_dev0[c]);
auto end = std::chrono::high_resolution_clock::now();
if (run_ok && iter >= warmup) times.push_back(std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count() / 1000.0);
}
if (run_ok) {
record(scname, size, times);
}
stg.free_resources();
}
}
#endif
}
ggml_vk_destroy_buffer(buf_src);
ggml_vk_destroy_buffer(buf_dst);
ggml_vk_destroy_buffer(staging_src);
ggml_vk_destroy_buffer(staging_dst);
}
static void ggml_vk_test_cross_device_copy(ggml_backend_vk_context * ctx) {
ggml_vk_instance_init();
const size_t n_devices = vk_instance.device_indices.size();
if (n_devices < 2) {
std::cerr << "COPY TEST: Need at least 2 Vulkan devices, found " << n_devices << std::endl;
return;
}
// List devices
std::cerr << "\n=== Vulkan Devices ===" << std::endl;
std::vector<vk_device> devices(n_devices);
for (size_t i = 0; i < n_devices; i++) {
devices[i] = ggml_vk_get_device(i);
std::cerr << " [" << i << "] " << devices[i]->name << std::endl;
}
const std::vector<size_t> test_sizes = {
4096, // 4 KB
256 * 1024, // 256 KB
1 * 1024 * 1024, // 1 MB
16 * 1024 * 1024, // 16 MB
64 * 1024 * 1024, // 64 MB
256 * 1024 * 1024, // 256 MB
};
// Collect results: results[pair_label][method_name] = vector of vk_copy_result (one per size)
struct pair_results {
std::string label;
std::map<std::string, std::vector<vk_copy_result>> methods;
};
std::vector<pair_results> all_results;
// Run benchmarks for all ordered pairs
for (size_t i = 0; i < n_devices; i++) {
for (size_t j = 0; j < n_devices; j++) {
if (i == j) continue;
std::string label = devices[i]->name + " -> " + devices[j]->name;
std::cerr << "\n\n=== " << label << " ===" << std::endl;
pair_results pr;
pr.label = label;
ggml_vk_bench_pair(devices[i], devices[j], test_sizes, pr.methods);
all_results.push_back(std::move(pr));
}
}
// Output markdown tables: one table per method
// Collect all method names
std::vector<std::string> method_order;
if (!all_results.empty()) {
// Use first pair's method order as canonical
for (auto & [method, _] : all_results[0].methods) {
method_order.push_back(method);
}
// Add any methods from other pairs not in the first
for (auto & pr : all_results) {
for (auto & [method, _] : pr.methods) {
if (std::find(method_order.begin(), method_order.end(), method) == method_order.end()) {
method_order.push_back(method);
}
}
}
}
std::cerr << "\n\n# Cross-Device Copy Benchmark Results\n" << std::endl;
for (auto & method : method_order) {
std::cerr << "## " << method << "\n" << std::endl;
// Header: | Direction | 4KB | 256KB | ... |
std::cerr << "| Direction |";
for (size_t s : test_sizes) {
if (s < 1024 * 1024) {
std::cerr << " " << s / 1024 << " KB |";
} else {
std::cerr << " " << s / (1024 * 1024) << " MB |";
}
}
std::cerr << std::endl;
// Separator
std::cerr << "|---|";
for (size_t s = 0; s < test_sizes.size(); s++) {
std::cerr << "---|";
GGML_UNUSED(s);
}
std::cerr << std::endl;
// Data rows
for (auto & pr : all_results) {
std::cerr << "| " << pr.label << " |";
auto it = pr.methods.find(method);
if (it != pr.methods.end() && it->second.size() == test_sizes.size()) {
for (auto & r : it->second) {
if (r.ms < 0) {
std::cerr << " - |";
} else {
std::cerr << " " << std::fixed << std::setprecision(1) << r.ms << " ms (" << std::setprecision(1) << r.gbps << " GB/s) |";
}
}
} else {
for (size_t s = 0; s < test_sizes.size(); s++) {
std::cerr << " - |";
GGML_UNUSED(s);
}
}
std::cerr << std::endl;
}
std::cerr << std::endl;
}
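// Benchmarking replaces normal execution; abort deliberately once the
// result tables have been printed.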
GGML_ABORT("GGML_VULKAN_COPY_TESTS completed");
GGML_UNUSED(ctx);
}
#endif
static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx, vk_context subctx) {
#if defined(GGML_VULKAN_COPY_TESTS)
ggml_vk_test_cross_device_copy(ctx);
#endif
#if defined(GGML_VULKAN_RUN_TESTS)
const std::vector<size_t> vals {
512, 512, 128,
@@ -13521,8 +14179,6 @@ static ggml_backend_buffer_i ggml_backend_vk_buffer_interface = {
/* .memset_tensor = */ ggml_backend_vk_buffer_memset_tensor,
/* .set_tensor = */ ggml_backend_vk_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_vk_buffer_get_tensor,
-/* .set_tensor_2d = */ NULL,
-/* .get_tensor_2d = */ NULL,
/* .cpy_tensor = */ ggml_backend_vk_buffer_cpy_tensor,
/* .clear = */ ggml_backend_vk_buffer_clear,
/* .reset = */ NULL,
@@ -14981,8 +15637,6 @@ static ggml_backend_i ggml_backend_vk_interface = {
/* .free = */ ggml_backend_vk_free,
/* .set_tensor_async = */ ggml_backend_vk_set_tensor_async,
/* .get_tensor_async = */ ggml_backend_vk_get_tensor_async,
-/* .get_tensor_2d_async = */ NULL,
-/* .set_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ ggml_backend_vk_cpy_tensor_async,
/* .synchronize = */ ggml_backend_vk_synchronize,
/* .graph_plan_create = */ NULL,

View File

@@ -6,8 +6,8 @@
#define MAT_VEC_FUSION_FLAGS_SCALE1 0x8
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
-#if defined(A_TYPEV4)
-layout (binding = 0) readonly buffer AV4 {A_TYPEV4 data_a_v4[];};
+#if defined(A_TYPE_VEC4)
+layout (binding = 0) readonly buffer AV4 {A_TYPE_VEC4 data_a_v4[];};
#endif
#if defined(A_TYPE_PACKED16)
layout (binding = 0) readonly buffer A_PACKED16 {A_TYPE_PACKED16 data_a_packed16[];};
@@ -17,11 +17,11 @@ layout (binding = 0) readonly buffer A_PACKED32 {A_TYPE_PACKED32 data_a_packed32
#endif
layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
-#ifdef B_TYPEV2
-layout (binding = 1) readonly buffer BV2 {B_TYPEV2 data_b_v2[];};
+#ifdef B_TYPE_VEC2
+layout (binding = 1) readonly buffer BV2 {B_TYPE_VEC2 data_b_v2[];};
#endif
-#ifdef B_TYPEV4
-layout (binding = 1) readonly buffer BV4 {B_TYPEV4 data_b_v4[];};
+#ifdef B_TYPE_VEC4
+layout (binding = 1) readonly buffer BV4 {B_TYPE_VEC4 data_b_v4[];};
#endif
layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};

View File

@@ -41,7 +41,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
const vec4 qs_u32_4 = vec4(unpack8((qs_u32 >> 4) & 0x03030303));
const vec4 qs_u32_6 = vec4(unpack8((qs_u32 >> 6) & 0x03030303));
-const FLOAT_TYPEV2 dm = vec2(data_a[ib0 + i].dm);
+const FLOAT_TYPE_VEC2 dm = vec2(data_a[ib0 + i].dm);
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
vec2 b0 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 0]);

View File

@@ -14,7 +14,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint v_im,
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
const uint ib0 = a_offset + (first_row+n)*num_blocks_per_row;
-const FLOAT_TYPEV2 dm = FLOAT_TYPEV2(data_a[ib0 + i].dm);
+const FLOAT_TYPE_VEC2 dm = FLOAT_TYPE_VEC2(data_a[ib0 + i].dm);
const uint32_t scale0_u32 = data_a_packed16[ib0 + i].scales[v_im ];
const uint32_t scale4_u32 = data_a_packed16[ib0 + i].scales[v_im + 2];

View File

@@ -14,7 +14,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint v_im,
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
const uint ib0 = a_offset + (first_row+n)*num_blocks_per_row;
-const FLOAT_TYPEV2 dm = FLOAT_TYPEV2(data_a[ib0 + i].dm);
+const FLOAT_TYPE_VEC2 dm = FLOAT_TYPE_VEC2(data_a[ib0 + i].dm);
const uint32_t scale0_u32 = data_a_packed16[ib0 + i].scales[v_im ];
const uint32_t scale4_u32 = data_a_packed16[ib0 + i].scales[v_im + 2];

View File

@@ -11,8 +11,8 @@ FLOAT_TYPE get_dm(uint ib) {
#endif
#if defined(DATA_A_Q4_1) || defined(DATA_A_Q5_1)
-FLOAT_TYPEV2 get_dm(uint ib) {
-return FLOAT_TYPEV2(data_a_packed32[ib].dm);
+FLOAT_TYPE_VEC2 get_dm(uint ib) {
+return FLOAT_TYPE_VEC2(data_a_packed32[ib].dm);
}
#endif
@@ -23,9 +23,9 @@ FLOAT_TYPE get_dm(uint ib) {
#endif
#if defined(DATA_A_Q2_K)
-FLOAT_TYPEV2 get_dm(uint ib) {
+FLOAT_TYPE_VEC2 get_dm(uint ib) {
const uint ib_k = ib / 8;
-return FLOAT_TYPEV2(data_a_packed32[ib_k].dm);
+return FLOAT_TYPE_VEC2(data_a_packed32[ib_k].dm);
}
#endif
@@ -304,7 +304,7 @@ vec2 get_dm_scale(uint ib, uint iqs) {
(data_a[ib_k].scales[is+4] >> 4) | ((data_a[ib_k].scales[is ] & 0xC0) >> 2));
}
-return FLOAT_TYPEV2(data_a_packed32[ib_k].dm) * FLOAT_TYPEV2(scale_dm);
+return FLOAT_TYPE_VEC2(data_a_packed32[ib_k].dm) * FLOAT_TYPE_VEC2(scale_dm);
}
FLOAT_TYPE mmvq_dot_product(const uint ib_a, const uint iqs) {
@@ -422,7 +422,7 @@ vec2 get_dm(uint ib, uint iqs) {
const float dl = d * float(2 * bitfieldExtract(qh, 12, 3) + 1);
// the -1 cancels out the bias in iq1s_grid_gpu
-return FLOAT_TYPEV2(dl, dl * (delta - 1));
+return FLOAT_TYPE_VEC2(dl, dl * (delta - 1));
}
FLOAT_TYPE mmvq_dot_product(const uint ib_a, const uint iqs) {

Some files were not shown because too many files have changed in this diff.