Compare commits

..

16 Commits

Author SHA1 Message Date
Ruben Ortlam
4cabbe36e0 state 2026-04-09 13:00:31 +02:00
Ruben Ortlam
9f001cae27 state 2026-04-09 12:51:43 +02:00
Ruben Ortlam
88335c0490 state 2026-04-09 12:39:51 +02:00
Ruben Ortlam
204023c897 state 2026-04-09 12:36:15 +02:00
Ruben Ortlam
d88d722fc1 state 2026-04-09 12:32:08 +02:00
Ruben Ortlam
96d9516329 state 2026-04-09 12:25:27 +02:00
Ruben Ortlam
8a108eddb4 state 2026-04-09 12:05:15 +02:00
Ruben Ortlam
47dde34e00 state 2026-04-09 11:58:46 +02:00
Ruben Ortlam
8d0e158076 state 2026-04-09 11:51:39 +02:00
Ruben Ortlam
aade0f81dd state 2026-04-09 11:42:50 +02:00
Ruben Ortlam
700270239d state 2026-04-09 11:24:21 +02:00
Ruben Ortlam
ddaafa3dc1 state 2026-04-09 11:11:17 +02:00
Ruben Ortlam
e5e0be0add state 2026-04-09 11:00:36 +02:00
Ruben Ortlam
3c4eae7dc9 state 2026-04-09 07:50:05 +02:00
Ruben Ortlam
7e2799c8c9 state 2026-04-09 07:40:02 +02:00
Ruben Ortlam
cd0722594a state 2026-04-09 07:25:33 +02:00
328 changed files with 15516 additions and 26078 deletions

8
.github/labeler.yml vendored
View File

@@ -73,18 +73,10 @@ android:
- changed-files:
- any-glob-to-any-file:
- examples/llama.android/**
server/webui:
- changed-files:
- any-glob-to-any-file:
- tools/server/webui/**
- tools/server/public/**
server:
- changed-files:
- any-glob-to-any-file:
- tools/server/**
ggml:
- changed-files:
- any-glob-to-any-file:

View File

@@ -17,7 +17,7 @@ jobs:
steps:
- uses: actions/stale@v10
with:
exempt-issue-labels: "refactoring,help wanted,good first issue,research 🔬,bug,roadmap,security"
exempt-issue-labels: "refactoring,help wanted,good first issue,research 🔬,bug,roadmap"
days-before-issue-stale: 30
days-before-issue-close: 14
stale-issue-label: "stale"

View File

@@ -1,17 +0,0 @@
set( CMAKE_SYSTEM_NAME Linux )
set( CMAKE_SYSTEM_PROCESSOR arm64 )
set( target aarch64-linux-gnu )
set( CMAKE_C_COMPILER clang )
set( CMAKE_CXX_COMPILER clang++ )
set( CMAKE_C_COMPILER_TARGET ${target} )
set( CMAKE_CXX_COMPILER_TARGET ${target} )
set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" )
set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )

View File

@@ -291,16 +291,14 @@ static bool common_params_handle_remote_preset(common_params & params, llama_exa
hf_tag = "default";
}
const bool offline = params.offline;
std::string model_endpoint = get_model_endpoint();
auto preset_url = model_endpoint + hf_repo + "/resolve/main/preset.ini";
// prepare local path for caching
auto preset_fname = clean_file_name(hf_repo + "_preset.ini");
auto preset_path = fs_get_cache_file(preset_fname);
common_download_opts opts;
opts.bearer_token = params.hf_token;
opts.offline = params.offline;
const int status = common_download_file_single(preset_url, preset_path, opts);
const int status = common_download_file_single(preset_url, preset_path, params.hf_token, offline);
const bool has_preset = status >= 200 && status < 400;
// remote preset is optional, so we don't error out if not found
@@ -343,10 +341,10 @@ static handle_model_result common_params_handle_model(struct common_params_model
model.hf_file = model.path;
model.path = "";
}
common_download_opts opts;
opts.bearer_token = bearer_token;
common_download_model_opts opts;
opts.download_mmproj = true;
opts.offline = offline;
auto download_result = common_download_model(model, opts, true);
auto download_result = common_download_model(model, bearer_token, opts);
if (download_result.model_path.empty()) {
LOG_ERR("error: failed to download model from Hugging Face\n");
@@ -367,10 +365,9 @@ static handle_model_result common_params_handle_model(struct common_params_model
model.path = fs_get_cache_file(string_split<std::string>(f, '/').back());
}
common_download_opts opts;
opts.bearer_token = bearer_token;
common_download_model_opts opts;
opts.offline = offline;
auto download_result = common_download_model(model, opts);
auto download_result = common_download_model(model, bearer_token, opts);
if (download_result.model_path.empty()) {
LOG_ERR("error: failed to download model from %s\n", model.url.c_str());
exit(1);
@@ -2351,21 +2348,19 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_env("LLAMA_ARG_N_GPU_LAYERS"));
add_opt(common_arg(
{"-sm", "--split-mode"}, "{none,layer,row,tensor}",
{"-sm", "--split-mode"}, "{none,layer,row}",
"how to split the model across multiple GPUs, one of:\n"
"- none: use one GPU only\n"
"- layer (default): split layers and KV across GPUs (pipelined)\n"
"- row: split weight across GPUs by rows (parallelized)\n"
"- tensor: split weights and KV across GPUs (parallelized, EXPERIMENTAL)",
"- layer (default): split layers and KV across GPUs\n"
"- row: split rows across GPUs",
[](common_params & params, const std::string & value) {
if (value == "none") {
std::string arg_next = value;
if (arg_next == "none") {
params.split_mode = LLAMA_SPLIT_MODE_NONE;
} else if (value == "layer") {
} else if (arg_next == "layer") {
params.split_mode = LLAMA_SPLIT_MODE_LAYER;
} else if (value == "row") {
} else if (arg_next == "row") {
params.split_mode = LLAMA_SPLIT_MODE_ROW;
} else if (value == "tensor") {
params.split_mode = LLAMA_SPLIT_MODE_TENSOR;
} else {
throw std::invalid_argument("invalid value");
}

View File

@@ -69,10 +69,6 @@ common_chat_params peg_generator::generate_parser(const common_chat_template &
auto schema = function.contains("parameters") ? function.at("parameters") : json::object();
builder.resolve_refs(schema);
});
if (has_response_format) {
auto schema = inputs.json_schema;
builder.resolve_refs(schema);
}
parser.build_grammar(builder, data.grammar_lazy);
});
@@ -336,36 +332,58 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
const auto & inputs = ctx.inputs;
bool force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
auto until_suffix = p.rule("until-suffix", p.until(arguments.value_suffix));
common_peg_parser tool_choice = p.choice();
foreach_function(inputs.tools, [&](const json & tool) {
const auto & func = tool.at("function");
std::string name = func.at("name");
auto params = func.contains("parameters") ? func.at("parameters") : json::object();
const auto & params = func.contains("parameters") ? func.at("parameters") : json::object();
const auto & properties = params.contains("properties") ? params.at("properties") : json::object();
std::set<std::string> required;
if (params.contains("required")) {
params.at("required").get_to(required);
}
auto schema_info = common_schema_info();
schema_info.resolve_refs(params);
// Build parser for each argument, separating required and optional
std::vector<common_peg_parser> required_parsers;
std::vector<common_peg_parser> optional_parsers;
for (const auto & [param_name, param_schema] : properties.items()) {
bool is_required = required.find(param_name) != required.end();
bool is_required = required.find(param_name) != required.end();
std::string type = "object";
if (param_schema.contains("type")) {
const auto & type_obj = param_schema.at("type");
if (type_obj.is_string()) {
type_obj.get_to(type);
} else if (type_obj.is_array()) {
// Handle nullable types like ["string", "null"]
for (const auto & t : type_obj) {
if (t.is_string() && t.get<std::string>() != "null") {
type = t.get<std::string>();
break;
}
}
} else if (type_obj.is_object()) {
if (type_obj.contains("type") && type_obj.at("type").is_string()) {
type_obj.at("type").get_to(type);
}
}
}
// Infer string type from enum values when type is unspecified
if (type == "object" && param_schema.contains("enum")) {
const auto & enum_vals = param_schema.at("enum");
if (enum_vals.is_array()) {
for (const auto & v : enum_vals) {
if (v.is_string()) {
type = "string";
break;
}
}
}
}
auto arg =
p.tool_arg(p.tool_arg_open(arguments.name_prefix + p.tool_arg_name(p.literal(param_name)) +
arguments.name_suffix) +
arguments.value_prefix +
(schema_info.resolves_to_string(param_schema) ?
p.tool_arg_string_value(p.schema(until_suffix,
(type == "string" ?
p.tool_arg_string_value(p.schema(p.until(arguments.value_suffix),
"tool-" + name + "-arg-" + param_name + "-schema",
param_schema, true)) :
p.tool_arg_json_value(p.schema(
@@ -396,7 +414,7 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
for (const auto & opt : optional_parsers) {
any_opt |= opt;
}
args_seq = args_seq + p.repeat(p.space() + any_opt, 0, -1);
args_seq = args_seq + p.repeat(p.space() + any_opt, 0, (int) optional_parsers.size());
}
if (!arguments.start.empty()) {

View File

@@ -865,10 +865,9 @@ static common_chat_params common_chat_params_init_ministral_3(const common_chat_
adjusted_messages.push_back(adjusted);
}
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
auto has_response_format = inputs.json_schema.is_object() && !inputs.json_schema.empty();
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
auto include_grammar = true;
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
auto include_grammar = true;
data.supports_thinking = true;
data.thinking_start_tag = "[THINK]";
@@ -888,7 +887,7 @@ static common_chat_params common_chat_params_init_ministral_3(const common_chat_
extract_reasoning ? p.optional("[THINK]" + p.reasoning(p.until("[/THINK]")) + "[/THINK]") : p.eps();
// Response format parser
if (has_response_format) {
if (inputs.json_schema.is_object() && !inputs.json_schema.empty()) {
// Ministral wants to emit json surrounded by code fences
return generation_prompt + (reasoning << "```json" << p.content(p.schema(p.json(), "response-format", inputs.json_schema)) << "```");
}
@@ -929,10 +928,6 @@ static common_chat_params common_chat_params_init_ministral_3(const common_chat_
auto schema = function.at("parameters");
builder.resolve_refs(schema);
});
if (has_response_format) {
auto schema = inputs.json_schema;
builder.resolve_refs(schema);
}
parser.build_grammar(builder, data.grammar_lazy);
});
@@ -1068,10 +1063,6 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
auto schema = function.at("parameters");
builder.resolve_refs(schema);
});
if (has_response_format) {
auto schema = inputs.json_schema;
builder.resolve_refs(schema);
}
parser.build_grammar(builder, data.grammar_lazy);
});
@@ -1091,18 +1082,8 @@ static common_chat_params common_chat_params_init_gemma4(const common_chat_templ
common_chat_params data;
data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs);
if (inputs.add_generation_prompt && string_ends_with(data.prompt, "<turn|>\n")) {
// This may happen if the model generates content + tool_call, the
// template does not add the model's next turn and confuses the model
// from emitting its proper reasoning token sequence.
data.prompt += "<|turn>model\n";
}
data.format = COMMON_CHAT_FORMAT_PEG_GEMMA4;
data.supports_thinking = true;
data.thinking_start_tag = "<|channel>thought";
data.thinking_end_tag = "<channel|>";
data.supports_thinking = true;
data.preserved_tokens = {
"<|channel>",
@@ -1121,13 +1102,12 @@ static common_chat_params common_chat_params_init_gemma4(const common_chat_templ
auto start = p.rule("start", p.prefix(inputs.generation_prompt, "<|channel>"));
if (extract_reasoning) {
p.rule("thought", p.literal("<|channel>thought") + p.space() + p.reasoning(p.until("<channel|>")) + p.literal("<channel|>"));
p.rule("thought", p.literal("<|channel>thought\n") + p.reasoning(p.until("<channel|>")) + p.literal("<channel|>"));
} else {
p.rule("thought", p.content(p.literal("<|channel>thought") + p.space() + p.until("<channel|>") + p.literal("<channel|>")));
p.rule("thought", p.content(p.literal("<|channel>thought\n") + p.until("<channel|>") + p.literal("<channel|>")));
}
auto consume_empty_channels = p.gbnf(p.zero_or_more(p.literal("<|channel>") + p.negate(p.literal("thought"))), "");
auto thought = (p.peek(p.literal("<|channel>")) + consume_empty_channels + p.ref("thought")) | p.negate(p.literal("<|channel>"));
auto thought = (p.peek(p.literal("<|channel>")) + p.ref("thought")) | p.negate(p.literal("<|channel>"));
if (has_response_format) {
auto response_format = p.literal("```json") <<
@@ -1144,7 +1124,7 @@ static common_chat_params common_chat_params_init_gemma4(const common_chat_templ
p.rule("gemma4-bool", p.json_bool());
p.rule("gemma4-null", p.json_null());
p.rule("gemma4-number", p.json_number());
p.rule("gemma4-dict-key", p.rule("gemma4-dict-key-name", p.chars("[^:}]", 1, -1)) + p.literal(":"));
p.rule("gemma4-dict-key", p.rule("gemma4-dict-key-name", p.until(":")) + p.literal(":"));
p.rule("gemma4-dict-kv", p.ref("gemma4-dict-key") + p.space() + p.ref("gemma4-value"));
p.rule("gemma4-dict", [&]() {
auto ws = p.space();
@@ -1191,16 +1171,12 @@ static common_chat_params common_chat_params_init_gemma4(const common_chat_templ
/* max = */ inputs.parallel_tool_calls ? -1 : 1
));
auto scan_to_toolcall = p.rule("scan-to-toolcall", p.until("<|tool_call>"));
auto content = p.rule("content", p.content(p.until_one_of({"<|channel>", "<channel|>", "<|tool_call>"})));
auto content = p.rule("content", p.content(p.until_one_of({"<|channel>", "<|tool_call>"})));
auto message = p.rule("message", thought + content);
return start + p.zero_or_more(message) + scan_to_toolcall + tool_call;
return start + p.zero_or_more(message) + tool_call;
}
// Gemma 4 may emit an extra <|channel>thought\n<channel|> at the end of the content. It may
// also emit a single trailing <channel|> token. Consume all complete reasoning blocks and
// then stop at the first unmatched <channel|> token.
auto content = p.rule("content", p.content(p.until_one_of({"<|channel>", "<channel|>"})));
auto content = p.rule("content", p.content(p.until("<|channel>")));
auto message = p.rule("message", thought + content);
return start + p.one_or_more(message);
});
@@ -1215,10 +1191,6 @@ static common_chat_params common_chat_params_init_gemma4(const common_chat_templ
auto schema = function.at("parameters");
builder.resolve_refs(schema);
});
if (has_response_format) {
auto schema = inputs.json_schema;
builder.resolve_refs(schema);
}
parser.build_grammar(builder, data.grammar_lazy);
});
@@ -1669,173 +1641,6 @@ static common_chat_params common_chat_params_init_gigachat_v3(
return data;
}
static common_chat_params common_chat_params_init_deepseek_v3_2(const common_chat_template & tmpl,
const autoparser::generation_params & inputs) {
common_chat_params data;
data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
data.supports_thinking = true;
data.thinking_start_tag = "<think>";
data.thinking_end_tag = "</think>";
data.preserved_tokens = {
"DSML",
"<think>",
"</think>",
};
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
auto has_response_format = !inputs.json_schema.is_null() && inputs.json_schema.is_object();
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
auto include_grammar = has_response_format || (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE);
const std::string DSML = "DSML";
const std::string THINK_START = "<think>";
const std::string THINK_END = "</think>";
const std::string FC_START = "<" + DSML + "function_calls>";
const std::string FC_END = "</" + DSML + "function_calls>";
const std::string INVOKE_START = "<" + DSML + "invoke";
const std::string INVOKE_END = "</" + DSML + "invoke>";
const std::string PARAM_START = "<" + DSML + "parameter";
const std::string PARAM_END = "</" + DSML + "parameter>";
auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
auto generation_prompt = p.prefix(inputs.generation_prompt, THINK_START);
auto end = p.end();
auto reasoning = p.eps();
if (extract_reasoning && inputs.enable_thinking) {
reasoning = p.optional(THINK_START + p.reasoning(p.until(THINK_END)) + THINK_END);
} else if (extract_reasoning) {
// Thinking disabled but reasoning extraction requested: the generation prompt
// contains an empty <think></think> pair that must still be consumed.
reasoning = p.optional(p.literal(THINK_START) + p.until(THINK_END) + p.literal(THINK_END));
}
if (has_response_format) {
auto response_format = p.rule("response-format",
p.literal("```json") + p.space() +
p.content(p.schema(p.json(), "response-format-schema", inputs.json_schema)) +
p.space() + p.literal("```"));
return generation_prompt + reasoning + response_format + end;
}
if (!has_tools || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
return generation_prompt + reasoning + p.content(p.rest()) + end;
}
auto tool_choice = p.choice();
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool.at("function");
std::string name = function.at("name");
auto params = function.contains("parameters") ? function.at("parameters") : json::object();
const auto & props = params.contains("properties") ? params.at("properties") : json::object();
std::set<std::string> required;
if (params.contains("required")) {
params.at("required").get_to(required);
}
auto schema_info = common_schema_info();
schema_info.resolve_refs(params);
std::vector<common_peg_parser> required_parsers;
std::vector<common_peg_parser> optional_parsers;
for (const auto & [param_name, param_schema] : props.items()) {
bool is_required = required.find(param_name) != required.end();
bool is_string = schema_info.resolves_to_string(param_schema);
auto arg = p.tool_arg(
p.tool_arg_open(
p.literal(PARAM_START + " name=\"") +
p.tool_arg_name(p.literal(param_name)) +
p.literal("\" string=\"" + std::string(is_string ? "true" : "false") + "\">")) +
(is_string
? p.tool_arg_string_value(p.until(PARAM_END))
: p.tool_arg_json_value(p.schema(p.json(),
"tool-" + name + "-arg-" + param_name + "-schema",
param_schema, false))) +
p.tool_arg_close(p.literal(PARAM_END)));
auto named_arg = p.rule("tool-" + name + "-arg-" + param_name, arg);
if (is_required) {
required_parsers.push_back(named_arg);
} else {
optional_parsers.push_back(named_arg);
}
}
common_peg_parser args_seq = p.eps();
for (size_t i = 0; i < required_parsers.size(); i++) {
if (i > 0) {
args_seq = args_seq + p.space();
}
args_seq = args_seq + required_parsers[i];
}
if (!optional_parsers.empty()) {
common_peg_parser any_opt = p.choice();
for (const auto & opt : optional_parsers) {
any_opt |= opt;
}
args_seq = args_seq + p.repeat(p.space() + any_opt, 0, -1);
}
common_peg_parser invoke_body = args_seq;
auto func_parser = p.tool(
p.tool_open(p.literal(INVOKE_START + " name=\"") +
p.tool_name(p.literal(name)) + p.literal("\">\n")) +
invoke_body + p.space() +
p.tool_close(p.literal(INVOKE_END)));
tool_choice |= p.rule("tool-" + name, func_parser);
});
auto require_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
common_peg_parser tool_calls = p.eps();
if (inputs.parallel_tool_calls) {
tool_calls = p.trigger_rule("tool-call",
p.literal(FC_START) + p.space() + tool_choice +
p.zero_or_more(p.space() + tool_choice) + p.space() + p.literal(FC_END));
} else {
tool_calls = p.trigger_rule("tool-call",
p.literal(FC_START) + p.space() + tool_choice + p.space() + p.literal(FC_END));
}
if (!require_tools) {
tool_calls = p.optional(tool_calls);
}
auto content_before_tools = p.content(p.until(FC_START));
return generation_prompt + reasoning + content_before_tools + tool_calls + end;
});
data.parser = parser.save();
if (include_grammar) {
data.grammar_lazy = !(has_response_format || (has_tools && inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED));
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool.at("function");
auto schema = function.contains("parameters") ? function.at("parameters") : json::object();
builder.resolve_refs(schema);
});
if (has_response_format) {
auto schema = inputs.json_schema;
builder.resolve_refs(schema);
}
parser.build_grammar(builder, data.grammar_lazy);
});
data.grammar_triggers = {
{ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, FC_START },
};
}
return data;
}
namespace workaround {
static void map_developer_role_to_system(json & messages) {
@@ -2107,23 +1912,9 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
return common_chat_params_init_gigachat_v3(tmpl, params);
}
// DeepSeek V3.2 format detection: template defines dsml_token and uses it for tool calls.
// The template source contains the token as a variable assignment, not as a literal in markup.
if (src.find("dsml_token") != std::string::npos &&
src.find("function_calls") != std::string::npos &&
src.find("DSML") != std::string::npos) {
LOG_DBG("Using specialized template: DeepSeek V3.2\n");
return common_chat_params_init_deepseek_v3_2(tmpl, params);
}
// Gemma4 format detection
if (src.find("'<|tool_call>call:'") != std::string::npos) {
if (src.find("{#- OpenAI Chat Completions:") == std::string::npos) {
// apply workarounds if using the older gemma4 templates
LOG_WRN("%s: detected an outdated gemma4 chat template, applying compatibility workarounds. "
"Consider updating to the official template.\n", __func__);
workaround::convert_tool_responses_gemma4(params.messages);
}
workaround::convert_tool_responses_gemma4(params.messages);
return common_chat_params_init_gemma4(tmpl, params);
}
@@ -2172,7 +1963,7 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
params.add_generation_prompt = true;
std::string gen_prompt = common_chat_template_direct_apply_impl(tmpl, params);
auto diff = calculate_diff_split(no_gen_prompt, gen_prompt);
params.generation_prompt = diff.right + diff.suffix;
params.generation_prompt = diff.right;
params.add_generation_prompt = inputs.add_generation_prompt;

View File

@@ -114,7 +114,7 @@ std::pair<std::string, std::string> common_download_split_repo_tag(const std::st
return {hf_repo, tag};
}
class ProgressBar : public common_download_callback {
class ProgressBar {
static inline std::mutex mutex;
static inline std::map<const ProgressBar *, int> lines;
static inline int max_line = 0;
@@ -138,11 +138,7 @@ class ProgressBar : public common_download_callback {
}
public:
ProgressBar() = default;
void on_start(const common_download_progress & p) override {
filename = p.url;
ProgressBar(const std::string & url = "") : filename(url) {
if (auto pos = filename.rfind('/'); pos != std::string::npos) {
filename = filename.substr(pos + 1);
}
@@ -160,13 +156,13 @@ public:
}
}
void on_done(const common_download_progress &, bool) override {
~ProgressBar() {
std::lock_guard<std::mutex> lock(mutex);
cleanup(this);
}
void on_update(const common_download_progress & p) override {
if (!p.total || !is_output_a_tty()) {
void update(size_t current, size_t total) {
if (!total || !is_output_a_tty()) {
return;
}
@@ -178,17 +174,17 @@ public:
}
int lines_up = max_line - lines[this];
size_t bar = (55 - len) * 2;
size_t pct = (100 * p.downloaded) / p.total;
size_t pos = (bar * p.downloaded) / p.total;
size_t bar = 55 - len;
size_t pct = (100 * current) / total;
size_t pos = (bar * current) / total;
if (lines_up > 0) {
std::cout << "\033[" << lines_up << "A";
}
std::cout << '\r' << "Downloading " << filename << " ";
for (size_t i = 0; i < bar; i += 2) {
std::cout << (i + 1 < pos ? "" : (i < pos ? "" : " "));
for (size_t i = 0; i < bar; ++i) {
std::cout << (i < pos ? "" : " ");
}
std::cout << std::setw(4) << pct << "%\033[K";
@@ -197,7 +193,7 @@ public:
}
std::cout << '\r' << std::flush;
if (p.downloaded == p.total) {
if (current == total) {
cleanup(this);
}
}
@@ -210,8 +206,8 @@ static bool common_pull_file(httplib::Client & cli,
const std::string & resolve_path,
const std::string & path_tmp,
bool supports_ranges,
common_download_progress & p,
common_download_callback * callback) {
size_t existing_size,
size_t & total_size) {
std::ofstream ofs(path_tmp, std::ios::binary | std::ios::app);
if (!ofs.is_open()) {
LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path_tmp.c_str());
@@ -219,27 +215,29 @@ static bool common_pull_file(httplib::Client & cli,
}
httplib::Headers headers;
if (supports_ranges && p.downloaded > 0) {
headers.emplace("Range", "bytes=" + std::to_string(p.downloaded) + "-");
if (supports_ranges && existing_size > 0) {
headers.emplace("Range", "bytes=" + std::to_string(existing_size) + "-");
}
const char * func = __func__; // avoid __func__ inside a lambda
size_t downloaded = existing_size;
size_t progress_step = 0;
ProgressBar bar(resolve_path);
auto res = cli.Get(resolve_path, headers,
[&](const httplib::Response &response) {
if (p.downloaded > 0 && response.status != 206) {
if (existing_size > 0 && response.status != 206) {
LOG_WRN("%s: server did not respond with 206 Partial Content for a resume request. Status: %d\n", func, response.status);
return false;
}
if (p.downloaded == 0 && response.status != 200) {
if (existing_size == 0 && response.status != 200) {
LOG_WRN("%s: download received non-successful status code: %d\n", func, response.status);
return false;
}
if (p.total == 0 && response.has_header("Content-Length")) {
if (total_size == 0 && response.has_header("Content-Length")) {
try {
size_t content_length = std::stoull(response.get_header_value("Content-Length"));
p.total = p.downloaded + content_length;
total_size = existing_size + content_length;
} catch (const std::exception &e) {
LOG_WRN("%s: invalid Content-Length header: %s\n", func, e.what());
}
@@ -252,16 +250,11 @@ static bool common_pull_file(httplib::Client & cli,
LOG_ERR("%s: error writing to file: %s\n", func, path_tmp.c_str());
return false;
}
p.downloaded += len;
downloaded += len;
progress_step += len;
if (progress_step >= p.total / 1000 || p.downloaded == p.total) {
if (callback) {
callback->on_update(p);
if (callback->is_cancelled()) {
return false;
}
}
if (progress_step >= total_size / 1000 || downloaded == total_size) {
bar.update(downloaded, total_size);
progress_step = 0;
}
return true;
@@ -282,13 +275,28 @@ static bool common_pull_file(httplib::Client & cli,
// download one single file from remote URL to local path
// returns status code or -1 on error
static int common_download_file_single_online(const std::string & url,
const std::string & path,
const common_download_opts & opts,
bool skip_etag) {
static int common_download_file_single_online(const std::string & url,
const std::string & path,
const std::string & bearer_token,
const common_header_list & custom_headers,
bool skip_etag = false) {
static const int max_attempts = 3;
static const int retry_delay_seconds = 2;
auto [cli, parts] = common_http_client(url);
httplib::Headers headers;
for (const auto & h : custom_headers) {
headers.emplace(h.first, h.second);
}
if (headers.find("User-Agent") == headers.end()) {
headers.emplace("User-Agent", "llama-cpp/" + build_info);
}
if (!bearer_token.empty()) {
headers.emplace("Authorization", "Bearer " + bearer_token);
}
cli.set_default_headers(headers);
const bool file_exists = std::filesystem::exists(path);
if (file_exists && skip_etag) {
@@ -296,20 +304,6 @@ static int common_download_file_single_online(const std::string & url,
return 304; // 304 Not Modified - fake cached response
}
auto [cli, parts] = common_http_client(url);
httplib::Headers headers;
for (const auto & h : opts.headers) {
headers.emplace(h.first, h.second);
}
if (headers.find("User-Agent") == headers.end()) {
headers.emplace("User-Agent", "llama-cpp/" + build_info);
}
if (!opts.bearer_token.empty()) {
headers.emplace("Authorization", "Bearer " + opts.bearer_token);
}
cli.set_default_headers(headers);
std::string last_etag;
if (file_exists) {
last_etag = read_etag(path);
@@ -332,11 +326,10 @@ static int common_download_file_single_online(const std::string & url,
etag = head->get_header_value("ETag");
}
common_download_progress p;
p.url = url;
size_t total_size = 0;
if (head->has_header("Content-Length")) {
try {
p.total = std::stoull(head->get_header_value("Content-Length"));
total_size = std::stoull(head->get_header_value("Content-Length"));
} catch (const std::exception& e) {
LOG_WRN("%s: invalid Content-Length in HEAD response: %s\n", __func__, e.what());
}
@@ -364,21 +357,14 @@ static int common_download_file_single_online(const std::string & url,
{ // silent
std::error_code ec;
std::filesystem::create_directories(std::filesystem::path(path).parent_path(), ec);
std::filesystem::path p(path);
std::filesystem::create_directories(p.parent_path(), ec);
}
bool success = false;
const std::string path_temporary = path + ".downloadInProgress";
int delay = retry_delay_seconds;
if (opts.callback) {
opts.callback->on_start(p);
}
for (int i = 0; i < max_attempts; ++i) {
if (opts.callback && opts.callback->is_cancelled()) {
break;
}
if (i) {
LOG_WRN("%s: retrying after %d seconds...\n", __func__, delay);
std::this_thread::sleep_for(std::chrono::seconds(delay));
@@ -392,44 +378,28 @@ static int common_download_file_single_online(const std::string & url,
existing_size = std::filesystem::file_size(path_temporary);
} else if (remove(path_temporary.c_str()) != 0) {
LOG_ERR("%s: unable to delete file: %s\n", __func__, path_temporary.c_str());
break;
return -1;
}
}
p.downloaded = existing_size;
LOG_DBG("%s: downloading from %s to %s (etag:%s)...\n",
__func__, common_http_show_masked_url(parts).c_str(),
path_temporary.c_str(), etag.c_str());
if (common_pull_file(cli, parts.path, path_temporary, supports_ranges, p, opts.callback)) {
if (common_pull_file(cli, parts.path, path_temporary, supports_ranges, existing_size, total_size)) {
if (std::rename(path_temporary.c_str(), path.c_str()) != 0) {
LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
break;
return -1;
}
if (!etag.empty() && !skip_etag) {
write_etag(path, etag);
}
success = true;
break;
return head->status;
}
}
if (opts.callback) {
opts.callback->on_done(p, success);
}
if (opts.callback && opts.callback->is_cancelled() &&
std::filesystem::exists(path_temporary)) {
if (remove(path_temporary.c_str()) != 0) {
LOG_ERR("%s: unable to delete temporary file: %s\n", __func__, path_temporary.c_str());
}
}
if (!success) {
LOG_ERR("%s: download failed after %d attempts\n", __func__, max_attempts);
return -1; // max attempts reached
}
return head->status;
LOG_ERR("%s: download failed after %d attempts\n", __func__, max_attempts);
return -1; // max attempts reached
}
std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url,
@@ -468,15 +438,12 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string
int common_download_file_single(const std::string & url,
const std::string & path,
const common_download_opts & opts,
const std::string & bearer_token,
bool offline,
const common_header_list & headers,
bool skip_etag) {
if (!opts.offline) {
ProgressBar tty_cb;
common_download_opts online_opts = opts;
if (!online_opts.callback) {
online_opts.callback = &tty_cb;
}
return common_download_file_single_online(url, path, online_opts, skip_etag);
if (!offline) {
return common_download_file_single_online(url, path, bearer_token, headers, skip_etag);
}
if (!std::filesystem::exists(path)) {
@@ -485,16 +452,6 @@ int common_download_file_single(const std::string & url,
}
LOG_DBG("%s: using cached file (offline mode): %s\n", __func__, path.c_str());
// notify the callback that the file was cached
if (opts.callback) {
common_download_progress p;
p.url = url;
p.cached = true;
opts.callback->on_start(p);
opts.callback->on_done(p, true);
}
return 304; // Not Modified - fake cached response
}
@@ -634,10 +591,6 @@ static hf_cache::hf_file find_best_model(const hf_cache::hf_files & files,
for (const auto & f : files) {
if (gguf_filename_is_model(f.path) &&
std::regex_search(f.path, pattern)) {
auto split = get_gguf_split_info(f.path);
if (split.count > 1 && split.index != 1) {
continue;
}
return f;
}
}
@@ -647,10 +600,6 @@ static hf_cache::hf_file find_best_model(const hf_cache::hf_files & files,
if (tag.empty()) {
for (const auto & f : files) {
if (gguf_filename_is_model(f.path)) {
auto split = get_gguf_split_info(f.path);
if (split.count > 1 && split.index != 1) {
continue;
}
return f;
}
}
@@ -669,21 +618,20 @@ static void list_available_gguf_files(const hf_cache::hf_files & files) {
}
struct hf_plan {
hf_cache::hf_file primary;
hf_cache::hf_files model_files;
hf_cache::hf_file mmproj;
};
static hf_plan get_hf_plan(const common_params_model & model,
const common_download_opts & opts,
bool download_mmproj) {
static hf_plan get_hf_plan(const common_params_model & model,
const std::string & token,
const common_download_model_opts & opts) {
hf_plan plan;
hf_cache::hf_files all;
auto [repo, tag] = common_download_split_repo_tag(model.hf_repo);
if (!opts.offline) {
all = hf_cache::get_repo_files(repo, opts.bearer_token);
all = hf_cache::get_repo_files(repo, token);
}
if (all.empty()) {
all = hf_cache::get_cached_files(repo);
@@ -715,10 +663,9 @@ static hf_plan get_hf_plan(const common_params_model & model,
}
}
plan.primary = primary;
plan.model_files = get_split_files(all, primary);
if (download_mmproj) {
if (opts.download_mmproj) {
plan.mmproj = find_best_mmproj(all, primary.path);
}
@@ -753,9 +700,10 @@ static std::vector<download_task> get_url_tasks(const common_params_model & mode
return tasks;
}
common_download_model_result common_download_model(const common_params_model & model,
const common_download_opts & opts,
bool download_mmproj) {
common_download_model_result common_download_model(const common_params_model & model,
const std::string & bearer_token,
const common_download_model_opts & opts,
const common_header_list & headers) {
common_download_model_result result;
std::vector<download_task> tasks;
hf_plan hf;
@@ -763,7 +711,7 @@ common_download_model_result common_download_model(const common_params_model &
bool is_hf = !model.hf_repo.empty();
if (is_hf) {
hf = get_hf_plan(model, opts, download_mmproj);
hf = get_hf_plan(model, bearer_token, opts);
for (const auto & f : hf.model_files) {
tasks.push_back({f.url, f.local_path});
}
@@ -784,8 +732,8 @@ common_download_model_result common_download_model(const common_params_model &
std::vector<std::future<bool>> futures;
for (const auto & task : tasks) {
futures.push_back(std::async(std::launch::async,
[&task, &opts, is_hf]() {
int status = common_download_file_single(task.url, task.path, opts, is_hf);
[&task, &bearer_token, offline = opts.offline, &headers, is_hf]() {
int status = common_download_file_single(task.url, task.path, bearer_token, offline, headers, is_hf);
return is_http_status_ok(status);
}
));
@@ -801,7 +749,7 @@ common_download_model_result common_download_model(const common_params_model &
for (const auto & f : hf.model_files) {
hf_cache::finalize_file(f);
}
result.model_path = hf.primary.final_path;
result.model_path = hf.model_files[0].final_path;
if (!hf.mmproj.path.empty()) {
result.mmproj_path = hf_cache::finalize_file(hf.mmproj);
@@ -921,9 +869,7 @@ std::string common_docker_resolve_model(const std::string & docker) {
std::string local_path = fs_get_cache_file(model_filename);
const std::string blob_url = url_prefix + "/blobs/" + gguf_digest;
common_download_opts opts;
opts.bearer_token = token;
const int http_status = common_download_file_single(blob_url, local_path, opts);
const int http_status = common_download_file_single(blob_url, local_path, token, false, {});
if (!is_http_status_ok(http_status)) {
throw std::runtime_error("Failed to download Docker Model");
}

View File

@@ -8,22 +8,6 @@ struct common_params_model;
using common_header = std::pair<std::string, std::string>;
using common_header_list = std::vector<common_header>;
struct common_download_progress {
std::string url;
size_t downloaded = 0;
size_t total = 0;
bool cached = false;
};
class common_download_callback {
public:
virtual ~common_download_callback() = default;
virtual void on_start(const common_download_progress & p) = 0;
virtual void on_update(const common_download_progress & p) = 0;
virtual void on_done(const common_download_progress & p, bool ok) = 0;
virtual bool is_cancelled() const { return false; }
};
struct common_remote_params {
common_header_list headers;
long timeout = 0; // in seconds, 0 means no timeout
@@ -47,12 +31,10 @@ struct common_cached_model_info {
}
};
// Options for common_download_model and common_download_file_single
struct common_download_opts {
std::string bearer_token;
common_header_list headers;
bool offline = false;
common_download_callback * callback = nullptr;
// Options for common_download_model
struct common_download_model_opts {
bool download_mmproj = false;
bool offline = false;
};
// Result of common_download_model
@@ -87,8 +69,9 @@ struct common_download_model_result {
// returns result with model_path and mmproj_path (empty on failure)
common_download_model_result common_download_model(
const common_params_model & model,
const common_download_opts & opts = {},
bool download_mmproj = false
const std::string & bearer_token,
const common_download_model_opts & opts = {},
const common_header_list & headers = {}
);
// returns list of cached models
@@ -99,7 +82,9 @@ std::vector<common_cached_model_info> common_list_cached_models();
// skip_etag: if true, don't read/write .etag files (for HF cache where filename is the hash)
int common_download_file_single(const std::string & url,
const std::string & path,
const common_download_opts & opts = {},
const std::string & bearer_token,
bool offline,
const common_header_list & headers = {},
bool skip_etag = false);
// resolve and download model from Docker registry

View File

@@ -251,23 +251,6 @@ value binary_expression::execute_impl(context & ctx) {
return res;
}
// Python-style string repetition
// TODO: support array/tuple repetition (e.g., [1, 2] * 3 → [1, 2, 1, 2, 1, 2])
if (op.value == "*" &&
((is_val<value_string>(left_val) && is_val<value_int>(right_val)) ||
(is_val<value_int>(left_val) && is_val<value_string>(right_val)))) {
const auto & str = is_val<value_string>(left_val) ? left_val->as_string() : right_val->as_string();
const int64_t repeat = is_val<value_int>(right_val) ? right_val->as_int() : left_val->as_int();
auto res = mk_val<value_string>();
if (repeat <= 0) {
return res;
}
for (int64_t i = 0; i < repeat; ++i) {
res->val_str = res->val_str.append(str);
}
return res;
}
// String membership
if (is_val<value_string>(left_val) && is_val<value_string>(right_val)) {
// case: "a" in "abc"

View File

@@ -1,5 +1,4 @@
#include "runtime.h"
#include "unicode.h"
#include "value.h"
// for converting from JSON to jinja values
@@ -155,83 +154,6 @@ static value test_compare_fn(const func_args & args) {
return mk_val<value_bool>(value_compare(args.get_pos(0), args.get_pos(1), op));
}
static void append_codepoint_as_ascii_json_escape(std::string & out, uint32_t codepoint) {
auto append_u16 = [&out](uint32_t value) {
char buf[8];
snprintf(buf, sizeof(buf), "\\u%04x", static_cast<unsigned int>(value));
out += buf;
};
if (codepoint <= 0xFFFF) {
append_u16(codepoint);
return;
}
codepoint -= 0x10000;
append_u16(0xD800 + ((codepoint >> 10) & 0x3FF));
append_u16(0xDC00 + (codepoint & 0x3FF));
}
static std::string json_ensure_ascii_preserving_format(const std::string & json_str) {
std::string output;
output.reserve(json_str.size());
bool in_string = false;
bool escaped = false;
for (size_t pos = 0; pos < json_str.size();) {
const char ch = json_str[pos];
if (!in_string) {
output.push_back(ch);
if (ch == '"') {
in_string = true;
}
++pos;
continue;
}
if (escaped) {
output.push_back(ch);
escaped = false;
++pos;
continue;
}
if (ch == '\\') {
output.push_back(ch);
escaped = true;
++pos;
continue;
}
if (ch == '"') {
output.push_back(ch);
in_string = false;
++pos;
continue;
}
const unsigned char uch = static_cast<unsigned char>(ch);
if (uch < 0x80) {
output.push_back(ch);
++pos;
continue;
}
auto parsed = common_parse_utf8_codepoint(json_str, pos);
if (parsed.status != utf8_parse_result::SUCCESS) {
output += "\\ufffd";
++pos;
continue;
}
append_codepoint_as_ascii_json_escape(output, parsed.codepoint);
pos += parsed.bytes_consumed;
}
return output;
}
static value tojson(const func_args & args) {
args.ensure_count(1, 5);
value val_ascii = args.get_kwarg_or_pos("ensure_ascii", 1);
@@ -247,17 +169,16 @@ static value tojson(const func_args & args) {
if (is_val<value_int>(val_indent)) {
indent = static_cast<int>(val_indent->as_int());
}
if (val_ascii->as_bool()) { // undefined == false
throw not_implemented_exception("tojson ensure_ascii=true not implemented");
}
if (val_sort->as_bool()) { // undefined == false
throw not_implemented_exception("tojson sort_keys=true not implemented");
}
const bool ensure_ascii = val_ascii->as_bool(); // undefined == false
auto separators = (is_val<value_array>(val_separators) ? val_separators : mk_val<value_array>())->as_array();
std::string item_sep = separators.size() > 0 ? separators[0]->as_string().str() : (indent < 0 ? ", " : ",");
std::string key_sep = separators.size() > 1 ? separators[1]->as_string().str() : ": ";
std::string json_str = value_to_json(args.get_pos(0), indent, item_sep, key_sep);
if (ensure_ascii) {
json_str = json_ensure_ascii_preserving_format(json_str);
}
return mk_val<value_string>(json_str);
}
@@ -539,10 +460,6 @@ const func_builtins & value_int_t::get_builtins() const {
int64_t val = args.get_pos(0)->as_int();
return mk_val<value_int>(val < 0 ? -val : val);
}},
{"int", [](const func_args & args) -> value {
args.ensure_vals<value_int>();
return mk_val<value_int>(args.get_pos(0)->as_int());
}},
{"float", [](const func_args & args) -> value {
args.ensure_vals<value_int>();
double val = static_cast<double>(args.get_pos(0)->as_int());
@@ -569,10 +486,6 @@ const func_builtins & value_float_t::get_builtins() const {
int64_t val = static_cast<int64_t>(args.get_pos(0)->as_float());
return mk_val<value_int>(val);
}},
{"float", [](const func_args & args) -> value {
args.ensure_vals<value_float>();
return mk_val<value_float>(args.get_pos(0)->as_float());
}},
{"safe", tojson},
{"string", tojson},
{"tojson", tojson},

View File

@@ -890,10 +890,6 @@ struct parser_executor {
}
return result;
}
common_peg_parse_result operator()(const common_peg_gbnf_parser & p) {
return arena.parse(p.child, ctx, start_pos);
}
};
common_peg_parse_result common_peg_arena::parse(common_peg_parse_context & ctx, size_t start) const {
@@ -961,8 +957,7 @@ void common_peg_arena::resolve_refs() {
std::is_same_v<T, common_peg_and_parser> ||
std::is_same_v<T, common_peg_not_parser> ||
std::is_same_v<T, common_peg_tag_parser> ||
std::is_same_v<T, common_peg_atomic_parser> ||
std::is_same_v<T, common_peg_gbnf_parser>) {
std::is_same_v<T, common_peg_atomic_parser>) {
p.child = resolve_ref(p.child);
} else if constexpr (std::is_same_v<T, common_peg_rule_parser>) {
p.child = resolve_ref(p.child);
@@ -1041,8 +1036,6 @@ std::string common_peg_arena::dump_impl(common_peg_parser_id
return "Not(" + dump_impl(p.child, visited) + ")";
} else if constexpr (std::is_same_v<T, common_peg_atomic_parser>) {
return "Atomic(" + dump_impl(p.child, visited) + ")";
} else if constexpr (std::is_same_v<T, common_peg_gbnf_parser>) {
return "Gbnf(" + p.grammar + ", " + dump_impl(p.child, visited) + ")";
} else if constexpr (std::is_same_v<T, common_peg_any_parser>) {
return "Any";
} else if constexpr (std::is_same_v<T, common_peg_space_parser>) {
@@ -1572,7 +1565,6 @@ static std::unordered_set<std::string> collect_reachable_rules(
std::is_same_v<T, common_peg_not_parser> ||
std::is_same_v<T, common_peg_tag_parser> ||
std::is_same_v<T, common_peg_atomic_parser> ||
std::is_same_v<T, common_peg_gbnf_parser> ||
std::is_same_v<T, common_peg_schema_parser>) {
visit(p.child);
} else if constexpr (std::is_same_v<T, common_peg_rule_parser>) {
@@ -1659,13 +1651,10 @@ void common_peg_arena::build_grammar(const common_grammar_builder & builder, boo
} else if constexpr (std::is_same_v<T, common_peg_sequence_parser>) {
std::string s;
for (const auto & child : p.children) {
auto child_gbnf = to_gbnf(child);
if (child_gbnf.empty()) {
continue;
}
if (!s.empty()) {
s += " ";
}
auto child_gbnf = to_gbnf(child);
const auto & child_parser = effective_parser(child);
if (std::holds_alternative<common_peg_choice_parser>(child_parser) ||
std::holds_alternative<common_peg_sequence_parser>(child_parser)) {
@@ -1765,8 +1754,6 @@ void common_peg_arena::build_grammar(const common_grammar_builder & builder, boo
return to_gbnf(p.child);
} else if constexpr (std::is_same_v<T, common_peg_atomic_parser>) {
return to_gbnf(p.child);
} else if constexpr (std::is_same_v<T, common_peg_gbnf_parser>) {
return p.grammar;
} else {
static_assert(is_always_false_v<T>);
}
@@ -1901,8 +1888,6 @@ static nlohmann::json serialize_parser_variant(const common_peg_parser_variant &
{"child", p.child},
{"tag", p.tag}
};
} else if constexpr (std::is_same_v<T, common_peg_gbnf_parser>) {
return json{{"type", "gbnf"}, {"child", p.child}, {"grammar", p.grammar}};
}
}, variant);
}
@@ -2065,16 +2050,6 @@ static common_peg_parser_variant deserialize_parser_variant(const nlohmann::json
};
}
if (type == "gbnf") {
if (!j.contains("child") || !j.contains("grammar")) {
throw std::runtime_error("gbnf parser missing required fields");
}
return common_peg_gbnf_parser{
j["child"].get<common_peg_parser_id>(),
j["grammar"].get<std::string>(),
};
}
throw std::runtime_error("Unknown parser type: " + type);
}

View File

@@ -270,11 +270,6 @@ struct common_peg_tag_parser {
std::string tag;
};
struct common_peg_gbnf_parser {
common_peg_parser_id child;
std::string grammar;
};
// Variant holding all parser types
using common_peg_parser_variant = std::variant<
common_peg_epsilon_parser,
@@ -295,8 +290,7 @@ using common_peg_parser_variant = std::variant<
common_peg_rule_parser,
common_peg_ref_parser,
common_peg_atomic_parser,
common_peg_tag_parser,
common_peg_gbnf_parser
common_peg_tag_parser
>;
class common_peg_arena {
@@ -510,10 +504,6 @@ class common_peg_parser_builder {
// Unlike rules, you can tag multiple nodes with the same tag.
common_peg_parser tag(const std::string & tag, const common_peg_parser & p) { return add(common_peg_tag_parser{p.id(), tag}); }
// Wraps a child parser but emits a custom GBNF grammar string instead of
// the child's grammar. Parsing delegates entirely to the child.
common_peg_parser gbnf(const common_peg_parser & p, const std::string & grammar) { return add(common_peg_gbnf_parser{p, grammar}); }
void set_root(const common_peg_parser & p);
common_peg_arena build();

View File

@@ -287,8 +287,8 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
}
}
// reasoning budget sampler (skip when budget is unlimited unless a lazy grammar is active, which needs rbudget for thinking-block suppression)
if (!params.reasoning_budget_start.empty() && !params.reasoning_budget_end.empty() && (params.grammar_lazy || params.reasoning_budget_tokens >= 0)) {
// reasoning budget sampler
if (!params.reasoning_budget_start.empty() && !params.reasoning_budget_end.empty()) {
rbudget = common_reasoning_budget_init(
vocab,
params.reasoning_budget_start,

View File

@@ -1229,15 +1229,15 @@ class TextModel(ModelBase):
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab)) # ty: ignore[unresolved-attribute]
assert max(tokenizer.vocab.values()) < vocab_size # ty: ignore[unresolved-attribute]
vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
assert max(tokenizer.vocab.values()) < vocab_size
tokpre = self.get_vocab_base_pre(tokenizer)
reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} # ty: ignore[unresolved-attribute]
added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute]
reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
added_vocab = tokenizer.get_added_vocab()
added_tokens_decoder = tokenizer.added_tokens_decoder # ty: ignore[unresolved-attribute]
added_tokens_decoder = tokenizer.added_tokens_decoder
for i in range(vocab_size):
if i not in reverse_vocab:
@@ -1250,7 +1250,7 @@ class TextModel(ModelBase):
# To avoid unexpected issues - we make sure to normalize non-normalized tokens
if not added_tokens_decoder[i].normalized:
previous_token = token
token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) # ty: ignore[unresolved-attribute, invalid-assignment]
token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
if previous_token != token:
logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
@@ -1583,13 +1583,13 @@ class TextModel(ModelBase):
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
vocab_size = hparams["vocab_size"]
assert max(tokenizer.get_vocab().values()) < vocab_size # ty: ignore[unresolved-attribute]
assert max(tokenizer.get_vocab().values()) < vocab_size
tokpre = self.get_vocab_base_pre(tokenizer)
merges = []
vocab = {}
mergeable_ranks = tokenizer.mergeable_ranks # ty: ignore[unresolved-attribute]
mergeable_ranks = tokenizer.mergeable_ranks
for token, rank in mergeable_ranks.items():
vocab[QwenModel.token_bytes_to_string(token)] = rank
if len(token) == 1:
@@ -1599,7 +1599,7 @@ class TextModel(ModelBase):
merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
# for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
added_vocab = tokenizer.special_tokens # ty: ignore[unresolved-attribute]
added_vocab = tokenizer.special_tokens
reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()}
for i in range(vocab_size):
@@ -1622,10 +1622,10 @@ class TextModel(ModelBase):
special_vocab.merges = merges
# only add special tokens when they were not already loaded from config.json
if len(special_vocab.special_token_ids) == 0:
special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"])
special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"])
# this one is usually not in config.json anyway
special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
special_vocab.add_to_gguf(self.gguf_writer)
def _set_vocab_sentencepiece(self, add_to_gguf=True):
@@ -1877,10 +1877,10 @@ class TextModel(ModelBase):
self.gguf_writer.add_tokenizer_pre(tokpre)
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)
special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # ty: ignore[unresolved-attribute]
special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"])
special_vocab.add_to_gguf(self.gguf_writer)
def _set_vocab_glm(self):
@@ -1894,10 +1894,10 @@ class TextModel(ModelBase):
self.gguf_writer.add_token_types(toktypes)
# Special tokens
# Note: Using <|endoftext|> (151329) for eot causes endless generation
special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["[gMASK]"]) # ty: ignore[unresolved-attribute] # 151331
special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # ty: ignore[unresolved-attribute] # 151336
special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute] # 151329
special_vocab._set_special_token("eom", tokenizer.get_added_vocab()["<|observation|>"]) # ty: ignore[unresolved-attribute] # 151338
special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["[gMASK]"]) # 151331
special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # 151336
special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # 151329
special_vocab._set_special_token("eom", tokenizer.get_added_vocab()["<|observation|>"]) # 151338
special_vocab.add_to_gguf(self.gguf_writer)
def _set_vocab_interns1(self):
@@ -1906,16 +1906,16 @@ class TextModel(ModelBase):
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab()) # ty: ignore[unresolved-attribute]
vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab())
vocab_size = self.hparams.get("vocab_size", len(vocab))
assert max(vocab.values()) < vocab_size
tokpre = self.get_vocab_base_pre(tokenizer)
reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()}
added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute]
added_vocab = tokenizer.get_added_vocab()
added_tokens_decoder = tokenizer.added_tokens_decoder # ty: ignore[unresolved-attribute]
added_tokens_decoder = tokenizer.added_tokens_decoder
for i in range(vocab_size):
if i not in reverse_vocab:
@@ -1928,7 +1928,7 @@ class TextModel(ModelBase):
# To avoid unexpected issues - we make sure to normalize non-normalized tokens
if not added_tokens_decoder[i].normalized:
previous_token = token
token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) # ty: ignore[unresolved-attribute, invalid-assignment]
token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
if previous_token != token:
logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
@@ -2219,10 +2219,10 @@ class MmprojModel(ModelBase):
self.image_size = self.find_vparam(["image_size"])
self.gguf_writer.add_vision_image_size(self.image_size)
self.gguf_writer.add_vision_patch_size(self.find_vparam(["patch_size"]))
self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size", "width", "vt_hidden_size"]))
self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size", "vt_hidden_size"]))
self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size", "vt_intermediate_size"]))
self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys))
self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads", "num_heads", "heads", "vt_num_attention_heads"]))
self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads", "num_heads", "vt_num_attention_heads"]))
# preprocessor config
image_mean = _MISTRAL_COMMON_DATASET_MEAN if self.is_mistral_format else self.preprocessor_config["image_mean"]
@@ -2516,15 +2516,15 @@ class XverseModel(TextModel):
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(dir_model)
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) # ty: ignore[unresolved-attribute]
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
# Since we are checking the maximum index, we need to ensure it's strictly less than vocab_size,
# because vocab_size is the count of items, and indexes start at 0.
max_vocab_index = max(tokenizer.get_vocab().values()) # ty: ignore[unresolved-attribute]
max_vocab_index = max(tokenizer.get_vocab().values())
if max_vocab_index >= vocab_size:
raise ValueError("Vocabulary size exceeds expected maximum size.")
reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} # ty: ignore[unresolved-attribute]
added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute]
reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
added_vocab = tokenizer.get_added_vocab()
for token_id in range(vocab_size):
token_text = reverse_vocab[token_id].encode('utf-8')
@@ -2535,7 +2535,7 @@ class XverseModel(TextModel):
elif re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
toktype = gguf.TokenType.BYTE # special
elif reverse_vocab[token_id] in added_vocab:
if tokenizer.added_tokens_decoder[token_id].special: # ty: ignore[unresolved-attribute]
if tokenizer.added_tokens_decoder[token_id].special:
toktype = gguf.TokenType.CONTROL
else:
toktype = gguf.TokenType.USER_DEFINED
@@ -3752,7 +3752,7 @@ class QwenModel(TextModel):
@staticmethod
def token_bytes_to_string(b):
from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode # ty: ignore[unresolved-import]
from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
byte_encoder = bytes_to_unicode()
return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
@@ -3777,14 +3777,7 @@ class QwenModel(TextModel):
self._set_vocab_qwen()
@ModelBase.register(
"Qwen2Model",
"Qwen2ForCausalLM",
"Qwen2AudioForConditionalGeneration",
"KORMoForCausalLM",
"AudioFlamingo3ForConditionalGeneration",
"DotsOCRForCausalLM",
)
@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM", "Qwen2AudioForConditionalGeneration", "KORMoForCausalLM", "AudioFlamingo3ForConditionalGeneration")
class Qwen2Model(TextModel):
model_arch = gguf.MODEL_ARCH.QWEN2
@@ -3805,8 +3798,7 @@ class Qwen2Model(TextModel):
name = name.replace("language_model.", "") # for InternVL
if name.startswith("mlp") or name.startswith("multi_modal_projector") \
or name.startswith("vision_model") or name.startswith("audio_tower") \
or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector") \
or name.startswith("vision_tower."):
or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector"):
# skip vision and audio tensors
return
yield from super().modify_tensors(data_torch, name, bid)
@@ -3823,14 +3815,14 @@ class DreamModel(TextModel):
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
vocab_dict = tokenizer.get_vocab() # ty: ignore[unresolved-attribute]
vocab_dict = tokenizer.get_vocab()
vocab_size = self.hparams.get("vocab_size", len(vocab_dict))
assert max(vocab_dict.values()) < vocab_size
tokpre = self.get_vocab_base_pre(tokenizer)
reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab_dict.items()}
added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute]
added_vocab = tokenizer.get_added_vocab()
for i in range(vocab_size):
if i not in reverse_vocab:
@@ -3888,14 +3880,14 @@ class LLaDAModel(TextModel):
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
vocab_dict = tokenizer.get_vocab() # ty: ignore[unresolved-attribute]
vocab_dict = tokenizer.get_vocab()
vocab_size = self.hparams.get("vocab_size", len(vocab_dict))
assert max(vocab_dict.values()) < vocab_size
tokpre = self.get_vocab_base_pre(tokenizer)
reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab_dict.items()}
added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute]
added_vocab = tokenizer.get_added_vocab()
for i in range(vocab_size):
if i not in reverse_vocab:
@@ -4258,7 +4250,9 @@ class Qwen2VLVisionModel(MmprojModel):
yield from super().modify_tensors(data_torch, name, bid)
class Qwen25AudioModel(MmprojModel):
@ModelBase.register("Qwen2_5OmniModel")
class Qwen25OmniModel(Qwen2VLVisionModel):
has_vision_encoder = True
has_audio_encoder = True
def __init__(self, *args, **kwargs):
@@ -4274,6 +4268,12 @@ class Qwen25AudioModel(MmprojModel):
self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["num_mel_bins"])
self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams_audio.get("layer_norm_eps", 1e-5))
def get_vision_config(self) -> dict[str, Any] | None:
return self.global_config["thinker_config"].get("vision_config")
def get_audio_config(self) -> dict[str, Any] | None:
return self.global_config["thinker_config"].get("audio_config")
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
# SinusoidsPositionEmbedding
assert self.hparams_audio is not None
@@ -4304,32 +4304,7 @@ class Qwen25AudioModel(MmprojModel):
# this tensor is left unused in transformers code
# https://github.com/huggingface/transformers/blob/6e3063422c4b1c014aa60c32b9254fd2902f0f28/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py#L1809
return
yield from MmprojModel.modify_tensors(self, data_torch, name, bid)
return # skip other tensors
@ModelBase.register("Qwen2_5OmniModel")
class Qwen25OmniModel(Qwen2VLVisionModel, Qwen25AudioModel):
has_audio_encoder = True
has_vision_encoder = True
def get_vision_config(self) -> dict[str, Any] | None:
return self.global_config["thinker_config"].get("vision_config")
def get_audio_config(self) -> dict[str, Any] | None:
return self.global_config["thinker_config"].get("audio_config")
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25O)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if "visual." in name:
yield from Qwen2VLVisionModel.modify_tensors(self, data_torch, name, bid)
elif "audio_tower." in name:
yield from Qwen25AudioModel.modify_tensors(self, data_torch, name, bid)
return # skip other tensors
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("InternVisionModel")
@@ -4690,9 +4665,9 @@ class Qwen3Model(Qwen2Model):
self.is_rerank = True
self.is_tied_embeddings = self.hparams.get("tie_word_embeddings", False)
self.token_false_id = tokenizer.convert_tokens_to_ids("no") # ty: ignore[unresolved-attribute, invalid-assignment]
self.token_true_id = tokenizer.convert_tokens_to_ids("yes") # ty: ignore[unresolved-attribute, invalid-assignment]
self.sep_token_id = tokenizer.convert_tokens_to_ids("|") # ty: ignore[unresolved-attribute]
self.token_false_id = tokenizer.convert_tokens_to_ids("no")
self.token_true_id = tokenizer.convert_tokens_to_ids("yes")
self.sep_token_id = tokenizer.convert_tokens_to_ids("|")
assert self.token_false_id is not None and self.token_true_id is not None
@@ -4833,10 +4808,7 @@ class RND1Model(Qwen2MoeModel):
class Qwen3VLVisionModel(MmprojModel):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
if self.hparams_vision is None:
logger.info("No vision config found, skipping vision tensor processing")
return
assert self.hparams_vision is not None
# Compute image_size if not present
if "image_size" not in self.hparams_vision:
# For Qwen3VL/Qwen3VLMoe, compute from num_position_embeddings
@@ -4857,9 +4829,7 @@ class Qwen3VLVisionModel(MmprojModel):
def set_gguf_parameters(self):
super().set_gguf_parameters()
# in case mixed modalities, the arch will be handled by subclass
if not self.has_audio_encoder:
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN3VL)
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN3VL)
self.gguf_writer.add_vision_use_gelu(True)
if self.hparams_vision is not None:
@@ -4947,64 +4917,11 @@ class Qwen3VLVisionModel(MmprojModel):
return
if name.startswith("visual."):
yield from MmprojModel.modify_tensors(self, data_torch, name, bid)
return # skip other tensors
yield from super().modify_tensors(data_torch, name, bid)
return
@ModelBase.register("Qwen3OmniMoeForConditionalGeneration")
class Qwen3OmniMmprojModel(Qwen3VLVisionModel, Qwen25AudioModel):
has_audio_encoder = True
has_vision_encoder = True
def get_vision_config(self) -> dict[str, Any] | None:
if self.has_vision_encoder:
return self.global_config["thinker_config"].get("vision_config")
else:
return None
def get_audio_config(self) -> dict[str, Any] | None:
if self.has_audio_encoder:
return self.global_config["thinker_config"].get("audio_config")
else:
return None
def set_gguf_parameters(self):
if self.has_vision_encoder:
Qwen3VLVisionModel.set_gguf_parameters(self)
self.gguf_writer.add_clip_vision_projector_type(gguf.VisionProjectorType.QWEN3VL)
if self.has_audio_encoder:
Qwen25AudioModel.set_gguf_parameters(self)
self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.QWEN3A)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if "visual." in name:
if not self.has_vision_encoder:
raise ValueError(f"Model does not have vision encoder, but found tensor {name}")
# need to transform vision tensor naming, so that modify_tensors() logic can be used correctly
name = name.replace("thinker.visual.", "model.visual.")
if ".merger_list." in name:
name = name.replace(".merger_list.", ".deepstack_merger_list.")
name = name.replace(".ln_q", ".norm")
name = name.replace(".mlp.0", ".linear_fc1")
name = name.replace(".mlp.2", ".linear_fc2")
elif ".merger." in name:
name = name.replace(".ln_q", ".norm")
name = name.replace(".mlp.0", ".linear_fc1")
name = name.replace(".mlp.2", ".linear_fc2")
yield from Qwen3VLVisionModel.modify_tensors(self, data_torch, name, bid)
elif "audio_tower." in name:
if not self.has_audio_encoder:
raise ValueError(f"Model does not have audio encoder, but found tensor {name}")
if "conv2d" in name and name.endswith(".bias"):
# transform conv2d bias [n_embd] --> [1, 1, n_embd]
data_torch = data_torch.unsqueeze(-1).unsqueeze(-1)
yield from Qwen25AudioModel.modify_tensors(self, data_torch, name, bid)
@ModelBase.register("Qwen3ASRForConditionalGeneration")
class Qwen3ASRMmprojModel(Qwen3OmniMmprojModel):
has_audio_encoder = True
has_vision_encoder = False
# Fall back to parent class for other tensors
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("Glm4vForConditionalGeneration", "Glm4vMoeForConditionalGeneration", "GlmOcrForConditionalGeneration")
@@ -5032,85 +4949,15 @@ class Glm4VVisionModel(Qwen3VLVisionModel):
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("StepVLForConditionalGeneration")
class Step3VLVisionModel(MmprojModel):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
assert self.hparams_vision is not None
if not self.hparams_vision.get("intermediate_size"):
hidden_size = self.hparams_vision.get("hidden_size") or self.hparams_vision.get("width") or 0
assert hidden_size > 0
mlp_ratio = float(self.hparams_vision.get("mlp_ratio", 8960 / 1536))
self.hparams_vision["intermediate_size"] = int(round(hidden_size * mlp_ratio))
self.preprocessor_config.setdefault("image_mean", list(_MISTRAL_COMMON_DATASET_MEAN))
self.preprocessor_config.setdefault("image_std", list(_MISTRAL_COMMON_DATASET_STD))
def set_gguf_parameters(self):
super().set_gguf_parameters()
assert self.hparams_vision is not None
projector_stride = int(self.global_config.get("understand_projector_stride", -1))
hidden_size = int(self.hparams_vision.get("hidden_size", self.hparams_vision.get("width", -1)))
num_layers = int(self.hparams_vision.get("num_hidden_layers", self.hparams_vision.get("layers", -1)))
assert (projector_stride, int(self.hparams_vision.get("image_size", -1)), hidden_size, num_layers) == (2, 728, 1536, 47), (
"current Step3-VL conversion path is only validated for Step3-VL-10B"
)
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.STEP3VL)
self.gguf_writer.add_vision_attention_layernorm_eps(float(self.hparams_vision.get("layer_norm_eps", 1e-5)))
self.gguf_writer.add_vision_projector_scale_factor(projector_stride ** 2)
# 3024 max resize comes from step3-vl-10b processing_step3.py.
self.gguf_writer.add_vision_preproc_image_size(3024)
def tensor_force_quant(self, name, new_name, bid, n_dims):
if ".position_embd." in new_name:
return gguf.GGMLQuantizationType.F32
if ("mm.0." in new_name or "mm.1." in new_name) and new_name.endswith(".weight"):
return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32
return super().tensor_force_quant(name, new_name, bid, n_dims)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if name.startswith("model.") or name.startswith("lm_head."):
return
if name.startswith("vision_model.vit_downsampler"):
match = re.match(r"vision_model\.vit_downsampler(\d+)\.(weight|bias)", name)
if match is None:
raise ValueError(f"Unexpected Step3-VL projector tensor {name!r}")
proj_id = int(match.group(1)) - 1
suffix = f".{match.group(2)}"
yield (self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, proj_id, suffix=suffix), data_torch)
return
if name == "vit_large_projector.weight":
yield (self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ_FC), data_torch)
return
if name.startswith("vision_model."):
if name == "vision_model.positional_embedding":
name += ".weight"
elif name.endswith(".gamma") and ".ls_" in name:
name = name.removesuffix(".gamma") + ".weight"
name = name.replace("attn.in_proj_weight", "attn.in_proj.weight")
name = name.replace("attn.in_proj_bias", "attn.in_proj.bias")
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("Qwen3VLForConditionalGeneration")
class Qwen3VLTextModel(Qwen3Model):
model_arch = gguf.MODEL_ARCH.QWEN3VL
def set_gguf_parameters(self):
super().set_gguf_parameters()
if "thinker_config" in self.hparams:
vision_config = self.hparams["thinker_config"].get("vision_config", {})
else:
vision_config = self.hparams.get("vision_config", {})
# Handle MRoPE (Multi-axis Rotary Position Embedding) for Qwen3-VL
vision_config = self.hparams.get("vision_config", {})
deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", []))
self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num)
@@ -5122,16 +4969,6 @@ class Qwen3VLTextModel(Qwen3Model):
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("StepVLForConditionalGeneration")
class Step3VLTextModel(Qwen3Model):
model_arch = gguf.MODEL_ARCH.QWEN3
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if name.startswith("vision_model.") or name.startswith("model.vision_model.") or name.startswith("vit_large_projector."):
return
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("Qwen3VLMoeForConditionalGeneration")
class Qwen3VLMoeTextModel(Qwen3MoeModel):
model_arch = gguf.MODEL_ARCH.QWEN3VLMOE
@@ -5179,70 +5016,6 @@ class Qwen3VLMoeTextModel(Qwen3MoeModel):
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("Qwen3OmniMoeForConditionalGeneration")
class Qwen3OmniMoeTextModel(Qwen3VLMoeTextModel):
model_arch = gguf.MODEL_ARCH.QWEN3VLMOE
def set_vocab(self):
super().set_vocab()
# correct BOS/EOS tokens
with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
tokenizer_config = json.load(f)
added_tokens = tokenizer_config.get("added_tokens_decoder", {})
for token_id, data in added_tokens.items():
if data.get("content") == "<|im_end|>":
self.gguf_writer.add_bos_token_id(int(token_id))
self.gguf_writer.add_eos_token_id(int(token_id))
break
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_num_deepstack_layers(0)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# Skip vision and audio tensors - they go in the mmproj file
if "visual." in name or "audio_tower." in name \
or "talker." in name or "code2wav." in name:
return
name = name.replace("thinker.", "")
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("Qwen3ASRForConditionalGeneration")
class Qwen3ASRTextModel(Qwen3VLTextModel):
model_arch = gguf.MODEL_ARCH.QWEN3VL
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_num_deepstack_layers(0)
def set_vocab(self):
super().set_vocab()
# fix chat template, use correct chatml format
self.gguf_writer.add_chat_template("{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}")
# correct BOS/EOS tokens
with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
tokenizer_config = json.load(f)
added_tokens = tokenizer_config.get("added_tokens_decoder", {})
for token_id, data in added_tokens.items():
if data.get("content") == "<|im_end|>":
self.gguf_writer.add_bos_token_id(int(token_id))
self.gguf_writer.add_eos_token_id(int(token_id))
break
def modify_tensors(self, data_torch, name, bid):
# qwen3-omni
name = name.replace("thinker.", "")
# Skip vision and audio tensors - they go in the mmproj file
if "visual." in name or "audio_tower." in name \
or "talker." in name or "code2wav." in name:
return
yield from super().modify_tensors(data_torch, name, bid)
class _LinearAttentionVReorderBase(Qwen3NextModel):
model_arch = gguf.MODEL_ARCH.QWEN3NEXT # overridden by subclasses
"""reorders V heads from grouped to tiled order for ggml broadcast
@@ -6086,7 +5859,7 @@ class KimiLinearModel(TextModel):
# Build merges list using the approach similar to HunYuanMoE
merges = []
vocab = {}
mergeable_ranks = tokenizer.model._mergeable_ranks # ty: ignore[unresolved-attribute]
mergeable_ranks = tokenizer.model._mergeable_ranks
for token, rank in mergeable_ranks.items():
vocab[QwenModel.token_bytes_to_string(token)] = rank
if len(token) == 1:
@@ -6096,7 +5869,7 @@ class KimiLinearModel(TextModel):
merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
# Build token list
vocab_size = self.hparams["vocab_size"]
special_tokens = tokenizer.special_tokens # ty: ignore[unresolved-attribute]
special_tokens = tokenizer.special_tokens
reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
tokens: list[str] = []
toktypes: list[int] = []
@@ -6122,7 +5895,7 @@ class KimiLinearModel(TextModel):
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
special_vocab.add_to_gguf(self.gguf_writer)
# override eos id in config.json with tiktoken eos id
self.gguf_writer.add_eos_token_id(tokenizer.eos_id) # ty: ignore[unresolved-attribute]
self.gguf_writer.add_eos_token_id(tokenizer.eos_id)
else:
raise NotImplementedError(f"Deepseek pre-tokenizer {tokpre!r} is not supported yet!")
@@ -6616,11 +6389,11 @@ class BertModel(TextModel):
with open(tokenizer_config_path, "r", encoding="utf-8") as fp:
tokenizer_config_json = json.load(fp)
add_prefix = tokenizer.add_prefix_space # ty: ignore[unresolved-attribute]
remove_whitespaces = tokenizer.clean_up_tokenization_spaces # ty: ignore[unresolved-attribute]
add_prefix = tokenizer.add_prefix_space
remove_whitespaces = tokenizer.clean_up_tokenization_spaces
precompiled_charsmap = b64decode(tokenizer_json["normalizer"]["precompiled_charsmap"])
vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size) # ty: ignore[unresolved-attribute]
vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size)
else:
sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
@@ -6637,7 +6410,7 @@ class BertModel(TextModel):
tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
scores: list[float] = [-10000.0] * vocab_size
toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size # ty: ignore[invalid-assignment]
toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
if isinstance(tokenizer, SentencePieceProcessor):
for token_id in range(tokenizer.vocab_size()):
@@ -6659,20 +6432,20 @@ class BertModel(TextModel):
scores[token_id] = score
toktypes[token_id] = toktype
else:
added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute]
added_vocab = tokenizer.get_added_vocab()
unk_token = tokenizer_config_json.get("unk_token")
unk_token_id = added_vocab.get(unk_token, tokenizer_json["model"].get("unk_id", 3)) # ty: ignore[no-matching-overload]
unk_token_id = added_vocab.get(unk_token, tokenizer_json["model"].get("unk_id", 3))
for token_id in range(tokenizer.vocab_size): # ty: ignore[unresolved-attribute]
piece = tokenizer._convert_id_to_token(token_id) # ty: ignore[unresolved-attribute]
if (piece := tokenizer._convert_id_to_token(token_id)) is not None: # ty: ignore[unresolved-attribute]
for token_id in range(tokenizer.vocab_size):
piece = tokenizer._convert_id_to_token(token_id)
if (piece := tokenizer._convert_id_to_token(token_id)) is not None:
text = piece.encode("utf-8")
score = tokenizer_json["model"]["vocab"][token_id][1]
toktype = SentencePieceTokenTypes.NORMAL
if token_id == unk_token_id:
toktype = SentencePieceTokenTypes.UNKNOWN
elif token_id in tokenizer.all_special_ids: # ty: ignore[unresolved-attribute]
elif token_id in tokenizer.all_special_ids:
toktype = SentencePieceTokenTypes.CONTROL
elif token_id in added_vocab.values():
toktype = SentencePieceTokenTypes.USER_DEFINED
@@ -8981,7 +8754,7 @@ class DeepseekV2Model(TextModel):
# Build merges list using the approach similar to HunYuanMoE
merges = []
vocab = {}
mergeable_ranks = tokenizer.model._mergeable_ranks # ty: ignore[unresolved-attribute]
mergeable_ranks = tokenizer.model._mergeable_ranks
for token, rank in mergeable_ranks.items():
vocab[QwenModel.token_bytes_to_string(token)] = rank
if len(token) == 1:
@@ -8992,7 +8765,7 @@ class DeepseekV2Model(TextModel):
# Build token list
vocab_size = self.hparams["vocab_size"]
special_tokens = tokenizer.special_tokens # ty: ignore[unresolved-attribute]
special_tokens = tokenizer.special_tokens
reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
tokens: list[str] = []
toktypes: list[int] = []
@@ -9963,10 +9736,10 @@ class Glm4Model(TextModel):
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # ty: ignore[unresolved-attribute]
special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"])
special_vocab.add_to_gguf(self.gguf_writer)
def set_gguf_parameters(self):
@@ -10194,12 +9967,12 @@ class ChatGLMModel(TextModel):
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
vocab_size = hparams.get("padded_vocab_size", len(tokenizer.get_vocab())) # ty: ignore[unresolved-attribute]
assert max(tokenizer.get_vocab().values()) < vocab_size # ty: ignore[unresolved-attribute]
vocab_size = hparams.get("padded_vocab_size", len(tokenizer.get_vocab()))
assert max(tokenizer.get_vocab().values()) < vocab_size
role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"]
special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens
for token_id in range(vocab_size):
piece = tokenizer._convert_id_to_token(token_id) # ty: ignore[unresolved-attribute]
piece = tokenizer._convert_id_to_token(token_id)
if token_id == 0:
piece = "<unk>"
elif token_id == 1:
@@ -10207,17 +9980,17 @@ class ChatGLMModel(TextModel):
elif token_id == 2:
piece = "<eos>"
text = piece.encode("utf-8") # ty: ignore[unresolved-attribute]
text = piece.encode("utf-8")
score = 0.0
# Referencing the tokenizer Python implementation(https://huggingface.co/THUDM/chatglm3-6b/blob/main/tokenization_chatglm.py),
# it is only valid if it is less than tokenizer.tokenizer.sp_model.vocab_size()
if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size(): # ty: ignore[unresolved-attribute, invalid-argument-type]
score = tokenizer.tokenizer.sp_model.get_score(token_id) # ty: ignore[unresolved-attribute]
if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size():
score = tokenizer.tokenizer.sp_model.get_score(token_id)
if token_id >= tokenizer.tokenizer.sp_model.vocab_size(): # ty: ignore[unresolved-attribute]
if token_id >= tokenizer.tokenizer.sp_model.vocab_size():
if piece in special_tokens:
toktype = SentencePieceTokenTypes.CONTROL
elif len(piece) == 0: # ty: ignore[invalid-argument-type]
elif len(piece) == 0:
text = f"[PAD{token_id}]".encode("utf-8")
toktype = SentencePieceTokenTypes.UNUSED
else:
@@ -10228,13 +10001,13 @@ class ChatGLMModel(TextModel):
continue
toktype = SentencePieceTokenTypes.NORMAL
if tokenizer.tokenizer.sp_model.is_unknown(token_id): # ty: ignore[unresolved-attribute]
if tokenizer.tokenizer.sp_model.is_unknown(token_id):
toktype = SentencePieceTokenTypes.UNKNOWN
elif tokenizer.tokenizer.sp_model.is_control(token_id): # ty: ignore[unresolved-attribute]
elif tokenizer.tokenizer.sp_model.is_control(token_id):
toktype = SentencePieceTokenTypes.CONTROL
elif tokenizer.tokenizer.sp_model.is_unused(token_id): # ty: ignore[unresolved-attribute]
elif tokenizer.tokenizer.sp_model.is_unused(token_id):
toktype = SentencePieceTokenTypes.UNUSED
elif tokenizer.tokenizer.sp_model.is_byte(token_id): # ty: ignore[unresolved-attribute]
elif tokenizer.tokenizer.sp_model.is_byte(token_id):
toktype = SentencePieceTokenTypes.BYTE
tokens.append(text)
@@ -10254,7 +10027,7 @@ class ChatGLMModel(TextModel):
@staticmethod
def token_bytes_to_string(b):
from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode # ty: ignore[unresolved-import]
from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
byte_encoder = bytes_to_unicode()
return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
@@ -10288,7 +10061,7 @@ class ChatGLMModel(TextModel):
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
vocab_size = hparams.get("padded_vocab_size",hparams["vocab_size"])
assert max(tokenizer.get_vocab().values()) < vocab_size # ty: ignore[unresolved-attribute]
assert max(tokenizer.get_vocab().values()) < vocab_size
tokens, toktypes, tokpre = self.get_vocab_base()
self.gguf_writer.add_tokenizer_model("gpt2")
@@ -10297,10 +10070,10 @@ class ChatGLMModel(TextModel):
self.gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
# only add special tokens when they were not already loaded from config.json
special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # ty: ignore[unresolved-attribute]
special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
# this one is usually not in config.json anyway
special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
special_vocab.add_to_gguf(self.gguf_writer)
def set_gguf_parameters(self):
@@ -11421,48 +11194,6 @@ class UltravoxWhisperEncoderModel(WhisperEncoderModel):
self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"])
@ModelBase.register("MERaLiON2ForConditionalGeneration")
class MERaLiONWhisperEncoderModel(WhisperEncoderModel):
has_vision_encoder = False
has_audio_encoder = True
def get_audio_config(self) -> dict[str, Any] | None:
return self.global_config.get("speech_config")
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.MERALION)
self.gguf_writer.add_audio_stack_factor(self.global_config.get("speech_mlp_scale_factor", 15))
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if name.startswith("text_decoder."):
return
if name.startswith("speech_encoder."):
name = name.replace("speech_encoder.", "audio_tower.")
yield from super().modify_tensors(data_torch, name, bid)
return
suffix = "." + name.rsplit(".", 1)[-1]
if name.startswith("ln_speech."):
yield (self.format_tensor_name(gguf.MODEL_TENSOR.A_MM_NORM_PRE, suffix=suffix), data_torch)
return
if name.startswith("speech_audio_adapter."):
if ".mlp_adapter.0." in name:
yield (self.format_tensor_name(gguf.MODEL_TENSOR.A_MMPROJ, 0, suffix=suffix), data_torch)
elif ".gate_proj." in name:
yield (self.format_tensor_name(gguf.MODEL_TENSOR.A_MMPROJ, 1, suffix=suffix), data_torch)
elif ".pool_proj." in name:
yield (self.format_tensor_name(gguf.MODEL_TENSOR.A_MMPROJ, 2, suffix=suffix), data_torch)
elif ".out_proj." in name:
yield (self.format_tensor_name(gguf.MODEL_TENSOR.A_MMPROJ, 3, suffix=suffix), data_torch)
return
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("VoxtralForConditionalGeneration")
class VoxtralWhisperEncoderModel(WhisperEncoderModel):
has_vision_encoder = False # no vision encoder
@@ -11608,7 +11339,7 @@ class HunYuanMoEModel(TextModel):
# 2. Reverse-engineer the merges list from mergeable_ranks
merges = []
vocab = {}
mergeable_ranks = tokenizer.mergeable_ranks # ty: ignore[unresolved-attribute]
mergeable_ranks = tokenizer.mergeable_ranks
for token, rank in mergeable_ranks.items():
vocab[QwenModel.token_bytes_to_string(token)] = rank
if len(token) == 1:
@@ -11619,8 +11350,8 @@ class HunYuanMoEModel(TextModel):
# 3. Generate the tokens and toktypes lists
vocab_size = self.hparams["vocab_size"]
assert tokenizer.vocab_size == vocab_size # ty: ignore[unresolved-attribute]
special_tokens = tokenizer.special_tokens # ty: ignore[unresolved-attribute]
assert tokenizer.vocab_size == vocab_size
special_tokens = tokenizer.special_tokens
reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
tokens: list[str] = []
toktypes: list[int] = []
@@ -11844,7 +11575,7 @@ class HunYuanModel(TextModel):
# 2. Reverse-engineer the merges list from mergeable_ranks
merges = []
vocab = {}
mergeable_ranks = tokenizer.mergeable_ranks # ty: ignore[unresolved-attribute]
mergeable_ranks = tokenizer.mergeable_ranks
for token, rank in mergeable_ranks.items():
vocab[QwenModel.token_bytes_to_string(token)] = rank
if len(token) == 1:
@@ -11855,8 +11586,8 @@ class HunYuanModel(TextModel):
# 3. Generate the tokens and toktypes lists
vocab_size = self.hparams["vocab_size"]
assert tokenizer.vocab_size == vocab_size # ty: ignore[unresolved-attribute]
special_tokens = tokenizer.special_tokens # ty: ignore[unresolved-attribute]
assert tokenizer.vocab_size == vocab_size
special_tokens = tokenizer.special_tokens
reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
tokens: list[str] = []
toktypes: list[int] = []
@@ -13004,44 +12735,13 @@ class SolarOpenModel(Glm4MoeModel):
self.gguf_writer.add_tokenizer_pre(tokpre)
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)
special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<unk>"]) # ty: ignore[unresolved-attribute]
special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|startoftext|>"]) # ty: ignore[unresolved-attribute]
special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|endoftext|>"])
special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<unk>"])
special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|startoftext|>"])
special_vocab.add_to_gguf(self.gguf_writer)
@ModelBase.register("DotsOCRForCausalLM")
class DotsOCRVisionModel(MmprojModel):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
assert self.hparams_vision is not None
self.hparams_vision["image_size"] = 0 # dynamic resolution
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.DOTSOCR)
self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"])
self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"])
self.gguf_writer.add_vision_attention_layernorm_eps(self.find_vparam(["rms_norm_eps"]))
self.gguf_writer.add_vision_projector_scale_factor(self.find_vparam(["spatial_merge_size"]))
self.gguf_writer.add_vision_use_silu(True)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if name.startswith("vision_tower."):
if "vision_tower.blocks." in name and ".mlp." in name:
# note: to avoid naming conflicts in tensor_mapping.py, we need to handle FFN renaming here
# x = F.silu(self.fc1(x)) * self.fc3(x)
# x = self.fc2(x)
# fc1 -> gate, fc2 -> down, fc3 -> up
# mapping original names to Qwen2.5 naming scheme
name = name.replace("vision_tower.blocks.", "visual.blocks.")
name = name.replace(".fc1", ".gate_proj")
name = name.replace(".fc2", ".down_proj")
name = name.replace(".fc3", ".up_proj")
yield from super().modify_tensors(data_torch, name, bid)
###### CONVERSION LOGIC ######
@@ -13294,12 +12994,6 @@ def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> st
# For non-hf Mamba and Mamba2 models
arch = hparams["ssm_cfg"].get("layer", "Mamba") + "ForCausalLM"
# Step3-VL keeps text config under text_config but uses a custom top-level architecture.
# For text conversion we route to a dedicated text-only class.
# TODO: refactor this later to avoid adding exception here
if model_type == ModelType.TEXT and arch == "StepVLForConditionalGeneration":
return arch
# if "architectures" is found in the sub-config, use that instead
if model_type == ModelType.TEXT and text_config.get("architectures") is not None:
arch = text_config["architectures"][0]

View File

@@ -296,7 +296,7 @@ for model in [*pre_computed_hashes, *all_models]:
except Exception as e:
raise OSError(f"Error loading tokenizer for model {name}.") from e
chktok = tokenizer.encode(CHK_TXT) # ty: ignore[unresolved-attribute]
chktok = tokenizer.encode(CHK_TXT)
chkhsh = sha256(str(chktok).encode()).hexdigest()
logger.info(f"model: {name}")
@@ -468,7 +468,7 @@ for model in models:
with open(f"models/ggml-vocab-{name}.gguf.out", "w") as f:
for text in tests:
res = tokenizer.encode(text, add_special_tokens=False) # ty: ignore[unresolved-attribute]
res = tokenizer.encode(text, add_special_tokens=False)
for r in res:
f.write(f" {r}")
f.write("\n")

View File

@@ -402,7 +402,7 @@ if __name__ == '__main__':
# the invocation string includes the "<|start_of_turn|>"
# token, but the adapters themselves were trained to
# activate _after_ that first token, so we drop it here.
alora_invocation_tokens = tokenizer(invocation_string)["input_ids"][1:] # ty: ignore[call-non-callable]
alora_invocation_tokens = tokenizer(invocation_string)["input_ids"][1:]
if alora_invocation_tokens:
logger.debug("GGUF KV: %s = %s", gguf.Keys.Adapter.ALORA_INVOCATION_TOKENS, alora_invocation_tokens)
self.gguf_writer.add_key_value(

View File

@@ -3,7 +3,7 @@
> [!NOTE]
> Performance and memory optimizations, accuracy validation, broader quantization coverage, broader operator and model support are work in progress.
[OpenVINO](https://docs.openvino.ai/) is an open-source toolkit for optimizing and deploying high-performance AI inference, specifically designed for Intel hardware, including CPUs, GPUs, and NPUs, in the cloud, on-premises, and on the edge. [OpenVINO backend for llama.cpp](../../ggml/src/ggml-openvino) enables hardware-accelerated inference on **Intel® CPUs, GPUs, and NPUs** while remaining compatible with the existing **GGUF model ecosystem**. The backend translates GGML compute graphs into OpenVINO graphs and leverages graph compilation, kernel fusion, and device-specific optimizations to improve inference performance on supported Intel hardware.
[OpenVINO](https://docs.openvino.ai/) is an open-source toolkit for optimizing and deploying high-performance AI inference, specifically designed for Intel hardware, including CPUs, GPUs, and NPUs, in the cloud, on-premises, and on the edge. [OpenVINO backend for llama.cpp](../../src/ggml-openvino) enables hardware-accelerated inference on **Intel® CPUs, GPUs, and NPUs** while remaining compatible with the existing **GGUF model ecosystem**. The backend translates GGML compute graphs into OpenVINO graphs and leverages graph compilation, kernel fusion, and device-specific optimizations to improve inference performance on supported Intel hardware.
The OpenVINO backend is implemented in `ggml/src/ggml-openvino` and provides a translation layer for core GGML operations. The OpenVINO backend replaces the standard GGML graph execution path with Intel's OpenVINO inference engine. This approach allows the same GGUF model file to run on Intel CPUs, Intel GPUs (integrated and discrete), and Intel NPUs without changes to the model or the rest of the llama.cpp stack. When a `ggml_cgraph` is dispatched to OpenVINO backend, it:

View File

@@ -52,39 +52,10 @@
}
},
{
"name": "arm64-linux-snapdragon",
"hidden": true,
"architecture": { "value": "arm64", "strategy": "external" },
"toolset": { "value": "host=x86_64", "strategy": "external" },
"cacheVariables": {
"CMAKE_TOOLCHAIN_FILE": "cmake/arm64-linux-clang.cmake",
"CMAKE_C_FLAGS": "-march=armv8 -fno-finite-math-only -flto -D_GNU_SOURCE",
"CMAKE_CXX_FLAGS": "-march=armv8 -fno-finite-math-only -flto -D_GNU_SOURCE",
"CMAKE_C_FLAGS_RELEASE": "-O3 -DNDEBUG",
"CMAKE_CXX_FLAGS_RELEASE": "-O3 -DNDEBUG",
"CMAKE_C_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g",
"CMAKE_CXX_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g",
"CMAKE_PREFIX_PATH": "$env{OPENCL_SDK_ROOT}",
"HEXAGON_SDK_ROOT": "$env{HEXAGON_SDK_ROOT}",
"HEXAGON_TOOLS_ROOT": "$env{HEXAGON_TOOLS_ROOT}",
"PREBUILT_LIB_DIR": "linux_aarch64",
"GGML_OPENMP": "OFF",
"GGML_LLAMAFILE": "OFF",
"GGML_OPENCL": "OFF",
"GGML_HEXAGON": "ON",
"GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE": "128",
"LLAMA_OPENSSL": "OFF"
}
},
{ "name": "arm64-android-snapdragon-debug" , "inherits": [ "base", "arm64-android-snapdragon", "debug" ] },
{ "name": "arm64-android-snapdragon-release", "inherits": [ "base", "arm64-android-snapdragon", "release" ] },
{ "name": "arm64-windows-snapdragon-debug" , "inherits": [ "base", "arm64-windows-snapdragon", "debug" ] },
{ "name": "arm64-windows-snapdragon-release", "inherits": [ "base", "arm64-windows-snapdragon", "release" ] },
{ "name": "arm64-linux-snapdragon-debug" , "inherits": [ "base", "arm64-linux-snapdragon", "debug" ] },
{ "name": "arm64-linux-snapdragon-release", "inherits": [ "base", "arm64-linux-snapdragon", "release" ] }
{ "name": "arm64-windows-snapdragon-release", "inherits": [ "base", "arm64-windows-snapdragon", "release" ] }
]
}

View File

@@ -236,6 +236,10 @@ build: 6a8cf8914 (6733)
Controls whether the Hexagon backend allocates host buffers. By default, all buffers except for REPACK are host buffers.
This option is required for testing Ops that require REPACK buffers (MUL_MAT and MUL_MAT_ID).
- `GGML_HEXAGON_EXPERIMENTAL=1`
Controls whether the Hexagon backend enables experimental features.
This option is required for enabling/testing experimental Ops (FLASH_ATTN_EXT).
- `GGML_HEXAGON_VERBOSE=1`
Enables verbose logging of Ops from the backend. Example output:
@@ -255,17 +259,11 @@ build: 6a8cf8914 (6733)
Allows enabling specific stages of the processing pipeline:
- `0x1` Enable Op Queue (i.e., queuing Ops into NPU)
- `0x2` Enable Op Compute (MUL_MAT, etc.)
- `0x2` Enable Dynamic Quantizer (if needed for the Op)
- `0x4` Enable Op Compute (MUL_MAT, etc.)
Examples:
`GGML_HEXAGON_OPMASK=0x1 llama-completion ...` - Ops are enqueued but NPU-side processing is stubbed out
`GGML_HEXAGON_OPMASK=0x3 llama-completion ...` - Full queuing and processing of Ops (default)
- `GGML_HEXAGON_OPFILTER=regex`
Allows filtering (disabling) Ops that match the regex pattern:
Examples:
`GGML_HEXAGON_OPFILTER="FLASH_ATTN_EXT" llama-completion ...` - Disable Flash Attention on Hexagon (falls back to CPU or GPU)
`GGML_HEXAGON_OPFILTER="ADD\|SUB" llama-completion ...` - Disable ADD and SUB on Hexagon (fall back to CPU or GPU)
`GGML_HEXAGON_OPMASK=0x3 llama-completion ...` - NPU performs dynamic quantization and skips the rest
`GGML_HEXAGON_OPMASK=0x7 llama-completion ...` - Full queuing and processing of Ops (default)

View File

@@ -1,58 +0,0 @@
# Snapdragon-based Linux devices
## Docker Setup
The easiest way to build llama.cpp for a Snapdragon-based Linux device is using the toolchain Docker image (see [github.com/snapdragon-toolchain](https://github.com/snapdragon-toolchain)).
This image includes OpenCL SDK, Hexagon SDK, CMake, and the ARM64 Linux cross-compilation toolchain.
Cross-compilation is supported on **Linux X86** hosts. The resulting binaries are deployed to and run on the target **Qualcomm Snapdragon ARM64 Linux** device.
```
~/src/llama.cpp$ docker run -it -u $(id -u):$(id -g) --volume $(pwd):/workspace --platform linux/amd64 ghcr.io/snapdragon-toolchain/arm64-linux:v0.1
[d]/> cd /workspace
```
Note: The rest of the **Linux** build process assumes that you're running inside the toolchain container.
## How to Build
Let's build llama.cpp with CPU, OpenCL, and Hexagon backends via CMake presets:
```
[d]/workspace> cp docs/backend/snapdragon/CMakeUserPresets.json .
[d]/workspace> cmake --preset arm64-linux-snapdragon-release -B build-snapdragon
[d]/workspace> cmake --build build-snapdragon -j $(nproc)
```
To generate an installable "package" simply use cmake --install, then zip it:
```
[d]/workspace> cmake --install build-snapdragon --prefix pkg-snapdragon
[d]/workspace> zip -r pkg-snapdragon.zip pkg-snapdragon
```
## How to Install
For this step, you will deploy the built binaries and libraries to the target Linux device. Transfer `pkg-snapdragon.zip` to the target device, then unzip it and set up the environment variables:
```
$ unzip pkg-snapdragon.zip
$ cd pkg-snapdragon
$ export LD_LIBRARY_PATH=./lib
$ export ADSP_LIBRARY_PATH=./lib
```
At this point, you should also download some models onto the device:
```
$ wget https://huggingface.co/bartowski/Llama-3.2-3B-Instruct-GGUF/resolve/main/Llama-3.2-3B-Instruct-Q4_0.gguf
```
## How to Run
Next, since we have setup the environment variables, we can run the llama-cli with the Hexagon backends:
```
$ ./bin/llama-cli -m Llama-3.2-3B-Instruct-Q4_0.gguf --device HTP0 -ngl 99 -p "what is the most popular cookie in the world?"
```

View File

@@ -5,7 +5,6 @@ Adding a model requires few steps:
1. Convert the model to GGUF
2. Define the model architecture in `llama.cpp`
3. Build the GGML graph implementation
4. Optional: Add multimodal encoder implementation
After following these steps, you can open PR.
@@ -115,21 +114,6 @@ Some `ggml` backends do not support all operations. Backend implementations can
Note: to debug the inference graph: you can use [llama-eval-callback](/examples/eval-callback/).
### 4. Optional: Add multimodal encoder implementation
If the new model supports multimodal inputs, you will need to add a new encoder definition in `libmtmd`. You can find more information about llama.cpp's multimodal support in [the docs](../multimodal.md) and in the `tools/mtmd` source directory.
1. In the conversion script, make sure you add a subclass that extends `MmprojModel` or another class that inherits from the same base class.
2. Add the encoder definition in `clip.cpp`.
3. Implement the preprocessor in `mtmd.cpp`. In most cases, you can reuse an existing preprocessor.
4. Implement the encoder GGML graph, either in a dedicated file if the model is truly different from existing ones, or by reusing an existing implementation (for example: siglip, pixtral, or qwen) and adding a model-specific projector.
Note:
- Many multimodal encoders are based on models that are already supported. Make sure to read the existing encoder definitions in `tools/mtmd/models` before adding a new one. In `libmtmd`, it is generally better to extend an existing model than to duplicate code.
- To debug the multimodal preprocessor and encoder, you can use [llama-mtmd-debug](tools/mtmd/debug/mtmd-debug.cpp).
- Adding a model-specific API or CLI is an anti-pattern in `libmtmd`. The goal of `libmtmd` is to provide an easy-to-use, model-agnostic library for multimodal pipeline.
- In most cases, `llama-mtmd-cli` should not be modified. If a model requires a specific prompt, either let the user provide it or bake it into the Jinja chat template.
## GGUF specification
https://github.com/ggml-org/ggml/blob/master/docs/gguf.md

View File

@@ -37,7 +37,6 @@ llama-server -hf ggml-org/gemma-3-4b-it-GGUF --no-mmproj-offload
> - PaddleOCR-VL: https://github.com/ggml-org/llama.cpp/pull/18825
> - GLM-OCR: https://github.com/ggml-org/llama.cpp/pull/19677
> - Deepseek-OCR: https://github.com/ggml-org/llama.cpp/pull/17400
> - Dots.OCR: https://github.com/ggml-org/llama.cpp/pull/17575
> - HunyuanOCR: https://github.com/ggml-org/llama.cpp/pull/21395
## Pre-quantized models
@@ -94,11 +93,6 @@ NOTE: some models may require large context window, for example: `-c 8192`
# Moondream2 20250414 version
(tool_name) -hf ggml-org/moondream2-20250414-GGUF
# Gemma 4
(tool_name) -hf ggml-org/gemma-4-E2B-it-GGUF
(tool_name) -hf ggml-org/gemma-4-E4B-it-GGUF
(tool_name) -hf ggml-org/gemma-4-26B-A4B-it-GGUF
(tool_name) -hf ggml-org/gemma-4-31B-it-GGUF
```
**Audio models**:
@@ -114,10 +108,6 @@ NOTE: some models may require large context window, for example: `-c 8192`
# Mistral's Voxtral
(tool_name) -hf ggml-org/Voxtral-Mini-3B-2507-GGUF
# Qwen3-ASR
(tool_name) -hf ggml-org/Qwen3-ASR-0.6B-GGUF
(tool_name) -hf ggml-org/Qwen3-ASR-1.7B-GGUF
```
**Mixed modalities**:
@@ -127,16 +117,6 @@ NOTE: some models may require large context window, for example: `-c 8192`
# Capabilities: audio input, vision input
(tool_name) -hf ggml-org/Qwen2.5-Omni-3B-GGUF
(tool_name) -hf ggml-org/Qwen2.5-Omni-7B-GGUF
# Qwen3 Omni
# Capabilities: audio input, vision input
(tool_name) -hf ggml-org/Qwen3-Omni-30B-A3B-Instruct-GGUF
(tool_name) -hf ggml-org/Qwen3-Omni-30B-A3B-Thinking-GGUF
# Gemma 4
# Capabilities: audio input, vision input
(tool_name) -hf ggml-org/gemma-4-E2B-it-GGUF
(tool_name) -hf ggml-org/gemma-4-E4B-it-GGUF
```
## Finding more models:

View File

@@ -9,7 +9,6 @@
#include <vector>
#include <filesystem>
#include <fstream>
#include <optional>
#include <regex>
static void print_usage(int /*argc*/, char ** argv) {
@@ -223,10 +222,7 @@ int main(int argc, char ** argv) {
llama_backend_init();
llama_numa_init(params.numa);
std::optional<base_callback_data> cb_data;
if (!params.save_logits) {
cb_data.emplace(params, params.tensor_filter);
}
base_callback_data cb_data(params, params.tensor_filter);
auto llama_init = common_init_from_params(params);

View File

@@ -53,10 +53,10 @@ model_name = os.path.basename(model_path)
print(f"Model name: {model_name}")
prompt = "Hello world today"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids # ty: ignore[call-non-callable]
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
print(f"Input tokens: {input_ids}")
print(f"Input text: {repr(prompt)}")
print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}") # ty: ignore[unresolved-attribute]
print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}")
with torch.no_grad():
outputs = model(input_ids, output_hidden_states=True)
@@ -92,7 +92,7 @@ with torch.no_grad():
# Print embeddings per token in the requested format
print("\nToken embeddings:")
tokens = tokenizer.convert_ids_to_tokens(input_ids[0]) # ty: ignore[unresolved-attribute]
tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
for i, embedding in enumerate(token_embeddings):
# Format: show first few values, ..., then last few values
if len(embedding) > 10:

View File

@@ -207,8 +207,8 @@ def main():
else:
model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True)
encoded = tokenizer(prompt, return_tensors="pt") # ty: ignore[call-non-callable]
tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0]) # ty: ignore[unresolved-attribute]
encoded = tokenizer(prompt, return_tensors="pt")
tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])
n_tokens = len(tokens)
print(f"n_tokens: {n_tokens}");
print(f"hidden_size: {model.config.hidden_size}")

View File

@@ -7,8 +7,6 @@ set(GGML_VERSION_MINOR 9)
set(GGML_VERSION_PATCH 11)
set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)
if(GIT_EXE)
# Get current git commit hash
@@ -206,14 +204,12 @@ option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM"
option(GGML_CUDA_FA "ggml: compile ggml FlashAttention CUDA kernels" ON)
option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
option(GGML_CUDA_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" ${GGML_CUDA_GRAPHS_DEFAULT})
option(GGML_CUDA_NCCL "ggml: use NVIDIA Collective Comm. Library" ON)
set (GGML_CUDA_COMPRESSION_MODE "size" CACHE STRING
"ggml: cuda link binary compression mode; requires cuda 12.8+")
set_property(CACHE GGML_CUDA_COMPRESSION_MODE PROPERTY STRINGS "none;speed;balance;size")
option(GGML_HIP "ggml: use HIP" OFF)
option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF)
option(GGML_HIP_RCCL "ggml: use ROCm Collective Comm. Library" OFF)
option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
option(GGML_HIP_MMQ_MFMA "ggml: enable MFMA MMA for CDNA in MMQ" ON)

View File

@@ -1,36 +0,0 @@
# cmake/FindNCCL.cmake
# NVIDIA does not distribute CMake files with NCCl, therefore use this file to find it instead.
find_path(NCCL_INCLUDE_DIR
NAMES nccl.h
HINTS ${NCCL_ROOT} $ENV{NCCL_ROOT} $ENV{CUDA_HOME} /usr/local/cuda
PATH_SUFFIXES include
)
find_library(NCCL_LIBRARY
NAMES nccl
HINTS ${NCCL_ROOT} $ENV{NCCL_ROOT} $ENV{CUDA_HOME} /usr/local/cuda
PATH_SUFFIXES lib lib64
)
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(NCCL
DEFAULT_MSG
NCCL_LIBRARY NCCL_INCLUDE_DIR
)
if(NCCL_FOUND)
set(NCCL_LIBRARIES ${NCCL_LIBRARY})
set(NCCL_INCLUDE_DIRS ${NCCL_INCLUDE_DIR})
if(NOT TARGET NCCL::NCCL)
add_library(NCCL::NCCL UNKNOWN IMPORTED)
set_target_properties(NCCL::NCCL PROPERTIES
IMPORTED_LOCATION "${NCCL_LIBRARY}"
INTERFACE_INCLUDE_DIRECTORIES "${NCCL_INCLUDE_DIR}"
)
endif()
endif()
mark_as_advanced(NCCL_INCLUDE_DIR NCCL_LIBRARY)

View File

@@ -68,7 +68,7 @@ extern "C" {
GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
// tensor copy between different backends
GGML_API void ggml_backend_tensor_copy(const struct ggml_tensor * src, struct ggml_tensor * dst);
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
//
// Backend (stream)
@@ -83,17 +83,13 @@ extern "C" {
GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend);
GGML_API size_t ggml_backend_get_max_size(ggml_backend_t backend);
GGML_API void ggml_backend_tensor_set_async (ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get_async (ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_set_2d_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
GGML_API void ggml_backend_tensor_get_2d_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
// "offset" refers to the offset in tensor->data for setting/getting data
GGML_API void ggml_backend_tensor_set ( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get (const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_set_2d( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
GGML_API void ggml_backend_tensor_get_2d(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
GGML_API void ggml_backend_tensor_memset( struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_memset( struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
@@ -113,7 +109,7 @@ extern "C" {
// the copy is performed after all the currently queued operations in backend_src
// backend_dst will wait for the copy to complete before performing other operations
// automatic fallback to sync copy if async is not supported
GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);
GGML_API ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend);
@@ -139,9 +135,7 @@ extern "C" {
// integrated GPU device using host memory
GGML_BACKEND_DEVICE_TYPE_IGPU,
// accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
GGML_BACKEND_DEVICE_TYPE_ACCEL,
// "meta" device wrapping multiple other devices for tensor parallelism
GGML_BACKEND_DEVICE_TYPE_META,
GGML_BACKEND_DEVICE_TYPE_ACCEL
};
// functionality supported by the device
@@ -202,9 +196,7 @@ extern "C" {
// Common functions that may be obtained using ggml_backend_reg_get_proc_address
// AllReduce operation for tensor parallelism (meta backend)
typedef bool (*ggml_backend_allreduce_tensor_t)(ggml_backend_t * backends, struct ggml_tensor ** tensors, size_t n_backends);
// Split buffer type for tensor parallelism (old)
// Split buffer type for tensor parallelism
typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
// Set the number of threads for the backend
typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);

View File

@@ -27,9 +27,6 @@ GGML_BACKEND_API bool ggml_backend_is_cuda(ggml_backend_t backend);
// device buffer
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
// conduct allreduce operation between devices
GGML_BACKEND_API bool ggml_backend_cuda_allreduce_tensor(ggml_backend_t * backends, struct ggml_tensor ** tensors, size_t n_backends);
// split tensor buffer that splits matrices by rows across multiple devices
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split);

View File

@@ -200,7 +200,6 @@ add_library(ggml-base
ggml.cpp
ggml-alloc.c
ggml-backend.cpp
ggml-backend-meta.cpp
ggml-opt.cpp
ggml-threading.cpp
ggml-threading.h

View File

@@ -1236,9 +1236,6 @@ size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx,
ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
size_t nbytes_total = 0;
if (ggml_backend_buft_is_meta(buft)) {
return ggml_backend_meta_alloc_ctx_tensors_from_buft(ctx, buft);
}
return ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*no_alloc =*/ false);
}

View File

@@ -49,10 +49,6 @@ extern "C" {
void (*memset_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
// (optional) 2d data copies
void (*set_tensor_2d)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
void (*get_tensor_2d)(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
// (optional) tensor copy: dst is in the buffer, src may be in any buffer, including buffers from a different backend (return false if not supported)
bool (*cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst);
// clear the entire buffer
@@ -84,20 +80,6 @@ extern "C" {
GGML_API bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
//
// Backend (meta)
//
GGML_API bool ggml_backend_is_meta (ggml_backend_t backend);
GGML_API bool ggml_backend_buffer_is_meta(ggml_backend_buffer_t buf);
GGML_API bool ggml_backend_buft_is_meta (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_meta_n_backends (ggml_backend_t meta_backend);
GGML_API ggml_backend_t ggml_backend_meta_simple_backend(ggml_backend_t meta_backend, size_t index);
// temporary workaround to statically allocate tensors from a context in a deduplicated way:
GGML_API struct ggml_backend_buffer * ggml_backend_meta_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
//
// Backend (stream)
//
@@ -108,10 +90,8 @@ extern "C" {
void (*free)(ggml_backend_t backend);
// (optional) asynchronous tensor data access
void (*set_tensor_async) (ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*get_tensor_async) (ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
void (*set_tensor_2d_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
void (*get_tensor_2d_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
// (optional) complete all pending operations (required if the backend supports async operations)

File diff suppressed because it is too large Load Diff

View File

@@ -123,7 +123,7 @@ size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
GGML_ASSERT(buffer);
// get_base is optional if the buffer is zero-sized
if (!ggml_backend_buffer_is_meta(buffer) && buffer->size == 0) {
if (buffer->size == 0) {
return NULL;
}
@@ -279,57 +279,15 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten
}
}
void ggml_backend_tensor_set_2d_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size,
size_t n_copies, size_t stride_tensor, size_t stride_data) {
GGML_ASSERT(backend);
GGML_ASSERT(tensor);
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
if (n_copies <= 1 || backend->iface.set_tensor_2d_async == NULL) {
for (size_t i = 0; i < n_copies; i++) {
ggml_backend_tensor_set_async(backend, tensor, (const char *) data + i*stride_data, offset + i*stride_tensor, size);
}
return;
}
if (size == 0) {
return;
}
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(offset + (n_copies-1)*stride_tensor + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
backend->iface.set_tensor_2d_async(backend, tensor, data, offset, size, n_copies, stride_tensor, stride_data);
}
void ggml_backend_tensor_get_2d_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size,
size_t n_copies, size_t stride_tensor, size_t stride_data) {
GGML_ASSERT(backend);
GGML_ASSERT(tensor);
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
if (n_copies <= 1 || backend->iface.set_tensor_2d_async == NULL) {
for (size_t i = 0; i < n_copies; i++) {
ggml_backend_tensor_get_async(backend, tensor, (char *) data + i*stride_data, offset + i*stride_tensor, size);
}
return;
}
if (size == 0) {
return;
}
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(offset + (n_copies-1)*stride_tensor + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
backend->iface.get_tensor_2d_async(backend, tensor, data, offset, size, n_copies, stride_tensor, stride_data);
}
void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
GGML_ASSERT(tensor);
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
GGML_ASSERT(buf != NULL && "tensor buffer not set");
if (size == 0) {
return;
}
GGML_ASSERT(buf != NULL && "tensor buffer not set");
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
@@ -339,62 +297,18 @@ void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, siz
void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
GGML_ASSERT(tensor);
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
GGML_ASSERT(buf != NULL && "tensor buffer not set");
if (size == 0) {
return;
}
GGML_ASSERT(buf != NULL && "tensor buffer not set");
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
buf->iface.get_tensor(buf, tensor, data, offset, size);
}
void ggml_backend_tensor_set_2d(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size,
size_t n_copies, size_t stride_tensor, size_t stride_data) {
GGML_ASSERT(tensor);
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
GGML_ASSERT(buf != NULL && "tensor buffer not set");
if (n_copies <= 1 || buf->iface.set_tensor_2d == NULL) {
for (size_t i = 0; i < n_copies; i++) {
ggml_backend_tensor_set(tensor, (const char *) data + i*stride_data, offset + i*stride_tensor, size);
}
return;
}
if (size == 0) {
return;
}
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(offset + (n_copies-1)*stride_tensor + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
buf->iface.set_tensor_2d(buf, tensor, data, offset, size, n_copies, stride_tensor, stride_data);
}
void ggml_backend_tensor_get_2d(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size,
size_t n_copies, size_t stride_tensor, size_t stride_data) {
GGML_ASSERT(tensor);
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
GGML_ASSERT(buf != NULL && "tensor buffer not set");
if (n_copies <= 1 || buf->iface.set_tensor_2d == NULL) {
for (size_t i = 0; i < n_copies; i++) {
ggml_backend_tensor_get(tensor, (char *) data + i*stride_data, offset + i*stride_tensor, size);
}
return;
}
if (size == 0) {
return;
}
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(offset + (n_copies-1)*stride_tensor + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
buf->iface.get_tensor_2d(buf, tensor, data, offset, size, n_copies, stride_tensor, stride_data);
}
void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
GGML_ASSERT(tensor);
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
@@ -474,7 +388,7 @@ ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {
// backend copy
void ggml_backend_tensor_copy(const struct ggml_tensor * src, struct ggml_tensor * dst) {
void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
if (src == dst) {
@@ -488,7 +402,7 @@ void ggml_backend_tensor_copy(const struct ggml_tensor * src, struct ggml_tensor
} else if (!ggml_backend_buffer_copy_tensor(src, dst)) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
#endif // NDEBUG
#endif
size_t nbytes = ggml_nbytes(src);
void * data = malloc(nbytes);
ggml_backend_tensor_get(src, data, 0, nbytes);
@@ -497,7 +411,7 @@ void ggml_backend_tensor_copy(const struct ggml_tensor * src, struct ggml_tensor
}
}
void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst) {
void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst) {
GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
if (src == dst) {
@@ -586,7 +500,6 @@ enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device) {
}
void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props) {
GGML_ASSERT(device);
memset(props, 0, sizeof(*props));
device->iface.get_props(device, props);
}
@@ -697,8 +610,6 @@ static const struct ggml_backend_buffer_i ggml_backend_multi_buffer_i = {
/* .memset_tensor = */ NULL,
/* .set_tensor = */ NULL,
/* .get_tensor = */ NULL,
/* .set_tensor_2d = */ NULL,
/* .get_tensor_2d = */ NULL,
/* .cpy_tensor = */ NULL,
/* .clear = */ ggml_backend_multi_buffer_clear,
/* .reset = */ NULL,
@@ -1988,9 +1899,8 @@ enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct
GGML_ASSERT(tensor->data == NULL);
GGML_ASSERT(tensor->view_src == NULL);
GGML_ASSERT(addr >= ggml_backend_buffer_get_base(buffer));
GGML_ASSERT(ggml_backend_buffer_is_meta(buffer) ||
(char *) addr + ggml_backend_buffer_get_alloc_size(buffer, tensor) <=
(char *) ggml_backend_buffer_get_base(buffer) + ggml_backend_buffer_get_size(buffer));
GGML_ASSERT((char *)addr + ggml_backend_buffer_get_alloc_size(buffer, tensor) <=
(char *)ggml_backend_buffer_get_base(buffer) + ggml_backend_buffer_get_size(buffer));
tensor->buffer = buffer;
tensor->data = addr;
@@ -2264,8 +2174,6 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
/* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
/* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
/* .set_tensor_2d = */ NULL,
/* .get_tensor_2d = */ NULL,
/* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
/* .clear = */ ggml_backend_cpu_buffer_clear,
/* .reset = */ NULL,
@@ -2278,8 +2186,6 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
/* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
/* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
/* .set_tensor_2d = */ NULL,
/* .get_tensor_2d = */ NULL,
/* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
/* .clear = */ ggml_backend_cpu_buffer_clear,
/* .reset = */ NULL,

View File

@@ -262,8 +262,6 @@ static struct ggml_backend_i blas_backend_i = {
/* .get_name = */ ggml_backend_blas_get_name,
/* .free = */ ggml_backend_blas_free,
/* .set_tensor_async = */ NULL,
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .synchronize = */ NULL,

View File

@@ -1457,8 +1457,6 @@ static const ggml_backend_buffer_i ggml_backend_cann_buffer_interface = {
/* .memset_tensor = */ NULL,
/* .set_tensor = */ ggml_backend_cann_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_cann_buffer_get_tensor,
/* .set_tensor_2d = */ NULL,
/* .get_tensor_2d = */ NULL,
/* .cpy_tensor = */ ggml_backend_cann_buffer_cpy_tensor,
/* .clear = */ ggml_backend_cann_buffer_clear,
/* .reset = */ NULL,
@@ -2700,8 +2698,6 @@ static const ggml_backend_i ggml_backend_cann_interface = {
/* .free = */ ggml_backend_cann_free,
/* .set_tensor_async = */ ggml_backend_cann_set_tensor_async,
/* .get_tensor_async = */ ggml_backend_cann_get_tensor_async,
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ ggml_backend_cann_cpy_tensor_async,
/* .synchronize = */ ggml_backend_cann_synchronize,
/* .graph_plan_create = */ NULL,

View File

@@ -111,8 +111,6 @@ static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
/* .memset_tensor = */ ggml_backend_amx_buffer_memset_tensor,
/* .set_tensor = */ ggml_backend_amx_buffer_set_tensor,
/* .get_tensor = */ nullptr,
/* .set_tensor_2d = */ nullptr,
/* .get_tensor_2d = */ nullptr,
/* .cpy_tensor = */ nullptr,
/* .clear = */ ggml_backend_amx_buffer_clear,
/* .reset = */ nullptr,

View File

@@ -195,8 +195,6 @@ static const struct ggml_backend_i ggml_backend_cpu_i = {
/* .free = */ ggml_backend_cpu_free,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,

View File

@@ -664,7 +664,6 @@ void ggml_compute_forward_add(
{
ggml_compute_forward_add_non_quantized(params, dst);
} break;
case GGML_TYPE_Q1_0:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
@@ -1114,7 +1113,6 @@ void ggml_compute_forward_add1(
GGML_ABORT("fatal error");
}
} break;
case GGML_TYPE_Q1_0:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
@@ -1244,7 +1242,6 @@ void ggml_compute_forward_acc(
} break;
case GGML_TYPE_F16:
case GGML_TYPE_BF16:
case GGML_TYPE_Q1_0:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
@@ -4334,7 +4331,6 @@ void ggml_compute_forward_out_prod(
const ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_Q1_0:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
@@ -4610,7 +4606,6 @@ void ggml_compute_forward_set(
} break;
case GGML_TYPE_F16:
case GGML_TYPE_BF16:
case GGML_TYPE_Q1_0:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:

View File

@@ -181,16 +181,6 @@ if (CUDAToolkit_FOUND)
target_link_libraries(ggml-cuda PRIVATE CUDA::cuda_driver)
endif()
if (GGML_CUDA_NCCL)
find_package(NCCL)
if (NCCL_FOUND)
add_compile_definitions(GGML_USE_NCCL)
target_link_libraries(ggml-cuda PRIVATE NCCL::NCCL)
else()
message(STATUS "Warning: NCCL not found, performance for multiple CUDA GPUs will be suboptimal")
endif()
endif()
set(CUDA_CXX_FLAGS "")
set(CUDA_FLAGS -use_fast_math -extended-lambda)

View File

@@ -58,48 +58,26 @@ void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
size_t temp_storage_bytes = 0;
bool is_capturing = false;
#ifdef USE_CUDA_GRAPH
// Currently (confirmed for CCCL <= 3.2) DeviceSegmentedSort does not support stream capture, while DeviceSegmentedRadixSort does.
// See https://github.com/NVIDIA/cccl/issues/5661#issuecomment-3229037149
// TODO: constrain this to the CCCL versions that have this issue once it's resolved in a future CCCL release.
cudaStreamCaptureStatus capture_status;
CUDA_CHECK(cudaStreamIsCapturing(stream, &capture_status));
is_capturing = (capture_status != cudaStreamCaptureStatusNone);
#endif // USE_CUDA_GRAPH
if (order == GGML_SORT_ORDER_ASC) {
if (nrows == 1) {
CUDA_CHECK(DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, temp_keys, temp_keys, // keys (in-place)
temp_indices, dst, // values (indices)
ncols, 0, sizeof(float) * 8, stream));
} else if (is_capturing) {
CUDA_CHECK(DeviceSegmentedRadixSort::SortPairs(
nullptr, temp_storage_bytes, temp_keys, temp_keys, // keys (in-place)
temp_indices, dst, // values (indices)
ncols * nrows, nrows, // num items, num segments
offset_iterator, offset_iterator + 1, 0, sizeof(float) * 8, stream));
DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, temp_keys, temp_keys, // keys (in-place)
temp_indices, dst, // values (indices)
ncols, 0, sizeof(float) * 8, stream);
} else {
CUDA_CHECK(DeviceSegmentedSort::SortPairs(nullptr, temp_storage_bytes, temp_keys,
temp_keys, // keys (in-place)
temp_indices, dst, // values (indices)
ncols * nrows, nrows, // num items, num segments
offset_iterator, offset_iterator + 1, stream));
DeviceSegmentedSort::SortPairs(nullptr, temp_storage_bytes, temp_keys, temp_keys, // keys (in-place)
temp_indices, dst, // values (indices)
ncols * nrows, nrows, // num items, num segments
offset_iterator, offset_iterator + 1, stream);
}
} else {
if (nrows == 1) {
CUDA_CHECK(DeviceRadixSort::SortPairsDescending(nullptr, temp_storage_bytes, temp_keys,
temp_keys, // keys (in-place)
temp_indices, dst, // values (indices)
ncols, 0, sizeof(float) * 8, stream));
} else if (is_capturing) {
CUDA_CHECK(DeviceSegmentedRadixSort::SortPairsDescending(
nullptr, temp_storage_bytes, temp_keys, temp_keys, temp_indices, dst, ncols * nrows, nrows,
offset_iterator, offset_iterator + 1, 0, sizeof(float) * 8, stream));
DeviceRadixSort::SortPairsDescending(nullptr, temp_storage_bytes, temp_keys, temp_keys, // keys (in-place)
temp_indices, dst, // values (indices)
ncols, 0, sizeof(float) * 8, stream);
} else {
CUDA_CHECK(DeviceSegmentedSort::SortPairsDescending(nullptr, temp_storage_bytes, temp_keys, temp_keys,
temp_indices, dst, ncols * nrows, nrows,
offset_iterator, offset_iterator + 1, stream));
DeviceSegmentedSort::SortPairsDescending(nullptr, temp_storage_bytes, temp_keys, temp_keys, temp_indices,
dst, ncols * nrows, nrows, offset_iterator, offset_iterator + 1,
stream);
}
}
@@ -108,33 +86,22 @@ void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
if (order == GGML_SORT_ORDER_ASC) {
if (nrows == 1) {
CUDA_CHECK(DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, temp_keys,
temp_keys, // keys (in-place)
temp_indices, dst, // values (indices)
ncols, 0, sizeof(float) * 8, stream));
} else if (is_capturing) {
CUDA_CHECK(DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys,
temp_indices, dst, ncols * nrows, nrows, offset_iterator,
offset_iterator + 1, 0, sizeof(float) * 8, stream));
DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys, // keys (in-place)
temp_indices, dst, // values (indices)
ncols, 0, sizeof(float) * 8, stream);
} else {
CUDA_CHECK(DeviceSegmentedSort::SortPairs(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys,
temp_indices, dst, ncols * nrows, nrows, offset_iterator,
offset_iterator + 1, stream));
DeviceSegmentedSort::SortPairs(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys, temp_indices, dst,
ncols * nrows, nrows, offset_iterator, offset_iterator + 1, stream);
}
} else {
if (nrows == 1) {
CUDA_CHECK(DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, temp_keys,
temp_keys, // keys (in-place)
temp_indices, dst, // values (indices)
ncols, 0, sizeof(float) * 8, stream));
} else if (is_capturing) {
CUDA_CHECK(DeviceSegmentedRadixSort::SortPairsDescending(
d_temp_storage, temp_storage_bytes, temp_keys, temp_keys, temp_indices, dst, ncols * nrows, nrows,
offset_iterator, offset_iterator + 1, 0, sizeof(float) * 8, stream));
DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys, // keys (in-place)
temp_indices, dst, // values (indices)
ncols, 0, sizeof(float) * 8, stream);
} else {
CUDA_CHECK(DeviceSegmentedSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, temp_keys,
temp_keys, temp_indices, dst, ncols * nrows, nrows,
offset_iterator, offset_iterator + 1, stream));
DeviceSegmentedSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys,
temp_indices, dst, ncols * nrows, nrows, offset_iterator,
offset_iterator + 1, stream);
}
}
}

View File

@@ -472,36 +472,6 @@ void ggml_cuda_op_fused_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst,
}
}
void ggml_cuda_op_fused_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst, int n_fuse) {
GGML_ASSERT(2 <= n_fuse && n_fuse <= 8);
switch (n_fuse) {
case 2:
ggml_cuda_op_fused_binbcast_impl<op_mul, 2>(ctx, dst);
break;
case 3:
ggml_cuda_op_fused_binbcast_impl<op_mul, 3>(ctx, dst);
break;
case 4:
ggml_cuda_op_fused_binbcast_impl<op_mul, 4>(ctx, dst);
break;
case 5:
ggml_cuda_op_fused_binbcast_impl<op_mul, 5>(ctx, dst);
break;
case 6:
ggml_cuda_op_fused_binbcast_impl<op_mul, 6>(ctx, dst);
break;
case 7:
ggml_cuda_op_fused_binbcast_impl<op_mul, 7>(ctx, dst);
break;
case 8:
ggml_cuda_op_fused_binbcast_impl<op_mul, 8>(ctx, dst);
break;
default:
GGML_ASSERT(false && "Unsupported n_fuse value");
}
}
void ggml_cuda_op_repeat_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];

View File

@@ -9,4 +9,3 @@ void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_repeat_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_fused_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst, int n_fuse);
void ggml_cuda_op_fused_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst, int n_fuse);

View File

@@ -67,7 +67,6 @@
#define GGML_CUDA_CC_CDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x908) // MI100, minimum for MFMA, acc registers
#define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x90a) // MI210 (gfx90a), minimum acc register renaming
#define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x942) // MI300
#define GGML_CUDA_CC_CDNA4 (GGML_CUDA_CC_OFFSET_AMD + 0x950) // MI350X/MI355X
// RDNA removes MFMA, dp4a, xnack, acc registers, wave size is 32
#define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000
@@ -88,8 +87,7 @@
#define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_RDNA1)
#define GGML_CUDA_CC_IS_CDNA1(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_CDNA2)
#define GGML_CUDA_CC_IS_CDNA2(cc) (cc >= GGML_CUDA_CC_CDNA2 && cc < GGML_CUDA_CC_CDNA3)
#define GGML_CUDA_CC_IS_CDNA3(cc) (cc >= GGML_CUDA_CC_CDNA3 && cc < GGML_CUDA_CC_CDNA4)
#define GGML_CUDA_CC_IS_CDNA4(cc) (cc >= GGML_CUDA_CC_CDNA4 && cc < GGML_CUDA_CC_RDNA1)
#define GGML_CUDA_CC_IS_CDNA3(cc) (cc >= GGML_CUDA_CC_CDNA3 && cc < GGML_CUDA_CC_RDNA1)
// Moore Threads
#define MUSART_HMASK 40300 // MUSA rc4.3, min. ver. for half2 -> uint mask comparisons
@@ -188,10 +186,6 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)
#ifdef GGML_USE_NCCL
#define NCCL_CHECK(err) CUDA_CHECK_GEN(err, ncclSuccess, ncclGetErrorString)
#endif // GGML_USE_NCCL
#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
static const char * cu_get_error_str(CUresult err) {
const char * err_str;
@@ -1092,10 +1086,6 @@ struct ggml_cuda_device_info {
cuda_device_info devices[GGML_CUDA_MAX_DEVICES] = {};
std::array<float, GGML_CUDA_MAX_DEVICES> default_tensor_split = {};
#ifdef GGML_USE_NCCL
ncclComm_t comms[GGML_CUDA_MAX_DEVICES];
#endif // GGML_USE_NCCL
};
const ggml_cuda_device_info & ggml_cuda_info();
@@ -1183,13 +1173,7 @@ struct ggml_cuda_graph {
std::vector<cudaGraphNode_t> nodes;
bool disable_due_to_gpu_arch = false;
bool warmup_complete = false;
struct node_properties {
ggml_tensor node;
void * node_src_data_ptrs[GGML_MAX_SRC];
int64_t node_src_ne[GGML_MAX_SRC][GGML_MAX_DIMS];
size_t node_src_nb[GGML_MAX_SRC][GGML_MAX_DIMS];
};
std::vector<node_properties> node_props;
std::vector<ggml_tensor> nodes_copy;
bool is_enabled() const {
static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);

View File

@@ -75,17 +75,13 @@ static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2(ggml_backend_cuda_con
return;
}
if constexpr (DKQ <= 256) {
if (use_gqa_opt && gqa_ratio % 2 == 0) {
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 2>(ctx, dst);
return;
}
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 1>(ctx, dst);
if (use_gqa_opt && gqa_ratio % 2 == 0) {
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 2>(ctx, dst);
return;
} else {
GGML_ABORT("fatal error");
}
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 1>(ctx, dst);
return;
}
if (use_gqa_opt && gqa_ratio > 4) {
@@ -98,16 +94,12 @@ static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2(ggml_backend_cuda_con
return;
}
if constexpr (DKQ <= 256) {
if (use_gqa_opt && gqa_ratio > 1) {
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 2>(ctx, dst);
return;
}
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 1>(ctx, dst);
} else {
GGML_ABORT("fatal error");
if (use_gqa_opt && gqa_ratio > 1) {
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 2>(ctx, dst);
return;
}
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 1>(ctx, dst);
}
static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

View File

@@ -324,28 +324,6 @@ static ggml_cuda_device_info ggml_cuda_init() {
// configure logging to stdout
// CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
for (int id = 0; id < info.device_count; ++id) {
ggml_cuda_set_device(id);
for (int id_other = 0; id_other < info.device_count; ++id_other) {
if (id == id_other) {
continue;
}
int can_access_peer;
CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
if (can_access_peer) {
CUDA_CHECK(cudaDeviceEnablePeerAccess(id_other, 0));
}
}
}
#ifdef GGML_USE_NCCL
int dev_ids[GGML_CUDA_MAX_DEVICES];
for (int id = 0; id < info.device_count; ++id) {
dev_ids[id] = id;
}
NCCL_CHECK(ncclCommInitAll(info.comms, info.device_count, dev_ids));
#endif // GGML_USE_NCCL
return info;
}
@@ -654,46 +632,26 @@ static enum ggml_status ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer
}
static void ggml_backend_cuda_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *) buffer->context;
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
ggml_cuda_set_device(ctx->device);
CUDA_CHECK(cudaMemsetAsync((char *) tensor->data + offset, value, size, cudaStreamPerThread));
CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + offset, value, size, cudaStreamPerThread));
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
}
static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *) buffer->context;
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
ggml_cuda_set_device(ctx->device);
CUDA_CHECK(cudaMemcpyAsync((char *) tensor->data + offset, data, size, cudaMemcpyHostToDevice, cudaStreamPerThread));
CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cudaStreamPerThread));
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
}
static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *) buffer->context;
ggml_cuda_set_device(ctx->device);
CUDA_CHECK(cudaMemcpyAsync(data, (const char *) tensor->data + offset, size, cudaMemcpyDeviceToHost, cudaStreamPerThread));
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
}
static void ggml_backend_cuda_buffer_set_tensor_2d(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data,
size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *) buffer->context;
ggml_cuda_set_device(ctx->device);
CUDA_CHECK(cudaMemcpy2DAsync(
(char *) tensor->data + offset, stride_tensor, data, stride_data, size, n_copies, cudaMemcpyHostToDevice, cudaStreamPerThread));
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
}
static void ggml_backend_cuda_buffer_get_tensor_2d(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data,
size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
ggml_cuda_set_device(ctx->device);
CUDA_CHECK(cudaMemcpy2DAsync(
data, stride_data, (const char *) tensor->data + offset, stride_tensor, size, n_copies, cudaMemcpyDeviceToHost, cudaStreamPerThread));
CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cudaStreamPerThread));
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
}
@@ -733,8 +691,6 @@ static const ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
/* .memset_tensor = */ ggml_backend_cuda_buffer_memset_tensor,
/* .set_tensor = */ ggml_backend_cuda_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor,
/* .set_tensor_2d = */ ggml_backend_cuda_buffer_set_tensor_2d,
/* .get_tensor_2d = */ ggml_backend_cuda_buffer_get_tensor_2d,
/* .cpy_tensor = */ ggml_backend_cuda_buffer_cpy_tensor,
/* .clear = */ ggml_backend_cuda_buffer_clear,
/* .reset = */ NULL,
@@ -1047,8 +1003,6 @@ static const ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
/* .memset_tensor = */ NULL,
/* .set_tensor = */ ggml_backend_cuda_split_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_cuda_split_buffer_get_tensor,
/* .set_tensor_2d = */ NULL,
/* .get_tensor_2d = */ NULL,
/* .cpy_tensor = */ NULL,
/* .clear = */ ggml_backend_cuda_split_buffer_clear,
/* .reset = */ NULL,
@@ -1125,83 +1079,6 @@ static const ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_inte
/* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
};
bool ggml_backend_cuda_allreduce_tensor(ggml_backend_t * backends, struct ggml_tensor ** tensors, size_t n_backends) {
#ifdef GGML_USE_NCCL
const int64_t ne = ggml_nelements(tensors[0]);
// FIXME the input of llm_graph_context::build_in_out_ids can produce a tensor with 0 elements if n_outputs == 0
// This then causes a crash in this function
if (ne == 0) {
return true;
}
for (size_t i = 0; i < n_backends; ++i) {
GGML_ASSERT(tensors[i] != nullptr);
GGML_ASSERT(ggml_nelements(tensors[i]) == ne);
GGML_ASSERT(ggml_is_contiguously_allocated(tensors[i]));
}
const ggml_cuda_device_info info = ggml_cuda_info();
// For small tensors, simply reduce them as FP32.
// The following heuristic for how "small" a tensor should be is based on RTX 4090s connected via 16x PCIe 4.0.
if ((n_backends <= 2 && ne < 32768) || (n_backends == 3 && ne < 131072) || (n_backends >= 4 && ne < 262144)) {
NCCL_CHECK(ncclGroupStart());
for (size_t i = 0; i < n_backends; ++i) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backends[i]->context;
NCCL_CHECK(ncclAllReduce(tensors[i]->data, tensors[i]->data, ne, ncclFloat, ncclSum, info.comms[cuda_ctx->device], cuda_ctx->stream()));
}
NCCL_CHECK(ncclGroupEnd());
return true;
}
// For large tensors it's faster to compress them to BF16 for the reduction:
to_bf16_cuda_t to_bf16 = ggml_get_to_bf16_cuda(GGML_TYPE_F32);
to_fp32_cuda_t to_fp32 = ggml_get_to_fp32_cuda(GGML_TYPE_BF16);
ggml_cuda_pool_alloc<nv_bfloat16> tmp[GGML_CUDA_MAX_DEVICES];
for (size_t i = 0; i < n_backends; ++i) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backends[i]->context;
tmp[i].pool = &cuda_ctx->pool();
tmp[i].alloc(ne);
ggml_cuda_set_device(i);
to_bf16(tensors[i]->data, tmp[i].get(), ne, cuda_ctx->stream());
CUDA_CHECK(cudaGetLastError());
}
NCCL_CHECK(ncclGroupStart());
for (size_t i = 0; i < n_backends; ++i) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backends[i]->context;
NCCL_CHECK(ncclAllReduce(tmp[i].get(), tmp[i].get(), ne, ncclBfloat16, ncclSum, info.comms[cuda_ctx->device], cuda_ctx->stream()));
}
NCCL_CHECK(ncclGroupEnd());
for (size_t i = 0; i < n_backends; ++i) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backends[i]->context;
ggml_cuda_set_device(i);
to_fp32(tmp[i].get(), (float *) tensors[i]->data, ne, cuda_ctx->stream());
CUDA_CHECK(cudaGetLastError());
}
return true;
#else
// If NCCL is installed it is used by default for optimal performance.
// However, NVIDIA does not distribute NCCL with CUDA so users may be unwittingly missing this package.
// RCCL is disabled by default, users are explicitly opting in.
// Therefore print no warning for RCCL.
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
static bool warning_printed = false;
if (!warning_printed) {
GGML_LOG_WARN("%s: NVIDIA Collective Communications Library (NCCL) is unavailable, multi GPU performance will be suboptimal\n", __func__);
warning_printed = true;
}
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
GGML_UNUSED_VARS(backends, tensors, n_backends);
return false;
#endif // GGML_USE_NCCL
}
ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split) {
static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);
@@ -1548,6 +1425,64 @@ static void ggml_cuda_op_mul_mat_cublas(
GGML_UNUSED_VARS(dst, src1_ddq_i, src1_padded_row_size);
}
static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
static bool peer_access_enabled = false;
const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE;
if (peer_access_enabled == enable_peer_access) {
return;
}
#ifdef NDEBUG
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
ggml_cuda_set_device(id);
CUDA_CHECK(cudaDeviceSynchronize());
}
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
ggml_cuda_set_device(id);
for (int id_other = 0; id_other < ggml_backend_cuda_get_device_count(); ++id_other) {
if (id == id_other) {
continue;
}
if (id != main_device && id_other != main_device) {
continue;
}
int can_access_peer;
CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
if (can_access_peer) {
if (enable_peer_access) {
cudaError_t err = cudaDeviceEnablePeerAccess(id_other, 0);
if (err != cudaErrorPeerAccessAlreadyEnabled) {
CUDA_CHECK(err);
} else {
// reset the error
(void)cudaGetLastError();
}
} else {
cudaError_t err = cudaDeviceDisablePeerAccess(id_other);
if (err != cudaErrorPeerAccessNotEnabled) {
CUDA_CHECK(err);
} else {
// reset the error
(void)cudaGetLastError();
}
}
}
}
}
ggml_cuda_set_device(main_device);
#endif // NDEBUG
peer_access_enabled = enable_peer_access;
GGML_UNUSED(main_device);
}
static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
void * dst, int dstDevice, size_t dpitch, void * src, int srcDevice, size_t spitch, size_t width, size_t height, cudaStream_t stream) {
@@ -2548,6 +2483,11 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
}
static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) {
// why is this here instead of mul_mat?
if (dst->src[0] != nullptr && ggml_backend_buft_is_cuda_split(dst->src[0]->buffer->buft)) {
ggml_cuda_set_peer_access(dst->src[1]->ne[1], ctx.device);
}
switch (dst->op) {
case GGML_OP_ARGMAX:
ggml_cuda_argmax(ctx, dst);
@@ -2905,43 +2845,21 @@ static void ggml_backend_cuda_free(ggml_backend_t backend) {
}
static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
CUDA_CHECK(cudaMemcpyAsync((char *) tensor->data + offset, data, size, cudaMemcpyHostToDevice, cuda_ctx->stream()));
CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cuda_ctx->stream()));
}
static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
CUDA_CHECK(cudaMemcpyAsync(data, (const char *) tensor->data + offset, size, cudaMemcpyDeviceToHost, cuda_ctx->stream()));
}
static void ggml_backend_cuda_set_tensor_2d_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data,
size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
CUDA_CHECK(cudaMemcpy2DAsync(
(char *) tensor->data + offset, stride_tensor, data, stride_data, size, n_copies, cudaMemcpyHostToDevice, cuda_ctx->stream()));
}
static void ggml_backend_cuda_get_tensor_2d_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data,
size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
CUDA_CHECK(cudaMemcpy2DAsync(
data, stride_data, (const char *) tensor->data + offset, stride_tensor, size, n_copies, cudaMemcpyDeviceToHost, cuda_ctx->stream()));
CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cuda_ctx->stream()));
}
static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
@@ -2952,21 +2870,21 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
return false;
}
if (!ggml_backend_buffer_is_cuda(buf_src) || !ggml_backend_buffer_is_cuda(buf_dst)) {
if (!ggml_backend_buffer_is_cuda(src->buffer) || !ggml_backend_buffer_is_cuda(dst->buffer)) {
return false;
}
// device -> device copy
ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *) backend_src->context;
ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *) backend_dst->context;
ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *)backend_src->context;
ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context;
ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *) buf_src->context;
ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *) buf_dst->context;
ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;
if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: backend and buffer devices do not match\n", __func__);
#endif // NDEBUG
#endif
return false;
}
@@ -2979,7 +2897,7 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
return false;
#else
CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream()));
#endif // GGML_CUDA_NO_PEER_COPY
#endif
}
// record event on src stream after the copy
@@ -3061,27 +2979,18 @@ static bool ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx
ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key);
// Check if the graph size has changed
if ((int)graph->node_props.size() != cgraph->n_nodes) {
if ((int)graph->nodes_copy.size() != cgraph->n_nodes) {
res = true;
graph->node_props.resize(cgraph->n_nodes);
graph->nodes_copy.resize(cgraph->n_nodes);
}
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_cuda_graph::node_properties prop = {};
memcpy(&prop.node, cgraph->nodes[i], sizeof(ggml_tensor));
for (int j = 0; j < GGML_MAX_SRC; ++j) {
if (cgraph->nodes[i]->src[j]) {
prop.node_src_data_ptrs[j] = cgraph->nodes[i]->src[j]->data;
memcpy(prop.node_src_ne[j], cgraph->nodes[i]->src[j]->ne, sizeof(prop.node_src_ne[j]));
memcpy(prop.node_src_nb[j], cgraph->nodes[i]->src[j]->nb, sizeof(prop.node_src_nb[j]));
if (!res) {
if (memcmp(&graph->nodes_copy[i], cgraph->nodes[i], sizeof(ggml_tensor)) != 0) {
res = true;
}
}
if (res || memcmp(&graph->node_props[i], &prop, sizeof(prop)) != 0) {
graph->node_props[i] = prop;
res = true;
}
memcpy(&graph->nodes_copy[i], cgraph->nodes[i], sizeof(ggml_tensor));
}
return res;
@@ -3760,10 +3669,10 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud
continue;
}
if (node->op == GGML_OP_ADD || node->op == GGML_OP_MUL) {
if (node->op == GGML_OP_ADD) {
int n_fuse = 0;
ggml_op ops[8];
std::fill(ops, ops + 8, node->op);
std::fill(ops, ops + 8, GGML_OP_ADD);
for (; n_fuse <= 6; ++n_fuse){
if (!ggml_can_fuse(cgraph, i + n_fuse, ops + n_fuse, 2)) {
@@ -3780,17 +3689,13 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud
n_fuse++;
if (n_fuse > 1) {
ggml_tensor fused_node;
memcpy(&fused_node, node, sizeof(ggml_tensor));
ggml_tensor fused_add_node;
memcpy(&fused_add_node, node, sizeof(ggml_tensor));
for (int j = 0; j < n_fuse - 1; ++j) {
fused_node.src[j + 2] = cgraph->nodes[i + j + 1]->src[1];
}
fused_node.data = cgraph->nodes[i + n_fuse - 1]->data;
if (node->op == GGML_OP_ADD) {
ggml_cuda_op_fused_add(*cuda_ctx, &fused_node, n_fuse);
} else {
ggml_cuda_op_fused_mul(*cuda_ctx, &fused_node, n_fuse);
fused_add_node.src[j + 2] = cgraph->nodes[i + j + 1]->src[1];
}
fused_add_node.data = cgraph->nodes[i + n_fuse - 1]->data;
ggml_cuda_op_fused_add(*cuda_ctx, &fused_add_node, n_fuse);
i += n_fuse - 1;
continue;
@@ -4431,8 +4336,6 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
/* .free = */ ggml_backend_cuda_free,
/* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
/* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
/* .get_tensor_2d_async = */ ggml_backend_cuda_set_tensor_2d_async,
/* .set_tensor_2d_async = */ ggml_backend_cuda_get_tensor_2d_async,
/* .cpy_tensor_async = */ ggml_backend_cuda_cpy_tensor_async,
/* .synchronize = */ ggml_backend_cuda_synchronize,
/* .graph_plan_create = */ NULL,
@@ -5220,9 +5123,6 @@ static ggml_backend_feature * ggml_backend_cuda_get_features(ggml_backend_reg_t
static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
GGML_UNUSED(reg);
if (strcmp(name, "ggml_backend_allreduce_tensor") == 0) {
return (void *)ggml_backend_cuda_allreduce_tensor;
}
if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
return (void *)ggml_backend_cuda_split_buffer_type;
}

View File

@@ -1025,8 +1025,7 @@ namespace ggml_cuda_mma {
const floatx2_t& a_frag = reinterpret_cast<const floatx2_t&>(A.x[0]);
const floatx2_t& b_frag = reinterpret_cast<const floatx2_t&>(B.x[0]);
acc_frag = __builtin_amdgcn_mfma_f32_16x16x8_xf32(a_frag, b_frag, acc_frag, 0, 0, 0);
#elif defined(CDNA4) || defined(CDNA2) || defined(CDNA1)
// CDNA4 (gfx950) does not support xf32 MFMA, use f32 path like CDNA2/CDNA1
#elif defined(CDNA2) || defined(CDNA1)
#pragma unroll
for (int i = 0; i < 2; ++i) {
acc_frag = __builtin_amdgcn_mfma_f32_16x16x4f32(A.x[i], B.x[i], acc_frag, 0, 0, 0);
@@ -1188,7 +1187,7 @@ namespace ggml_cuda_mma {
#elif defined(AMD_MFMA_AVAILABLE)
using floatx4_t = __attribute__((ext_vector_type(4))) float;
floatx4_t& acc_frag = reinterpret_cast<floatx4_t&>(D.x[0]);
#if defined(CDNA4) || defined(CDNA3) || defined(CDNA2)
#if defined(CDNA3) || defined(CDNA2)
using bf16x4_t = __attribute__((ext_vector_type(4))) __bf16;
const bf16x4_t& a_frag = reinterpret_cast<const bf16x4_t&>(A.x[0]);
const bf16x4_t& b_frag = reinterpret_cast<const bf16x4_t&>(B.x[0]);
@@ -1217,12 +1216,12 @@ namespace ggml_cuda_mma {
#if defined(AMD_MFMA_AVAILABLE)
using int32x4_t = __attribute__((__vector_size__(4 * sizeof(int)))) int;
int32x4_t * acc = (int32x4_t *) D.x;
#if defined(CDNA4) || defined(CDNA3)
#if defined(CDNA3)
acc[0] = __builtin_amdgcn_mfma_i32_16x16x32_i8(((int64_t *) A.x)[0],
((int64_t *) B.x)[0],
acc[0],
0, 0, 0);
#elif defined(CDNA2) || defined(CDNA1)
#elif defined(CDNA2) || defined(CDNA)
acc[0] = __builtin_amdgcn_mfma_i32_16x16x16i8(A.x[0],
B.x[0],
acc[0],
@@ -1231,7 +1230,7 @@ namespace ggml_cuda_mma {
B.x[1],
acc[0],
0, 0, 0);
#endif // defined(CDNA4) || defined(CDNA3)
#endif // defined(CDNA3)
#elif defined(AMD_WMMA_AVAILABLE)
@@ -1296,12 +1295,12 @@ namespace ggml_cuda_mma {
#if defined(AMD_MFMA_AVAILABLE)
using int32x16_t = __attribute__((__vector_size__(16 * sizeof(int)))) int;
int32x16_t * acc = (int32x16_t *) D.x;
#if defined(CDNA4) || defined(CDNA3)
#if defined(CDNA3)
acc[0] = __builtin_amdgcn_mfma_i32_32x32x16_i8(((int64_t *) A.x)[0],
((int64_t *) B.x)[0],
acc[0],
0, 0, 0);
#elif defined(CDNA2) || defined(CDNA1)
#elif defined(CDNA2) || defined(CDNA)
acc[0] = __builtin_amdgcn_mfma_i32_32x32x8i8(A.x[0],
B.x[0],
acc[0],
@@ -1310,7 +1309,7 @@ namespace ggml_cuda_mma {
B.x[1],
acc[0],
0, 0, 0);
#endif // defined(CDNA4) || defined(CDNA3)
#endif // defined(CDNA3)
#else
GGML_UNUSED_VARS(D, A, B);

View File

@@ -3645,7 +3645,7 @@ static __global__ void mul_mat_q(
tile_x_max_i, tile_y_max_j, 0, ncols_x/qk);
return;
}
#endif // (defined(GGML_USE_HIP) && !defined(CDNA4) && !defined(CDNA3)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
#endif // (defined(GGML_USE_HIP) && !defined(CDNA3)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
constexpr int ITER_K = get_iter_k(type);

View File

@@ -134,9 +134,8 @@ static void ssm_conv_f32_cuda(const float * src0, const float * src1, const int
switch (nc) {
case 3: launch_kernel(std::integral_constant<int, 3>{}); break;
case 4: launch_kernel(std::integral_constant<int, 4>{}); break;
case 5: launch_kernel(std::integral_constant<int, 5>{}); break;
case 9: launch_kernel(std::integral_constant<int, 9>{}); break;
default: GGML_ABORT("Only support kernel sizes 3, 4, 5, 9 right now.");
default: GGML_ABORT("Only support kernel sizes 3, 4, 9 right now.");
}
}

View File

@@ -25,14 +25,14 @@ static void top_k_cub(ggml_cuda_pool & pool,
auto indexes_in = cuda::make_counting_iterator(0);
size_t temp_storage_bytes = 0;
CUDA_CHECK(DeviceTopK::MaxPairs(nullptr, temp_storage_bytes, src, cuda::discard_iterator(), indexes_in, dst, ncols, k,
env));
DeviceTopK::MaxPairs(nullptr, temp_storage_bytes, src, cuda::discard_iterator(), indexes_in, dst, ncols, k,
env);
ggml_cuda_pool_alloc<uint8_t> temp_storage_alloc(pool, temp_storage_bytes);
void * d_temp_storage = temp_storage_alloc.get();
CUDA_CHECK(DeviceTopK::MaxPairs(d_temp_storage, temp_storage_bytes, src, cuda::discard_iterator(), indexes_in, dst,
ncols, k, env));
DeviceTopK::MaxPairs(d_temp_storage, temp_storage_bytes, src, cuda::discard_iterator(), indexes_in, dst,
ncols, k, env);
}
#elif defined(GGML_CUDA_USE_CUB) // CUB_TOP_K_AVAILABLE

View File

@@ -6,10 +6,6 @@
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#ifdef GGML_USE_NCCL
#include <nccl.h>
#endif // GGML_USE_NCCL
#if CUDART_VERSION >= 11080
#include <cuda_fp8.h>
#define FP8_AVAILABLE

View File

@@ -10,11 +10,6 @@
#include <rocwmma/rocwmma-version.hpp>
#endif // defined(GGML_HIP_ROCWMMA_FATTN)
#ifdef GGML_USE_NCCL
#include <rccl/rccl.h>
#endif // GGML_USE_NCCL
#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
#define CUBLAS_OP_N HIPBLAS_OP_N
@@ -33,7 +28,6 @@
#define CU_MEM_LOCATION_TYPE_DEVICE hipMemLocationTypeDevice
#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE hipMemAccessFlagsProtReadWrite
#define CU_CHECK(fn) {hipError_t err = fn; if(err != hipSuccess) { GGML_ABORT("HipVMM Failure: %s\n", hipGetErrorString(err)); }}
#define NCCL_CHECK(fn) {ncclResult_t err = fn; if(err != ncclSuccess) { GGML_ABORT("RCCL Failure RCCL returned: %i\n", err); }}
#define __shfl_sync(mask, var, laneMask, width) __shfl(var, laneMask, width)
#define __shfl_up_sync(mask, var, laneMask, width) __shfl_up(var, laneMask, width)
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
@@ -189,10 +183,6 @@
#define GCN
#endif // defined(GCN5) || defined(GCN4)
#if defined(__gfx950__)
#define CDNA4
#endif // defined(__gfx950__)
#if defined(__gfx942__)
#define CDNA3
#endif // defined(__gfx942__)
@@ -205,9 +195,9 @@
#define CDNA1
#endif // defined(__gfx908__)
#if defined(CDNA4) || defined(CDNA3) || defined(CDNA2) || defined(CDNA1)
#if defined(CDNA3) || defined(CDNA2) || defined(CDNA1)
#define CDNA // For the entire family
#endif // defined(CDNA4) || defined(CDNA3) || defined(CDNA2) || defined(CDNA1)
#endif // defined(CDNA3) || defined(CDNA2) || defined(CDNA1)
#if defined(__GFX12__)
#define RDNA4

View File

@@ -1,56 +0,0 @@
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
// This is a "staging" header for new ggml API
// It is not publicly available and it should not be used by 3rd party projects
//
// When the API matures enough, it will be moved to the official public API
//
// Meta backend
//
#define GGML_BACKEND_META_MAX_DEVICES 16
enum ggml_backend_meta_split_axis {
// tensor split by tensor dimensions:
GGML_BACKEND_SPLIT_AXIS_0 = 0,
GGML_BACKEND_SPLIT_AXIS_1 = 1,
GGML_BACKEND_SPLIT_AXIS_2 = 2,
GGML_BACKEND_SPLIT_AXIS_3 = 3,
GGML_BACKEND_SPLIT_AXIS_MIRRORED = 10, // all values on all backends
GGML_BACKEND_SPLIT_AXIS_PARTIAL = 11, // each backend has a partial sum
// for internal bookkeeping only:
GGML_BACKEND_SPLIT_AXIS_NONE = 98,
GGML_BACKEND_SPLIT_AXIS_UNKNOWN = 99,
};
GGML_API const char * ggml_backend_meta_split_axis_name(enum ggml_backend_meta_split_axis split_axis);
struct ggml_backend_meta_split_state {
enum ggml_backend_meta_split_axis axis;
// for tensors with axis >= 0 && axis < GGML_MAX_DIMS:
// - each device has a slice of the tensor along the split axis
// - most tensors have n_segments == 1 and a contiguous slice of the tensor data
// - some tensors have an inhomogenenous data layout along the split axis,
// those tensors are divided into segments which are each individually split across devices
// - ne has one entry per segment and device that add up to ggml_tensor::ne for that axis,
// the outer/inner loops are over segments/devices like [seg0_dev0, seg0_dev1, seg1_dev0, seg1_dev1],
// - for example, a transformer may have a fused QKV matrix rather than 3 matrices, those would be 3 separate segments
// that each need to be split individually across devices so that each device gets a slice of Q, K, and V
int64_t ne[16*GGML_BACKEND_META_MAX_DEVICES];
uint32_t n_segments;
};
// function to assign split states for statically allocated tensors, compute tensor split states will be assigned to be compatible:
typedef struct ggml_backend_meta_split_state(*ggml_backend_meta_get_split_state_t)(const struct ggml_tensor * tensor, void * userdata);
// create a new meta device from "simple" devices, meta buffer type/buffer/backend is then derived from this:
// TODO: this looks a bit strange - a backend API creates a device. I think we should try
// express this as a backend registry functionality instead
GGML_API ggml_backend_dev_t ggml_backend_meta_device(
ggml_backend_dev_t * devs, size_t n_devs, ggml_backend_meta_get_split_state_t get_split_state, void * get_split_state_ud);

File diff suppressed because it is too large Load Diff

View File

@@ -14,42 +14,59 @@
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "htp-ctx.h"
#include "htp-ops.h"
#include "htp-msg.h"
#include "htp-ops.h"
#define htp_act_preamble \
const struct htp_tensor * src0 = actx->octx->src[0]; \
const struct htp_tensor * src1 = actx->octx->src[1]; \
const struct htp_tensor * dst = actx->octx->dst; \
\
const uint32_t ne00 = src0->ne[0]; \
const uint32_t ne01 = src0->ne[1]; \
const uint32_t ne02 = src0->ne[2]; \
const uint32_t ne03 = src0->ne[3]; \
\
const uint32_t nb00 = src0->nb[0]; \
const uint32_t nb01 = src0->nb[1]; \
const uint32_t nb02 = src0->nb[2]; \
const uint32_t nb03 = src0->nb[3]; \
\
const uint32_t ne10 = src1 ? src1->ne[0] : 0; \
const uint32_t ne11 = src1 ? src1->ne[1] : 0; \
const uint32_t ne12 = src1 ? src1->ne[2] : 0; \
const uint32_t ne13 = src1 ? src1->ne[3] : 0; \
\
const uint32_t nb10 = src1 ? src1->nb[0] : 0; \
const uint32_t nb11 = src1 ? src1->nb[1] : 0; \
const uint32_t nb12 = src1 ? src1->nb[2] : 0; \
const uint32_t nb13 = src1 ? src1->nb[3] : 0; \
\
const uint32_t ne0 = dst->ne[0]; \
const uint32_t ne1 = dst->ne[1]; \
const uint32_t ne2 = dst->ne[2]; \
const uint32_t ne3 = dst->ne[3]; \
\
const uint32_t nb0 = dst->nb[0]; \
const uint32_t nb1 = dst->nb[1]; \
const uint32_t nb2 = dst->nb[2]; \
#define htp_act_preamble3 \
const uint32_t ne00 = src0->ne[0]; \
const uint32_t ne01 = src0->ne[1]; \
const uint32_t ne02 = src0->ne[2]; \
const uint32_t ne03 = src0->ne[3]; \
\
const uint32_t ne10 = src1->ne[0]; \
const uint32_t ne11 = src1->ne[1]; \
const uint32_t ne12 = src1->ne[2]; \
const uint32_t ne13 = src1->ne[3]; \
\
const uint32_t ne0 = dst->ne[0]; \
const uint32_t ne1 = dst->ne[1]; \
const uint32_t ne2 = dst->ne[2]; \
const uint32_t ne3 = dst->ne[3]; \
\
const uint32_t nb00 = src0->nb[0]; \
const uint32_t nb01 = src0->nb[1]; \
const uint32_t nb02 = src0->nb[2]; \
const uint32_t nb03 = src0->nb[3]; \
\
const uint32_t nb10 = src1->nb[0]; \
const uint32_t nb11 = src1->nb[1]; \
const uint32_t nb12 = src1->nb[2]; \
const uint32_t nb13 = src1->nb[3]; \
\
const uint32_t nb0 = dst->nb[0]; \
const uint32_t nb1 = dst->nb[1]; \
const uint32_t nb2 = dst->nb[2]; \
const uint32_t nb3 = dst->nb[3];
#define htp_act_preamble2 \
const uint32_t ne00 = src0->ne[0]; \
const uint32_t ne01 = src0->ne[1]; \
const uint32_t ne02 = src0->ne[2]; \
const uint32_t ne03 = src0->ne[3]; \
\
const uint32_t ne0 = dst->ne[0]; \
const uint32_t ne1 = dst->ne[1]; \
const uint32_t ne2 = dst->ne[2]; \
const uint32_t ne3 = dst->ne[3]; \
\
const uint32_t nb00 = src0->nb[0]; \
const uint32_t nb01 = src0->nb[1]; \
const uint32_t nb02 = src0->nb[2]; \
const uint32_t nb03 = src0->nb[3]; \
\
const uint32_t nb0 = dst->nb[0]; \
const uint32_t nb1 = dst->nb[1]; \
const uint32_t nb2 = dst->nb[2]; \
const uint32_t nb3 = dst->nb[3];
struct htp_act_context {
@@ -80,7 +97,10 @@ struct htp_act_context {
static void glu_swiglu_f32_per_thread(unsigned int nth, unsigned int ith, void * data) {
struct htp_act_context * actx = (struct htp_act_context *) data;
htp_act_preamble;
const struct htp_tensor * src0 = &actx->octx->src0;
const struct htp_tensor * src1 = &actx->octx->src1;
const struct htp_tensor * dst = &actx->octx->dst;
htp_act_preamble3;
size_t src0_row_size = actx->src0_row_size;
size_t src1_row_size = actx->src1_row_size;
@@ -187,7 +207,10 @@ static void glu_swiglu_f32_per_thread(unsigned int nth, unsigned int ith, void *
static void glu_swiglu_oai_f32_per_thread(unsigned int nth, unsigned int ith, void * data) {
struct htp_act_context * actx = (struct htp_act_context *) data;
htp_act_preamble;
const struct htp_tensor * src0 = &actx->octx->src0;
const struct htp_tensor * src1 = &actx->octx->src1;
const struct htp_tensor * dst = &actx->octx->dst;
htp_act_preamble3;
uint64_t t1, t2;
t1 = HAP_perf_get_qtimer_count();
@@ -309,7 +332,9 @@ static void glu_swiglu_oai_f32_per_thread(unsigned int nth, unsigned int ith, vo
static void unary_gelu_f32_per_thread(unsigned int nth, unsigned int ith, void * data) {
struct htp_act_context * actx = (struct htp_act_context *) data;
htp_act_preamble;
const struct htp_tensor * src0 = &actx->octx->src0;
const struct htp_tensor * dst = &actx->octx->dst;
htp_act_preamble2;
uint64_t t1, t2;
t1 = HAP_perf_get_qtimer_count();
@@ -408,7 +433,9 @@ static void unary_gelu_f32_per_thread(unsigned int nth, unsigned int ith, void *
static void unary_silu_f32_per_thread(unsigned int nth, unsigned int ith, void * data) {
struct htp_act_context * actx = (struct htp_act_context *) data;
htp_act_preamble;
const struct htp_tensor * src0 = &actx->octx->src0;
const struct htp_tensor * dst = &actx->octx->dst;
htp_act_preamble2;
uint64_t t1, t2;
t1 = HAP_perf_get_qtimer_count();
@@ -506,7 +533,10 @@ static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
static void glu_geglu_f32_per_thread(unsigned int nth, unsigned int ith, void * data) {
struct htp_act_context * actx = (struct htp_act_context *) data;
htp_act_preamble;
const struct htp_tensor * src0 = &actx->octx->src0;
const struct htp_tensor * src1 = &actx->octx->src1;
const struct htp_tensor * dst = &actx->octx->dst;
htp_act_preamble3;
size_t src0_row_size = actx->src0_row_size;
size_t src1_row_size = actx->src1_row_size;
@@ -622,9 +652,9 @@ static void glu_geglu_f32_per_thread(unsigned int nth, unsigned int ith, void *
}
static int execute_op_activations_f32(struct htp_ops_context * octx) {
const struct htp_tensor * src0 = octx->src[0];
const struct htp_tensor * src1 = octx->src[1];
const struct htp_tensor * dst = octx->dst;
const struct htp_tensor * src0 = &octx->src0;
const struct htp_tensor * src1 = &octx->src1;
struct htp_tensor * dst = &octx->dst;
if (((src0->ne[0] * SIZEOF_FP32) != src0->nb[1]) || ((dst->ne[0] * SIZEOF_FP32) != dst->nb[1])) {
FARF(ERROR, "Non-contiguous tensors are not supported at this time \n");
@@ -667,20 +697,25 @@ static int execute_op_activations_f32(struct htp_ops_context * octx) {
const uint32_t n_threads = MIN(octx->n_threads, src0_nrows);
size_t src0_row_size = src0->nb[1];
size_t src1_row_size = src1 ? src1->nb[1] : src0->nb[1];
size_t src1_row_size = src1->nb[1]; // zero bytes if src1 is not used
size_t dst_row_size = dst->nb[1];
const bool src1_valid = src1->ne[0];
if (!src1_valid) {
src1_row_size = src0_row_size;
}
const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN);
const size_t src1_row_size_aligned = hex_round_up(src1_row_size, VLEN);
const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN);
// VTCM scratchpads for all tensors
// N rows per thread, padded to HVX vector size
size_t spad_size_per_row = (src0_row_size_aligned + src1_row_size_aligned) + dst_row_size_aligned;
size_t vtcm_row_per_thread = (octx->ctx->vtcm_size)/ (n_threads* spad_size_per_row);
// Make sure the reserved vtcm size is sufficient
if (vtcm_row_per_thread == 0) {
if(vtcm_row_per_thread ==0){
FARF(ERROR, "act-%s : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n", op_type, octx->ctx->vtcm_size,
spad_size_per_row * n_threads);
return HTP_STATUS_VTCM_TOO_SMALL;
@@ -698,11 +733,7 @@ static int execute_op_activations_f32(struct htp_ops_context * octx) {
octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size;
octx->src0_spad.src = NULL;
octx->src1_spad.src = NULL;
octx->dst_spad.src = NULL;
if (src1) {
if (src1->ne[0]) {
FARF(HIGH, "%s: %ux%ux%ux%u x %ux%ux%ux%u -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n",
op_type, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2],
src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], octx->src0_spad.size, octx->src1_spad.size,
@@ -742,9 +773,9 @@ static int execute_op_activations_f32(struct htp_ops_context * octx) {
// Pointers and GLU logic
const uint8_t * data_src0 = (const uint8_t *) src0->data;
const uint8_t * data_src1 = src1 ? (const uint8_t *) src1->data : NULL;
const uint8_t * data_src1 = (const uint8_t *) src1->data;
if (!src1 && (octx->op == HTP_OP_GLU_SWIGLU || octx->op == HTP_OP_GLU_SWIGLU_OAI || octx->op == HTP_OP_GLU_GEGLU)) {
if (!src1_valid && (octx->op == HTP_OP_GLU_SWIGLU || octx->op == HTP_OP_GLU_SWIGLU_OAI || octx->op == HTP_OP_GLU_GEGLU)) {
const int32_t swapped = octx->op_params[1];
data_src1 = data_src0;
actx.src1_row_size = actx.src0_row_size;
@@ -768,7 +799,7 @@ static int execute_op_activations_f32(struct htp_ops_context * octx) {
int op_activations(struct htp_ops_context * octx) {
int err = HTP_STATUS_OK;
switch (octx->src[0]->type) {
switch (octx->src0.type) {
case HTP_TYPE_F32:
err = execute_op_activations_f32(octx);
break;

View File

@@ -12,7 +12,7 @@
#include "hex-dma.h"
#include "htp-ctx.h"
#include "htp-ops.h"
#include "htp-msg.h"
#include "htp-ops.h"
#ifndef MIN
@@ -175,8 +175,8 @@ static void htp_argsort_f32(unsigned int n, unsigned int i, void * data) {
struct htp_ops_context * octx = actx->octx;
// Unpack context
const struct htp_tensor * src0 = octx->src[0];
const struct htp_tensor * dst = octx->dst;
const struct htp_tensor * src0 = &octx->src0;
const struct htp_tensor * dst = &octx->dst;
// Scratchpad memory
uint8_t * spad = octx->src0_spad.data + octx->src0_spad.size_per_thread * i;
@@ -249,16 +249,16 @@ static void htp_argsort_f32(unsigned int n, unsigned int i, void * data) {
int op_argsort(struct htp_ops_context * octx) {
// Check supported types
if (octx->src[0]->type != HTP_TYPE_F32) {
if (octx->src0.type != HTP_TYPE_F32) {
return HTP_STATUS_NO_SUPPORT;
}
const uint32_t total_rows = octx->src[0]->ne[1] * octx->src[0]->ne[2] * octx->src[0]->ne[3];
const uint32_t total_rows = octx->src0.ne[1] * octx->src0.ne[2] * octx->src0.ne[3];
const uint32_t n_threads = MIN(total_rows, octx->n_threads);
// Allocate scratchpad
// We need 1 row of float + 1 row of int32 per thread.
uint32_t ne00 = octx->src[0]->ne[0];
uint32_t ne00 = octx->src0.ne[0];
size_t values_size = hex_round_up(ne00 * sizeof(float), 128);
size_t indices_size = hex_round_up(ne00 * sizeof(int32_t), 128);
size_t spad_per_thread = values_size + indices_size;
@@ -278,9 +278,9 @@ int op_argsort(struct htp_ops_context * octx) {
octx->src0_spad.size_per_thread = spad_per_thread;
FARF(HIGH, "argsort: %ux%ux%ux%u -> %ux%ux%ux%u (0x%x, 0x%x)",
octx->src[0]->ne[0], octx->src[0]->ne[1], octx->src[0]->ne[2], octx->src[0]->ne[3],
octx->dst->ne[0], octx->dst->ne[1], octx->dst->ne[2], octx->dst->ne[3],
octx->src[0]->data, octx->dst->data);
octx->src0.ne[0], octx->src0.ne[1], octx->src0.ne[2], octx->src0.ne[3],
octx->dst.ne[0], octx->dst.ne[1], octx->dst.ne[2], octx->dst.ne[3],
octx->src0.data, octx->dst.data);
struct htp_argsort_context actx;
actx.octx = octx;

View File

@@ -14,7 +14,7 @@
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "htp-ctx.h"
#include "htp-ops.h"
#include "htp-msg.h"
#include "htp-ops.h"
#ifndef MIN
@@ -43,10 +43,10 @@ struct htp_binary_context {
bool split_at_ne02;
};
#define htp_binary_preamble \
const struct htp_tensor * src0 = octx->src[0]; \
const struct htp_tensor * src1 = octx->src[1]; \
const struct htp_tensor * dst = octx->dst; \
#define htp_binary_preamble \
const struct htp_tensor * src0 = &octx->src0; \
const struct htp_tensor * src1 = &octx->src1; \
struct htp_tensor * dst = &octx->dst; \
\
const uint32_t ne00 = src0->ne[0]; \
const uint32_t ne01 = src0->ne[1]; \
@@ -181,7 +181,7 @@ static void binary_job_scalar(unsigned int nth, unsigned int ith, void * data) {
struct htp_ops_context * octx = bctx->octx;
htp_binary_preamble;
const uint32_t src0_type = octx->src[0]->type;
const uint32_t src0_type = octx->src0.type;
const uint32_t row_size_bytes = (src0_type == HTP_TYPE_F32) ? ne00 * sizeof(float) : ne00 * sizeof(_Float16);
const uint32_t total_rows = ne01 * ne02 * ne03;
const uint32_t start_row = bctx->nrows_per_thread * ith;
@@ -274,7 +274,7 @@ static void binary_job_vector_same_shape(unsigned int nth, unsigned int ith, voi
struct htp_ops_context * octx = bctx->octx;
htp_binary_preamble;
const uint32_t src0_type = octx->src[0]->type;
const uint32_t src0_type = octx->src0.type;
const uint32_t row_size_bytes = (src0_type == HTP_TYPE_F32) ? ne00 * sizeof(float) : ne00 * sizeof(_Float16);
const uint32_t total_rows = ne01 * ne02 * ne03;
const uint32_t start_row = bctx->nrows_per_thread * ith;
@@ -374,7 +374,7 @@ static void binary_job_vector_row_broadcast(unsigned int nth, unsigned int ith,
struct htp_ops_context * octx = bctx->octx;
htp_binary_preamble;
const uint32_t src0_type = octx->src[0]->type;
const uint32_t src0_type = octx->src0.type;
const uint32_t row_size_bytes = (src0_type == HTP_TYPE_F32) ? ne00 * sizeof(float) : ne00 * sizeof(_Float16);
const uint32_t total_rows = ne01 * ne02 * ne03;
const uint32_t start_row = bctx->nrows_per_thread * ith;
@@ -455,7 +455,7 @@ static void binary_job_vector_complex(unsigned int nth, unsigned int ith, void *
struct htp_ops_context * octx = bctx->octx;
htp_binary_preamble;
const uint32_t src0_type = octx->src[0]->type;
const uint32_t src0_type = octx->src0.type;
const uint32_t row_size_bytes = (src0_type == HTP_TYPE_F32) ? ne00 * sizeof(float) : ne00 * sizeof(_Float16);
const uint32_t total_rows = ne01 * ne02 * ne03;
const uint32_t start_row = bctx->nrows_per_thread * ith;
@@ -540,7 +540,7 @@ static void binary_job_element_repeat(unsigned int nth, unsigned int ith, void *
struct htp_ops_context * octx = bctx->octx;
htp_binary_preamble;
const uint32_t src0_type = octx->src[0]->type;
const uint32_t src0_type = octx->src0.type;
const uint32_t elem_size_bytes = (src0_type == HTP_TYPE_F32) ? sizeof(float) : sizeof(_Float16);
const uint32_t row_size_bytes = ne00 * elem_size_bytes;;
const uint32_t total_rows = ne01 * ne02 * ne03;
@@ -629,10 +629,10 @@ static void binary_job_add_id(unsigned int nth, unsigned int ith, void * data) {
struct htp_binary_context * bctx = (struct htp_binary_context *) data;
struct htp_ops_context * octx = bctx->octx;
const struct htp_tensor * src0 = octx->src[0];
const struct htp_tensor * src1 = octx->src[1];
const struct htp_tensor * src2 = octx->src[2];
const struct htp_tensor * dst = octx->dst;
const struct htp_tensor * src0 = &octx->src0;
const struct htp_tensor * src1 = &octx->src1;
const struct htp_tensor * src2 = &octx->src2;
struct htp_tensor * dst = &octx->dst;
const uint32_t ne00 = src0->ne[0];
const uint32_t ne01 = src0->ne[1];
@@ -723,15 +723,15 @@ static void binary_job_add_id(unsigned int nth, unsigned int ith, void * data) {
}
static int execute_op_binary(struct htp_ops_context * octx) {
const struct htp_tensor * src0 = octx->src[0];
const struct htp_tensor * src1 = octx->src[1];
const struct htp_tensor * dst = octx->dst;
const struct htp_tensor * src0 = &octx->src0;
const struct htp_tensor * src1 = &octx->src1;
struct htp_tensor * dst = &octx->dst;
const uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3];
const uint32_t n_threads = MIN(octx->n_threads, src0_nrows);
// Use packed row sizes for VTCM allocation
const uint32_t src0_type = octx->src[0]->type;
const uint32_t src0_type = octx->src0.type;
const size_t elem_size = (src0_type == HTP_TYPE_F32) ? sizeof(float) : sizeof(_Float16);
const size_t src0_row_size = src0->ne[0] * elem_size;
const size_t src1_row_size = src1->ne[0] * elem_size;
@@ -799,9 +799,9 @@ static int execute_op_binary(struct htp_ops_context * octx) {
return HTP_STATUS_VTCM_TOO_SMALL;
}
octx->src0_spad.data = octx->ctx->vtcm_base; octx->src0_spad.src = NULL;
octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size; octx->src1_spad.src = NULL;
octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size; octx->dst_spad.src = NULL;
octx->src0_spad.data = octx->ctx->vtcm_base;
octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size;
if ((octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
return HTP_STATUS_OK;
@@ -857,12 +857,12 @@ static int execute_op_binary(struct htp_ops_context * octx) {
int op_binary(struct htp_ops_context * octx) {
// Does not support permutations of src1
const struct htp_tensor * src1 = octx->src[1];
const struct htp_tensor * src1 = &octx->src1;
if (src1->nb[1] < src1->nb[0]) {
return HTP_STATUS_NO_SUPPORT;
}
const uint32_t src0_type = octx->src[0]->type;
const uint32_t src0_type = octx->src0.type;
if ((src0_type == HTP_TYPE_F32) || (src0_type == HTP_TYPE_F16)) {
return execute_op_binary(octx);
}

View File

@@ -11,7 +11,7 @@
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "htp-ctx.h"
#include "htp-ops.h"
#include "htp-msg.h"
#include "htp-ops.h"
#include "hvx-utils.h"
@@ -32,10 +32,10 @@ struct htp_copy_context {
void (*copy)(struct htp_copy_context * ct, struct htp_ops_context * octx, int nth, int ith);
};
#define cpy_preamble \
const struct htp_tensor *src0 = octx->src[0]; \
const struct htp_tensor *dst = octx->dst; \
\
#define cpy_preamble \
struct htp_tensor *src0 = &octx->src0; \
struct htp_tensor *dst = &octx->dst; \
\
const uint32_t ne00 = src0->ne[0]; \
const uint32_t ne01 = src0->ne[1]; \
const uint32_t ne02 = src0->ne[2]; \

View File

@@ -13,9 +13,9 @@
#include "hvx-utils.h"
#include "hex-dma.h"
#define htp_cumsum_tensors_preamble \
const struct htp_tensor * restrict src0 = octx->src[0]; \
const struct htp_tensor * restrict dst = octx->dst; \
#define htp_cumsum_tensors_preamble \
struct htp_tensor * restrict src0 = &octx->src0; \
struct htp_tensor * restrict dst = &octx->dst; \
\
const uint32_t ne00 = src0->ne[0]; \
const uint32_t ne01 = src0->ne[1]; \
@@ -206,8 +206,8 @@ static void cumsum_thread_f32(unsigned int nth, unsigned int ith, void * data) {
}
int op_cumsum_f32(struct htp_ops_context * octx) {
const struct htp_tensor * src0 = octx->src[0];
const struct htp_tensor * dst = octx->dst;
const struct htp_tensor * src0 = &octx->src0;
const struct htp_tensor * dst = &octx->dst;
if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) {
return HTP_STATUS_OK;
@@ -226,12 +226,10 @@ int op_cumsum_f32(struct htp_ops_context * octx) {
octx->src0_spad.size_per_thread = src_row_size_aligned * 2;
octx->dst_spad.size_per_thread = dst_row_size_aligned * 2;
octx->src0_spad.size = n_threads * octx->src0_spad.size_per_thread;
octx->dst_spad.size = n_threads * octx->dst_spad.size_per_thread;
octx->src0_spad.data = octx->ctx->vtcm_base; octx->src0_spad.src = NULL;
octx->dst_spad.data = octx->src0_spad.data + octx->src0_spad.size; octx->dst_spad.src = NULL;
octx->src0_spad.size = n_threads * octx->src0_spad.size_per_thread;
octx->dst_spad.size = n_threads * octx->dst_spad.size_per_thread;
octx->src0_spad.data = octx->ctx->vtcm_base;
octx->dst_spad.data = octx->src0_spad.data + octx->src0_spad.size;
struct htp_cumsum_context cctx = {
.octx = octx,
@@ -253,9 +251,8 @@ int op_cumsum_f32(struct htp_ops_context * octx) {
}
int op_cumsum(struct htp_ops_context * octx) {
const struct htp_tensor * dst = octx->dst;
int err = HTP_STATUS_OK;
int err = HTP_STATUS_OK;
struct htp_tensor * dst = &octx->dst;
switch (dst->type) {
case HTP_TYPE_F32:

View File

@@ -15,7 +15,7 @@
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "htp-ctx.h"
#include "htp-ops.h"
#include "htp-msg.h"
#include "htp-ops.h"
// Must be multiple of 32
@@ -278,12 +278,12 @@ static inline void hvx_scale_vec_f32_aa(uint8_t * restrict dst, const uint8_t *
static void flash_attn_ext_f16_thread(unsigned int nth, unsigned int ith, void * data) {
struct htp_fa_context * factx = (struct htp_fa_context *) data;
const struct htp_ops_context * octx = factx->octx;
const struct htp_tensor * q = octx->src[0];
const struct htp_tensor * k = octx->src[1];
const struct htp_tensor * v = octx->src[2];
const struct htp_tensor * mask = octx->src[3];
const struct htp_tensor * sinks = octx->src[4];
const struct htp_tensor * dst = octx->dst;
const struct htp_tensor * q = &octx->src0;
const struct htp_tensor * k = &octx->src1;
const struct htp_tensor * v = &octx->src2;
const struct htp_tensor * mask = (octx->src3.data) ? &octx->src3 : NULL;
const struct htp_tensor * sinks = (octx->src4.data) ? &octx->src4 : NULL;
const struct htp_tensor * dst = &octx->dst;
const uint32_t neq0 = q->ne[0];
const uint32_t neq1 = q->ne[1];
@@ -610,11 +610,11 @@ static void flash_attn_ext_f16_thread(unsigned int nth, unsigned int ith, void *
}
int op_flash_attn_ext(struct htp_ops_context * octx) {
const struct htp_tensor * q = octx->src[0];
const struct htp_tensor * k = octx->src[1];
const struct htp_tensor * v = octx->src[2];
const struct htp_tensor * mask = octx->src[3];
const struct htp_tensor * dst = octx->dst;
const struct htp_tensor * q = &octx->src0;
const struct htp_tensor * k = &octx->src1;
const struct htp_tensor * v = &octx->src2;
const struct htp_tensor * mask = (octx->src3.data) ? &octx->src3 : NULL;
const struct htp_tensor * dst = &octx->dst;
// Check support
if ((q->type != HTP_TYPE_F16 && q->type != HTP_TYPE_F32) || k->type != HTP_TYPE_F16 || v->type != HTP_TYPE_F16) {
@@ -701,11 +701,13 @@ int op_flash_attn_ext(struct htp_ops_context * octx) {
return HTP_STATUS_VTCM_TOO_SMALL;
}
octx->src0_spad.data = octx->ctx->vtcm_base; octx->src0_spad.src = NULL;
octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size; octx->src1_spad.src = NULL;
octx->src2_spad.data = octx->src1_spad.data + octx->src1_spad.size; octx->src2_spad.src = NULL;
octx->src3_spad.data = octx->src2_spad.data + octx->src2_spad.size; octx->src3_spad.src = NULL;
octx->dst_spad.data = octx->src3_spad.data + octx->src3_spad.size; octx->dst_spad.src = NULL;
octx->src0_spad.data = octx->ctx->vtcm_base;
octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
octx->src2_spad.data = octx->src1_spad.data + octx->src1_spad.size;
octx->src3_spad.data = octx->src2_spad.data + octx->src2_spad.size;
octx->dst_spad.data = octx->src3_spad.data + octx->src3_spad.size;
// FARF(ERROR, "fa: qrows-per-thread %u", factx.qrows_per_thread);
if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
worker_pool_run_func(octx->ctx->worker_pool, flash_attn_ext_f16_thread, &factx, octx->n_threads);

View File

@@ -11,7 +11,7 @@
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "htp-ctx.h"
#include "htp-ops.h"
#include "htp-msg.h"
#include "htp-ops.h"
#include "hvx-utils.h"
@@ -23,33 +23,27 @@ struct get_rows_context {
};
#define get_rows_preamble \
const uint32_t ne00 = octx->src[0]->ne[0]; \
const uint32_t ne01 = octx->src[0]->ne[1]; \
const uint32_t ne02 = octx->src[0]->ne[2]; \
const uint32_t ne03 = octx->src[0]->ne[3]; \
\
const uint32_t ne10 = octx->src[1]->ne[0]; \
const uint32_t ne11 = octx->src[1]->ne[1]; \
const uint32_t ne12 = octx->src[1]->ne[2]; \
const uint32_t ne13 = octx->src[1]->ne[3]; \
\
const uint32_t ne0 = octx->dst->ne[0]; \
const uint32_t ne1 = octx->dst->ne[1]; \
const uint32_t ne2 = octx->dst->ne[2]; \
const uint32_t ne3 = octx->dst->ne[3]; \
\
const uint32_t nb01 = octx->src[0]->nb[1]; \
const uint32_t nb02 = octx->src[0]->nb[2]; \
const uint32_t nb03 = octx->src[0]->nb[3]; \
\
const uint32_t nb10 = octx->src[1]->nb[0]; \
const uint32_t nb11 = octx->src[1]->nb[1]; \
const uint32_t nb12 = octx->src[1]->nb[2]; \
\
const uint32_t nb1 = octx->dst->nb[1]; \
const uint32_t nb2 = octx->dst->nb[2]; \
const uint32_t nb3 = octx->dst->nb[3]; \
\
const uint32_t ne00 = octx->src0.ne[0]; \
const uint32_t ne01 = octx->src0.ne[1]; \
const uint32_t ne02 = octx->src0.ne[2]; \
const uint32_t ne03 = octx->src0.ne[3]; \
\
const uint32_t ne10 = octx->src1.ne[0]; \
const uint32_t ne11 = octx->src1.ne[1]; \
const uint32_t ne12 = octx->src1.ne[2]; \
\
const uint32_t nb01 = octx->src0.nb[1]; \
const uint32_t nb02 = octx->src0.nb[2]; \
const uint32_t nb03 = octx->src0.nb[3]; \
\
const uint32_t nb10 = octx->src1.nb[0]; \
const uint32_t nb11 = octx->src1.nb[1]; \
const uint32_t nb12 = octx->src1.nb[2]; \
\
const uint32_t nb1 = octx->dst.nb[1]; \
const uint32_t nb2 = octx->dst.nb[2]; \
const uint32_t nb3 = octx->dst.nb[3]; \
\
const uint32_t nr = ne10 * ne11 * ne12;
static void get_rows_thread_f32_f32(unsigned int nth, unsigned int ith, void *data) {
@@ -57,14 +51,12 @@ static void get_rows_thread_f32_f32(unsigned int nth, unsigned int ith, void *da
struct htp_ops_context * octx = grctx->octx;
get_rows_preamble;
uint64_t qt = HAP_perf_get_qtimer_count();
// parallelize by src1 elements (which correspond to dst rows)
const uint32_t dr = grctx->src1_nrows_per_thread;
const uint32_t ir0 = dr * ith;
const uint32_t ir1 = (ir0 + dr < nr) ? (ir0 + dr) : nr;
const bool is_i32 = (octx->src[1]->type == HTP_TYPE_I32);
const bool is_i32 = (octx->src1.type == HTP_TYPE_I32);
for (uint32_t i = ir0; i < ir1; ++i) {
const uint32_t i12 = fastdiv(i, &grctx->get_rows_div_ne10_ne11);
@@ -72,7 +64,7 @@ static void get_rows_thread_f32_f32(unsigned int nth, unsigned int ith, void *da
const uint32_t i11 = fastdiv(rem, &grctx->get_rows_div_ne10);
const uint32_t i10 = rem - i11 * ne10;
const uintptr_t src1_addr = octx->src[1]->data + i10*nb10 + i11*nb11 + i12*nb12;
const uintptr_t src1_addr = octx->src1.data + i10*nb10 + i11*nb11 + i12*nb12;
uint32_t i01 = is_i32 ? *(int32_t *)src1_addr : *(int64_t *)src1_addr;
@@ -81,14 +73,10 @@ static void get_rows_thread_f32_f32(unsigned int nth, unsigned int ith, void *da
continue;
}
const uintptr_t src0_ptr = octx->src[0]->data + i01*nb01 + i11*nb02 + i12*nb03;
const uintptr_t dst_ptr = octx->dst->data + i10*nb1 + i11*nb2 + i12*nb3;
const uintptr_t src0_ptr = octx->src0.data + i01*nb01 + i11*nb02 + i12*nb03;
const uintptr_t dst_ptr = octx->dst.data + i10*nb1 + i11*nb2 + i12*nb3;
hvx_copy_f32_uu((uint8_t *)dst_ptr, (const uint8_t *)src0_ptr, ne00);
}
qt = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - qt);
FARF(HIGH, "get-rows-f32-f32 %d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth,
ne00, ne01, ne02, ne03, ir0, ir1, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, (unsigned) qt);
}
int op_get_rows(struct htp_ops_context * octx) {
@@ -96,15 +84,15 @@ int op_get_rows(struct htp_ops_context * octx) {
const uint32_t n_threads = MIN(nr, octx->n_threads);
if (octx->src[0]->type != HTP_TYPE_F32) {
if (octx->src0.type != HTP_TYPE_F32) {
return HTP_STATUS_NO_SUPPORT;
}
if (octx->dst->type != HTP_TYPE_F32) {
if (octx->dst.type != HTP_TYPE_F32) {
return HTP_STATUS_NO_SUPPORT;
}
if (octx->src[1]->type != HTP_TYPE_I32 && octx->src[1]->type != HTP_TYPE_I64) {
if (octx->src1.type != HTP_TYPE_I32 && octx->src1.type != HTP_TYPE_I64) {
return HTP_STATUS_NO_SUPPORT;
}
@@ -114,8 +102,8 @@ int op_get_rows(struct htp_ops_context * octx) {
struct get_rows_context grctx;
grctx.octx = octx;
grctx.get_rows_div_ne10 = init_fastdiv_values(octx->src[1]->ne[0]);
grctx.get_rows_div_ne10_ne11 = init_fastdiv_values(octx->src[1]->ne[0] * octx->src[1]->ne[1]);
grctx.get_rows_div_ne10 = init_fastdiv_values(octx->src1.ne[0]);
grctx.get_rows_div_ne10_ne11 = init_fastdiv_values(octx->src1.ne[0] * octx->src1.ne[1]);
grctx.src1_nrows_per_thread = (nr + n_threads - 1) / n_threads;

View File

@@ -3,10 +3,8 @@
#include <stdbool.h>
#include <stdint.h>
#include <qurt_memory.h>
#include "hexagon_types.h"
#include "hexagon_protos.h"
#include "hex-fastdiv.h"
#include "hex-dump.h"
@@ -70,23 +68,4 @@ static inline void hex_l2fetch(const void * p, uint32_t width, uint32_t stride,
Q6_l2fetch_AP((void *) p, control);
}
#define HEX_L2_LINE_SIZE 64
#define HEX_L2_FLUSH_SIZE (128 * 1024)
static inline void hex_l2flush(void * addr, size_t size)
{
if (size > HEX_L2_FLUSH_SIZE) {
qurt_mem_cache_clean((qurt_addr_t) 0, 0, QURT_MEM_CACHE_FLUSH_INVALIDATE_ALL, QURT_MEM_DCACHE);
} else {
const uint32_t s = (uint32_t) addr;
const uint32_t e = s + size;
for (uint32_t i = s; i < e; i += HEX_L2_LINE_SIZE * 4) {
Q6_dccleaninva_A((void *) i + HEX_L2_LINE_SIZE * 0);
Q6_dccleaninva_A((void *) i + HEX_L2_LINE_SIZE * 1);
Q6_dccleaninva_A((void *) i + HEX_L2_LINE_SIZE * 2);
Q6_dccleaninva_A((void *) i + HEX_L2_LINE_SIZE * 3);
}
}
}
#endif /* HEX_UTILS_H */

View File

@@ -20,7 +20,7 @@
#include "hvx-dump.h"
#include "worker-pool.h"
#include "htp-ctx.h"
#include "htp-ops.h"
#include "htp-msg.h"
#include "hmx-utils.h"
#include "hmx-ops.h"
@@ -821,7 +821,7 @@ int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx, const hmx_matmu
// and each q_head is computed individually to avoid tile-major packing
// issues. m_chunk_n_rows is always a multiple of 32 (from
// hmx_compute_chunks), so per-head tile arrays don't overlap.
const size_t vtcm_budget = ctx->vtcm_size;
const size_t vtcm_budget = ctx->vtcm_scratch_size;
const size_t vec_dot_size = params->k * sizeof(__fp16);
// When the activation has a large stride (e.g. permuted Q tensor with
@@ -998,7 +998,7 @@ int hmx_mat_mul_permuted_w16a32(struct htp_context *ctx, float *restrict dst, co
}
// --- Dynamic VTCM layout ---
const size_t vtcm_budget = ctx->vtcm_size;
const size_t vtcm_budget = ctx->vtcm_scratch_size;
const size_t vec_dot_size = k * sizeof(__fp16);
// DMA-based activation gather for strided tensors (see batched path comment).
@@ -1182,7 +1182,7 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
FARF(MEDIUM, "hmx_matmul_qk: STANDARD path m=%d k=%d n=%d type=%d", m, k, n, weight_type);
// --- Dynamic VTCM layout ---
const size_t vtcm_budget = ctx->vtcm_size;
const size_t vtcm_budget = ctx->vtcm_scratch_size;
const size_t vec_dot_size = k * sizeof(__fp16);
const bool use_pipeline = (m >= 128) && (k <= n);
@@ -1273,6 +1273,9 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
void *buf_curr = vtcm_scratch0;
void *buf_next = vtcm_scratch1;
// issue async DDR data transfer for the first weight chunk
// NOTE: use 2D DMA (n_cols rows x row_stride bytes) instead of 1D
// because UDMA roiwidth is 16-bit and total size can exceed 65535.
{
const size_t n_cols_first = hex_smin(n, n_chunk_n_cols);
dma_queue_push(ctx->dma[0], dma_make_ptr(buf_curr, permuted_weight), row_stride, row_stride, row_stride, n_cols_first);
@@ -1530,15 +1533,20 @@ void transfer_activation_chunk_threaded(struct htp_context *ctx, __fp16 *dst, co
worker_pool_run_func(ctx->worker_pool, transfer_activation_chunk_worker_fn, &state, ctx->n_threads);
}
int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx, float *restrict out, const float *restrict x, const uint8_t *restrict w,
int m, int k, int n, int weight_type) {
int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx, float *restrict out, const float *restrict x, const uint8_t *restrict w, int m,
int k, int n, int weight_type) {
// Runtime check -- k >= 16384 exceeds 2D DMA limit
if (k >= 16384) {
FARF(HIGH, "%s: k=%d exceeds 2D DMA limit", __func__, k);
return -1;
}
// assume k % 32 == 0 && n % 32 == 0
const size_t row_stride = get_x4x2_row_stride(weight_type, k);
if (row_stride == 0) {
return -1;
}
const size_t vtcm_budget = ctx->vtcm_size;
const size_t vtcm_budget = ctx->vtcm_scratch_size;
const size_t M_BLOCK_SIZE = 512;
const size_t N_BLOCK_SIZE = 512;
@@ -1568,7 +1576,8 @@ int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx, float *restrict
__fp16 *vtcm_scales = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, 256);
assert((size_t)(vtcm_ptr - (uint8_t *)ctx->vtcm_base) <= vtcm_budget);
FARF(MEDIUM, "%s: m=%d k=%d n=%d wtype=%d vtcm=%zu/%zu", __func__, m, k, n, weight_type,
FARF(MEDIUM, "%s: m=%d k=%d n=%d wtype=%d vtcm=%zu/%zu",
__func__, m, k, n, weight_type,
(size_t)(vtcm_ptr - (uint8_t *)ctx->vtcm_base), vtcm_budget);
// initialize eye tile (32x32 identity matrix)

View File

@@ -7,12 +7,16 @@
#include <stddef.h>
#include <stdint.h>
#include "htp-ops.h"
#ifndef restrict
# define restrict __restrict
#endif
#ifdef __cplusplus
extern "C" {
#endif
struct htp_context; // forward declaration
typedef struct {
float *dst;
const float *activation;

View File

@@ -2,7 +2,6 @@
#define HTP_CTX_H
#include "hex-dma.h"
#include "htp-ops.h"
#include "worker-pool.h"
#include <assert.h>
@@ -11,85 +10,38 @@
#include <stdint.h>
#define HTP_MAX_NTHREADS 10
#define HTP_MAX_MMAPS 16
// Memory mapping
struct htp_mmap {
uint64_t size;
uint64_t base;
uint32_t fd;
uint32_t pinned;
};
// Scratchpad state
struct htp_spad {
const struct htp_tensor * src; // original src of the data (for reuse)
uint8_t * data; // pointer to an area in vtcm
uint32_t stride; // stride used inside this spad
uint32_t size; // total size
uint32_t size_per_thread; // size per thread
};
// Context while processing an Op
// TODO: fold this into the main context
struct htp_ops_context {
struct htp_context * ctx;
enum htp_op_code op; // FIXME: rename to opcode
int32_t op_params[HTP_OP_MAX_PARAMS];
const struct htp_tensor * src[HTP_OP_MAX_INPUTS];
const struct htp_tensor * dst;
// TODO convert these to an array
struct htp_spad src0_spad;
struct htp_spad src1_spad;
struct htp_spad src2_spad;
struct htp_spad src3_spad;
struct htp_spad dst_spad;
uint32_t n_threads;
uint32_t flags;
};
// Main context for htp DSP backend
struct htp_context {
dspqueue_t queue;
dma_queue * dma[HTP_MAX_NTHREADS];
struct htp_mmap mmap[HTP_MAX_MMAPS];
worker_pool_context_t worker_pool;
uint32_t n_threads;
dspqueue_t queue;
dma_queue * dma[HTP_MAX_NTHREADS];
worker_pool_context_t worker_pool;
uint32_t n_threads;
int thread_id;
int thread_prio;
int thread_id;
int thread_prio;
int hmx_enabled;
uint8_t * vtcm_base;
size_t vtcm_size;
uint32_t vtcm_rctx;
uint8_t * vtcm_base;
size_t vtcm_size;
uint32_t vtcm_rctx;
atomic_bool vtcm_valid;
atomic_bool vtcm_needs_release;
atomic_bool vtcm_valid;
atomic_bool vtcm_inuse;
atomic_bool vtcm_needs_release;
struct htp_ops_context octx;
uint32_t opmask;
// Cached src1 spad position from the last quantize pass.
// When SKIP_QUANTIZE is set the Q8 activation data is already in VTCM
// at this address; the matmul must read from here instead of recomputing
// the offset (which depends on the current op's src0 size).
uint8_t * prev_src1_spad;
// HMX acceleration fields (v73+, enabled by compile-time HTP_HAS_HMX)
#ifdef HTP_HAS_HMX
int hmx_enabled; // Runtime flag: HMX initialisation succeeded
size_t vtcm_scratch_size; // Usable dynamic scratch (vtcm_size minus tail reservation)
#endif
};
int op_matmul(struct htp_ops_context * octx);
int op_matmul_id(struct htp_ops_context * octx);
int op_binary(struct htp_ops_context * octx);
int op_unary(struct htp_ops_context * octx);
int op_sum_rows(struct htp_ops_context * octx);
int op_activations(struct htp_ops_context * octx);
int op_softmax(struct htp_ops_context * octx);
int op_add_id(struct htp_ops_context * octx);
int op_rope(struct htp_ops_context * octx);
int op_flash_attn_ext(struct htp_ops_context * octx);
int op_set_rows(struct htp_ops_context * octx);
int op_get_rows(struct htp_ops_context * octx);
int op_cpy(struct htp_ops_context * octx);
int op_repeat(struct htp_ops_context * octx);
int op_argsort(struct htp_ops_context * octx);
int op_ssm_conv(struct htp_ops_context * octx);
int op_cumsum(struct htp_ops_context * octx);
#endif /* HTP_CTX_H */

View File

@@ -0,0 +1,166 @@
#ifndef HTP_MSG_H
#define HTP_MSG_H
#include <assert.h>
// ggml-common.h must be included prio to this header
// Mask to enable various stages of the Ops.
// Used for debugging and profiling.
enum {
HTP_OPMASK_QUEUE = (1 << 0), // Enable Queueing (ie calls into the DSP)
HTP_OPMASK_QUANTIZE = (1 << 1), // Enable Quantize
HTP_OPMASK_COMPUTE = (1 << 2), // Enable Compute
};
// Op flags
enum {
HTP_OPFLAGS_SKIP_QUANTIZE = (1 << 0), // Skip dynamic quantization (reuse quantized tensors)
HTP_OPFLAGS_SKIP_COMPUTE = (1 << 1), // Skip actual computation (used for profiling)
HTP_OPFLAGS_EARLY_WAKEUP = (1 << 2) // Send early wakeup notification
};
enum htp_status {
HTP_STATUS_OK = 1,
HTP_STATUS_INTERNAL_ERR = 2,
HTP_STATUS_NO_SUPPORT = 3,
HTP_STATUS_INVAL_PARAMS = 4,
HTP_STATUS_VTCM_TOO_SMALL = 5,
};
// The values must match the ggml_type.
// Duplicated here because we can't include full ggml.h in the htp build.
// We have some static_asserts in the cpp code to ensure things are in sync.
enum htp_data_type {
HTP_TYPE_F32 = 0,
HTP_TYPE_F16 = 1,
HTP_TYPE_Q4_0 = 2,
HTP_TYPE_Q8_0 = 8,
HTP_TYPE_IQ4_NL = 20,
HTP_TYPE_I32 = 26,
HTP_TYPE_I64 = 27,
HTP_TYPE_MXFP4 = 39,
HTP_TYPE_COUNT
};
// Do not reorder first 4 (used as an index)
enum htp_op {
HTP_OP_MUL = 0,
HTP_OP_ADD = 1,
HTP_OP_SUB = 2,
HTP_OP_DIV = 3,
HTP_OP_MUL_MAT,
HTP_OP_MUL_MAT_ID,
HTP_OP_RMS_NORM,
HTP_OP_UNARY_SILU,
HTP_OP_UNARY_GELU,
HTP_OP_UNARY_SIGMOID,
HTP_OP_UNARY_EXP,
HTP_OP_UNARY_NEG,
HTP_OP_UNARY_SOFTPLUS,
HTP_OP_GLU_SWIGLU,
HTP_OP_GLU_SWIGLU_OAI,
HTP_OP_GLU_GEGLU,
HTP_OP_SOFTMAX,
HTP_OP_ADD_ID,
HTP_OP_ROPE,
HTP_OP_FLASH_ATTN_EXT,
HTP_OP_SET_ROWS,
HTP_OP_GET_ROWS,
HTP_OP_SCALE,
HTP_OP_CPY,
HTP_OP_ARGSORT,
HTP_OP_SQR,
HTP_OP_SQRT,
HTP_OP_SUM_ROWS,
HTP_OP_SSM_CONV,
HTP_OP_REPEAT,
HTP_OP_CUMSUM,
INVALID
};
static inline size_t htp_t_block_size(uint32_t t) {
switch (t) {
case HTP_TYPE_F32:
return 1;
case HTP_TYPE_F16:
return 1;
case HTP_TYPE_Q4_0:
return QK4_0;
case HTP_TYPE_Q8_0:
return QK8_0;
case HTP_TYPE_IQ4_NL:
return QK4_NL;
case HTP_TYPE_MXFP4:
return QK_MXFP4;
default:
assert(0 && "unsupported HTP data type");
}
return 0;
}
static inline size_t htp_type_nbytes(uint32_t t) {
switch (t) {
case HTP_TYPE_F32:
return 4;
case HTP_TYPE_F16:
return 2;
case HTP_TYPE_Q4_0:
return sizeof(block_q4_0);
case HTP_TYPE_Q8_0:
return sizeof(block_q8_0);
case HTP_TYPE_IQ4_NL:
return sizeof(block_iq4_nl);
case HTP_TYPE_MXFP4:
return sizeof(block_mxfp4);
default:
assert(0 && "unsupported HTP data type");
}
return 0;
}
// Internal types
#define QK_Q4_0x4x2 256 // 4x Q4_0 blocks packed with next 4x Q4_0 blocks (size in bytes 128)
#define QK_Q8_0x4x2 256 // 4x Q8_0 blocks concat with next 4x Q8_0 blocks
#define QK_MXFP4x4x2 256 // 4x MXFP4 blocks concat with next 4x MXFP4 blocks
#define HTP_MAX_DIMS 4
struct htp_tensor {
uint32_t data; // Buffer offset in the messages, and data pointer on the NSP
uint32_t type; // Data type
uint32_t ne[HTP_MAX_DIMS]; // Number of elements
uint32_t nb[HTP_MAX_DIMS]; // Stride in bytes (see ggml.h ggml_tensor)
};
#define HTP_MAX_OP_PARAMS 64
struct htp_general_req {
uint32_t op; // GGML/HTP Op
int32_t op_params[HTP_MAX_OP_PARAMS / sizeof(int32_t)];
// Params for the op, e.g. epsilon of RMS norm
uint32_t flags; // Request flags
struct htp_tensor src0; // Input0 tensor
struct htp_tensor src1; // Input1 tensor
struct htp_tensor src2; // Input2 tensor
struct htp_tensor src3; // Input3 tensor
struct htp_tensor src4; // Input4 tensor
struct htp_tensor dst; // Output tensor
// should be multiple of 64 bytes (cacheline)
};
struct htp_general_rsp {
uint32_t op; // GGML/HTP Op
uint32_t status; // HTP_STATUS_...
uint32_t prof_usecs; // Number of usec per request
uint32_t prof_cycles; // Number of cycles per request
uint32_t prof_pkts; // Number of instruction packets per request
uint8_t unused[44]; // Pad to 64 bytes
};
#define HTP_MAX_MESSAGE_SIZE sizeof(struct htp_general_req)
#define HTP_MAX_PACKET_BUFFERS 8
#endif /* HTP_MSG_H */

View File

@@ -1,154 +1,65 @@
#ifndef HTP_OPS_H
#define HTP_OPS_H
#include "htp-ctx.h"
#include "htp-msg.h"
#include "worker-pool.h"
#include <assert.h>
#include <stdint.h>
// ggml-common.h must be included prio to this header
#include <hex-fastdiv.h>
enum htp_status {
HTP_STATUS_OK = 1,
HTP_STATUS_INTERNAL_ERR = 2,
HTP_STATUS_NO_SUPPORT = 3,
HTP_STATUS_INVAL_PARAMS = 4,
HTP_STATUS_VTCM_TOO_SMALL = 5,
// ggml-common.h must be included prior to this header
struct htp_spad {
uint8_t * data;
size_t stride;
size_t size;
size_t size_per_thread;
};
// First set of values must match the ggml_type.
// Duplicated here because we can't include full ggml.h in the htp build.
// We have some static_asserts in the cpp code to ensure things are in sync.
enum htp_data_type {
HTP_TYPE_F32 = 0,
HTP_TYPE_F16 = 1,
HTP_TYPE_Q4_0 = 2,
HTP_TYPE_Q8_0 = 8,
HTP_TYPE_IQ4_NL = 20,
HTP_TYPE_I32 = 26,
HTP_TYPE_I64 = 27,
HTP_TYPE_MXFP4 = 39,
struct htp_ops_context {
struct htp_context * ctx;
// types used internally for repack, dyn.quant, etc
HTP_TYPE_Q4_0x4x2 = 200,
HTP_TYPE_Q8_0x4x2,
HTP_TYPE_MXFP4x4x2,
enum htp_op op;
int32_t op_params[HTP_MAX_OP_PARAMS / sizeof(int32_t)];
HTP_TYPE_INVALID
struct htp_tensor src0;
struct htp_tensor src1;
struct htp_tensor src2;
struct htp_tensor src3;
struct htp_tensor src4;
struct htp_tensor dst;
struct htp_spad src0_spad;
struct htp_spad src1_spad;
struct htp_spad src2_spad;
struct htp_spad src3_spad;
struct htp_spad dst_spad;
worker_pool_context_t * wpool; // worker pool
uint32_t n_threads; // num threads
uint32_t flags;
};
// Constats for internal types
#define QK_Q4_0x4x2 256 // 4x Q4_0 blocks packed with next 4x Q4_0 blocks (size in bytes 128)
#define QK_Q8_0x4x2 256 // 4x Q8_0 blocks concat with next 4x Q8_0 blocks
#define QK_MXFP4x4x2 256 // 4x MXFP4 blocks concat with next 4x MXFP4 blocks
// Mask to enable various stages of the Ops.
// Used for debugging and profiling.
enum htp_op_mask {
HTP_OPMASK_QUEUE = (1 << 0), // Enable Queueing (ie calls into the DSP)
HTP_OPMASK_COMPUTE = (1 << 1), // Enable Compute
};
// Do not reorder first 4 (used as an index)
enum htp_op_code {
HTP_OP_MUL = 0,
HTP_OP_ADD = 1,
HTP_OP_SUB = 2,
HTP_OP_DIV = 3,
HTP_OP_MUL_MAT,
HTP_OP_MUL_MAT_ID,
HTP_OP_RMS_NORM,
HTP_OP_UNARY_SILU,
HTP_OP_UNARY_GELU,
HTP_OP_UNARY_SIGMOID,
HTP_OP_UNARY_EXP,
HTP_OP_UNARY_NEG,
HTP_OP_UNARY_SOFTPLUS,
HTP_OP_GLU_SWIGLU,
HTP_OP_GLU_SWIGLU_OAI,
HTP_OP_GLU_GEGLU,
HTP_OP_SOFTMAX,
HTP_OP_ADD_ID,
HTP_OP_ROPE,
HTP_OP_FLASH_ATTN_EXT,
HTP_OP_SET_ROWS,
HTP_OP_GET_ROWS,
HTP_OP_SCALE,
HTP_OP_CPY,
HTP_OP_ARGSORT,
HTP_OP_SQR,
HTP_OP_SQRT,
HTP_OP_SUM_ROWS,
HTP_OP_SSM_CONV,
HTP_OP_REPEAT,
HTP_OP_CUMSUM,
HTP_OP_INVALID
};
#define HTP_OP_MAX_DIMS 4 // aka GGML_MAX_DIMS
#define HTP_OP_MAX_INPUTS 6 // aka GGML_MAX_SRCS
#define HTP_OP_MAX_PARAMS 16 // aka GGML_MAX_OP_PARAMS
#define HTP_OP_MAX_BUFS 8
#define HTP_OP_MAX_REQS 256
#define HTP_OP_MAX_TENSORS (HTP_OP_MAX_REQS * HTP_OP_MAX_INPUTS + HTP_OP_MAX_REQS)
#define HTP_OP_MAX_VMEM (3221225472u)
enum htp_tensor_flags {
HTP_TENSOR_COMPUTE = (1U << 0), // Tensor buffer temporal compute data (not weights)
HTP_TENSOR_FLUSHED = (1U << 1) // Tensor buffer has been flushed (set by the NPU)
};
// Tensor descriptor
struct htp_tensor {
uint32_t data; // Buffer offset in the messages, and data pointer on the NPU
uint32_t size; // Data size in bytes
uint32_t flags; // Buffer / tensor flags
uint16_t type; // Data type
uint16_t bi; // Buffer index
uint32_t ne[HTP_OP_MAX_DIMS]; // Number of elements
uint32_t nb[HTP_OP_MAX_DIMS]; // Stride in bytes (see ggml.h ggml_tensor)
};
// Buffer descriptor
struct htp_buf_desc {
uint64_t base; // base address
uint64_t size; // total size
uint32_t flags; // buffer flags (unused)
uint32_t fd; // file descriptor
};
enum htp_op_flags {
HTP_OPFLAGS_SKIP_COMPUTE = (1U << 0), // Skip actual computation (used for profiling)
};
// Op descriptor
struct htp_op_desc {
uint32_t opcode; // GGML/HTP Op
uint32_t flags; // Op flags
int32_t params[HTP_OP_MAX_PARAMS]; // Params for the op, e.g. epsilon of RMS norm
uint16_t src[HTP_OP_MAX_INPUTS]; // Input tensors indices
uint16_t dst; // Output tensor index
// the rest is filled in-place by the NPU
uint32_t prof_usecs; // Number of usec per request
uint32_t prof_cycles; // Number of cycles per request
uint32_t prof_pkts; // Number of instruction packets per request
uint32_t unused;
};
struct htp_opbatch_req {
uint32_t n_bufs; // Number of buffers
uint32_t n_tensors; // Number of tensors
uint32_t n_ops; // Number of ops
uint32_t flags; // unused
// struct htp_buf_desc bufs[]; -- dspqueue buf 0
// struct htp_tensor tensors[]; -- dspqueue buf 0
// struct htp_op_desc ops[]; -- dspqueue buf 0
};
struct htp_opbatch_rsp {
uint32_t status; // HTP_STATUS_...
// struct htp_op_req ops[]; -- dspqueue buf 0
};
int op_matmul(struct htp_ops_context * octx);
int op_matmul_id(struct htp_ops_context * octx);
int op_binary(struct htp_ops_context * octx);
int op_unary(struct htp_ops_context * octx);
int op_sum_rows(struct htp_ops_context * octx);
int op_activations(struct htp_ops_context * octx);
int op_softmax(struct htp_ops_context * octx);
int op_add_id(struct htp_ops_context * octx);
int op_rope(struct htp_ops_context * octx);
int op_flash_attn_ext(struct htp_ops_context * octx);
int op_set_rows(struct htp_ops_context * octx);
int op_get_rows(struct htp_ops_context * octx);
int op_cpy(struct htp_ops_context * octx);
int op_repeat(struct htp_ops_context * octx);
int op_argsort(struct htp_ops_context * octx);
int op_ssm_conv(struct htp_ops_context * octx);
int op_cumsum(struct htp_ops_context * octx);
#endif /* HTP_OPS_H */

View File

@@ -9,8 +9,6 @@
interface htp_iface : remote_handle64 {
AEEResult start(in uint32 sess_id, in uint64 dsp_queue_id, in uint32 n_hvx, in uint32 use_hmx);
AEEResult stop();
AEEResult mmap(in uint32 fd, in uint32 size, in uint32 pinned);
AEEResult munmap(in uint32 fd);
AEEResult enable_etm();
AEEResult disable_etm();
};

File diff suppressed because it is too large Load Diff

View File

@@ -16,9 +16,8 @@
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "htp-ctx.h"
#include "htp-msg.h"
#include "htp-ops.h"
#include "htp-ops.h"
#include "hmx-ops.h"
#define MM_SPAD_SRC0_NROWS 16
#define MM_SPAD_SRC1_NROWS 16
@@ -1898,11 +1897,11 @@ static void vec_dot_f16_f32_uu_1x1(const int n, float * restrict s, const void *
hvx_vec_store_u(&s[0], 4, rsum);
}
#define htp_matmul_tensors_preamble \
const struct htp_tensor * restrict src0 = octx->src[0]; \
const struct htp_tensor * restrict src1 = octx->src[1]; \
const struct htp_tensor * restrict src2 = octx->src[2]; \
const struct htp_tensor * restrict dst = octx->dst; \
#define htp_matmul_tensors_preamble \
struct htp_tensor * restrict src0 = &octx->src0; \
struct htp_tensor * restrict src1 = &octx->src1; \
struct htp_tensor * restrict src2 = &octx->src2; \
struct htp_tensor * restrict dst = &octx->dst; \
struct htp_spad * restrict src0_spad = &octx->src0_spad; \
struct htp_spad * restrict src1_spad = &octx->src1_spad; \
struct htp_spad * restrict dst_spad = &octx->dst_spad; \
@@ -2224,8 +2223,8 @@ struct mmid_row_mapping {
static void matmul_id(unsigned int nth, unsigned int ith, void * data) {
htp_matmul_preamble;
const struct htp_tensor * restrict ids = octx->src[2];
struct htp_spad * restrict src2_spad = &octx->src2_spad;
struct htp_tensor * restrict ids = &octx->src2;
struct htp_spad * restrict src2_spad = &octx->src2_spad;
uint64_t t1, t2;
t1 = HAP_perf_get_qtimer_count();
@@ -2343,8 +2342,8 @@ static void matmul_id(unsigned int nth, unsigned int ith, void * data) {
static void matvec_id(unsigned int nth, unsigned int ith, void * data) {
htp_matmul_preamble;
const struct htp_tensor * restrict ids = octx->src[2];
struct htp_spad * restrict src2_spad = &octx->src2_spad;
struct htp_tensor * restrict ids = &octx->src2;
struct htp_spad * restrict src2_spad = &octx->src2_spad;
uint64_t t1, t2;
t1 = HAP_perf_get_qtimer_count();
@@ -2613,7 +2612,7 @@ static void quantize_f32_q8x4x2(unsigned int nth, unsigned int ith, void * data)
struct htp_matmul_context * mmctx = data;
struct htp_ops_context * octx = mmctx->octx;
const struct htp_tensor * src = octx->src[1];
const struct htp_tensor * src = &octx->src1;
uint8_t * restrict dst = octx->src1_spad.data;
struct htp_spad * spad = &octx->src0_spad;
uint32_t nrows_per_thread = mmctx->src1_nrows_per_thread;
@@ -2660,7 +2659,7 @@ static void quantize_f32_f16(unsigned int nth, unsigned int ith, void * data) {
struct htp_matmul_context * mmctx = data;
struct htp_ops_context * octx = mmctx->octx;
const struct htp_tensor * src = octx->src[1];
const struct htp_tensor * src = &octx->src1;
uint8_t * restrict dst = octx->src1_spad.data;
uint32_t nrows_per_thread = mmctx->src1_nrows_per_thread;
uint32_t dst_stride = octx->src1_spad.stride;
@@ -2702,7 +2701,7 @@ static void quantize_f16_f16(unsigned int nth, unsigned int ith, void * data) {
struct htp_matmul_context * mmctx = data;
struct htp_ops_context * octx = mmctx->octx;
const struct htp_tensor * src = octx->src[1];
const struct htp_tensor * src = &octx->src1;
uint8_t * restrict dst = octx->src1_spad.data;
uint32_t nrows_per_thread = mmctx->src1_nrows_per_thread;
uint32_t dst_stride = octx->src1_spad.stride;
@@ -2801,7 +2800,7 @@ static void htp_mminit_spad(struct htp_ops_context * octx,
octx->dst_spad.size = octx->dst_spad.size_per_thread * octx->n_threads;
}
static int op_matmul_hvx(struct htp_ops_context * octx) {
int op_matmul(struct htp_ops_context * octx) {
htp_matmul_tensors_preamble;
struct htp_matmul_context mmctx_struct = {0};
@@ -2825,7 +2824,7 @@ static int op_matmul_hvx(struct htp_ops_context * octx) {
worker_callback_t quant_job_func;
worker_callback_t matmul_job_func = src1_nrows > 1 ? matmul_2d : matvec_2d;
bool need_quant = true;
bool need_quant = !(octx->flags & HTP_OPFLAGS_SKIP_QUANTIZE);
if (src0->type == HTP_TYPE_F16) {
// Try optimized f16-f16 path first (src1 in VTCM)
@@ -2839,7 +2838,7 @@ static int op_matmul_hvx(struct htp_ops_context * octx) {
// Default matmul implementation does not support multi-batch src0 (N-vs-N broadcasting).
// It only supports 1-vs-N broadcasting (src0 is 2D) or standard 2D matmul.
const bool is_batched = (ne02 > 1) || (ne03 > 1);
const bool is_permuted = htp_is_permuted(octx->src[0]) || htp_is_permuted(octx->src[1]);
const bool is_permuted = htp_is_permuted(&octx->src0) || htp_is_permuted(&octx->src1);
if (!is_batched && !is_permuted && f16_total_size <= octx->ctx->vtcm_size) {
// Optimized path
@@ -2916,172 +2915,34 @@ static int op_matmul_hvx(struct htp_ops_context * octx) {
return HTP_STATUS_VTCM_TOO_SMALL;
}
// Place src1 spad first. We use it for dyn.quant and may reuse between ops
octx->src1_spad.data = octx->ctx->vtcm_base;
octx->src0_spad.data = octx->src1_spad.data + octx->src1_spad.size;
octx->dst_spad.data = octx->src0_spad.data + octx->src0_spad.size;
octx->src1_spad.src = (src1 == octx->src1_spad.src) ? src1 : NULL;
octx->src0_spad.src = NULL;
octx->dst_spad.src = NULL;
octx->src0_spad.data = octx->ctx->vtcm_base;
octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size;
octx->src0_spad.stride = src0_row_size_padded;
octx->src1_spad.stride = src1_row_size;
if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)
return HTP_STATUS_OK;
if (need_quant && !octx->src1_spad.src) {
if (need_quant) {
const uint32_t n_quant_jobs = MIN(src1_nrows, octx->n_threads);
mmctx->src1_nrows_per_thread = (src1_nrows + n_quant_jobs - 1) / n_quant_jobs;
worker_pool_run_func(octx->ctx->worker_pool, quant_job_func, mmctx, n_quant_jobs);
octx->src1_spad.src = src1;
// Cache where src1 was written so subsequent SKIP_QUANTIZE ops can find it
octx->ctx->prev_src1_spad = octx->src1_spad.data;
} else {
// SKIP_QUANTIZE: Q8 data lives at the address written by the previous
// quantize pass. The current op may have a different src0 size (e.g.
// IQ4_NL vs MXFP4), so src1_spad.data computed above could be wrong.
octx->src1_spad.data = octx->ctx->prev_src1_spad;
}
const uint32_t n_matmul_jobs = octx->n_threads;
worker_pool_run_func(octx->ctx->worker_pool, matmul_job_func, mmctx, n_matmul_jobs);
if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
const uint32_t n_matmul_jobs = octx->n_threads;
worker_pool_run_func(octx->ctx->worker_pool, matmul_job_func, mmctx, n_matmul_jobs);
}
return HTP_STATUS_OK;
}
int op_matmul(struct htp_ops_context * octx) {
htp_matmul_tensors_preamble;
#ifndef HTP_HAS_HMX
return op_matmul_hvx(octx);
#else
if (!octx->ctx->hmx_enabled) {
return op_matmul_hvx(octx);
}
// HMX weight tile requires N to be 32-aligned.
if (src0->ne[1] % 32 != 0) {
return op_matmul_hvx(octx);
}
// HMX supports F16, Q4_0, Q8_0, IQ4_NL, MXFP4 weights.
// Other types fall back to HVX.
uint32_t wtype = src0->type;
if (wtype != HTP_TYPE_F16 && wtype != HTP_TYPE_Q4_0 && wtype != HTP_TYPE_Q8_0 && wtype != HTP_TYPE_IQ4_NL && wtype != HTP_TYPE_MXFP4) {
return op_matmul_hvx(octx);
}
// Quantised HMX path requires K aligned to 256 (x4x2 super-block).
// F16 HMX path requires K aligned to 32 (tile width).
if (wtype != HTP_TYPE_F16 && src0->ne[0] % 256 != 0) {
return op_matmul_hvx(octx);
}
if (wtype == HTP_TYPE_F16 && src0->ne[0] % 32 != 0) {
return op_matmul_hvx(octx);
}
const bool is_batched = (src0->ne[2] * src0->ne[3] > 1 || src1->ne[2] * src1->ne[3] > 1);
// Quantised HMX kernels only handle flat 2D matmul (host already rejects
// batched quantised, but guard here too). F16 batched matmul is handled
// by the dedicated wrapper in hmx-matmul-ops.c.
if (is_batched && src0->type != HTP_TYPE_F16) {
return op_matmul_hvx(octx);
}
// HMX assumes contiguous row-major layout. Fall back for permuted
// tensors where strides are non-monotonic (e.g. transposed KV cache).
if (src0->nb[0] > src0->nb[1] || src1->nb[0] > src1->nb[1]) {
return op_matmul_hvx(octx);
}
// M alignment: when M > 32 but not 32-aligned, we split into
// HMX (first m_hmx = M & ~31 rows) + HVX (remaining m_tail rows).
// When M <= 32 and not 32-aligned, fall back entirely to HVX.
const int m_total = (int) src1->ne[1];
const int m_tail = m_total % 32;
const int m_hmx = m_total - m_tail;
if (m_hmx == 0) {
return op_matmul_hvx(octx);
}
// Always re-quantize src1 since HMX kernel overwrites vtcm/spad,
// so any previously cached quantized data is invalid.
octx->src1_spad.src = NULL;
int k = (int) src0->ne[0]; // inner dimension
int n = (int) src0->ne[1]; // weight columns
// --- Phase 1: HMX on the first m_hmx (32-aligned) rows ---
int ret = -1;
// Row strides in elements. For compact tensors these equal k; for
// permuted attention views they can be larger, so pass the real stride.
const int act_stride = (int)(src1->nb[1] / sizeof(float));
const int wgt_stride = (int)(src0->nb[1] / sizeof(__fp16));
if (src0->type == HTP_TYPE_F16) {
if (is_batched) {
hmx_matmul_w16a32_batched_params_t batch_params = {
.dst = (float *) dst->data,
.activation = (float *) src1->data,
.permuted_weight = (const __fp16 *) src0->data,
.m = m_hmx,
.k = k,
.n = n,
.act_stride = act_stride,
.weight_stride = wgt_stride,
.dst_stride = (int) (dst->nb[1] / sizeof(float)),
.ne02 = ne02,
.ne03 = ne03,
.ne12 = ne12,
.ne13 = ne13,
.src0_nb2 = src0->nb[2],
.src0_nb3 = src0->nb[3],
.src1_nb2 = src1->nb[2],
.src1_nb3 = src1->nb[3],
.dst_nb2 = dst->nb[2],
.dst_nb3 = dst->nb[3],
};
ret = hmx_mat_mul_permuted_w16a32_batched(octx->ctx, &batch_params);
} else {
ret = hmx_mat_mul_permuted_w16a32(octx->ctx,
(float*) dst->data, (float*) src1->data, (const __fp16 *) src0->data,
m_hmx, k, n, act_stride, wgt_stride);
}
} else {
ret = hmx_mat_mul_permuted_qk_0_d16a32(octx->ctx,
(float*) dst->data, (float*) src1->data, (const uint8_t *) src0->data,
m_hmx, k, n, (int) src0->type);
}
if (ret != 0) {
FARF(HIGH, "HMX matmul failed (ret=%d), falling back to HVX", ret);
return op_matmul(octx);
}
// --- Phase 2: HVX on the remaining m_tail rows ---
if (m_tail > 0) {
// copy of src1 and dst
struct htp_tensor src1_tail = *src1;
struct htp_tensor dst_tail = *dst;
src1_tail.ne[1] = m_tail; // only tail rows
dst_tail.ne[1] = m_tail; // only tail rows
// Offset activation and dst pointers past the HMX-processed rows.
// Use nb[1] (row stride in bytes) to compute the byte offset.
src1_tail.data += (uint32_t) m_hmx * src1->nb[1];
dst_tail.data += (uint32_t) m_hmx * dst->nb[1];
octx->src[1] = &src1_tail;
octx->dst = &dst_tail;
FARF(HIGH, "hmx-matmul: HVX tail m_tail %d src1 %p dst %p", m_tail, (void *) src1_tail.data, (void *) dst_tail.data);
return op_matmul_hvx(octx);
}
return 0;
#endif // HTP_HAS_HMX
}
int op_matmul_id(struct htp_ops_context * octx) {
htp_matmul_tensors_preamble;
@@ -3089,7 +2950,7 @@ int op_matmul_id(struct htp_ops_context * octx) {
struct htp_matmul_context * mmctx = &mmctx_struct;
mmctx->octx = octx;
const struct htp_tensor * restrict ids = octx->src[2];
struct htp_tensor * restrict ids = &octx->src2;
const size_t src0_row_size = nb01;
const size_t dst_row_size = nb1;
@@ -3142,17 +3003,11 @@ int op_matmul_id(struct htp_ops_context * octx) {
return HTP_STATUS_VTCM_TOO_SMALL;
}
// Place src1 spad first. We use it for dyn.quant and may reuse in subseq ops.
octx->src1_spad.data = octx->ctx->vtcm_base;
octx->src0_spad.data = octx->src1_spad.data + octx->src1_spad.size;
octx->src2_spad.data = octx->src0_spad.data + octx->src0_spad.size;
octx->src0_spad.data = octx->ctx->vtcm_base;
octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
octx->src2_spad.data = octx->src1_spad.data + octx->src1_spad.size;
octx->dst_spad.data = octx->src2_spad.data + octx->src2_spad.size;
octx->src1_spad.src = (src1 == octx->src1_spad.src) ? src1 : NULL;
octx->src0_spad.src = NULL;
octx->src2_spad.src = NULL;
octx->dst_spad.src = NULL;
octx->src0_spad.stride = src0_row_size_padded;
octx->src1_spad.stride = src1_row_size;
@@ -3176,18 +3031,20 @@ int op_matmul_id(struct htp_ops_context * octx) {
}
}
if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)
return HTP_STATUS_OK;
if (octx->src1_spad.src != src1) {
// Setup worker pool callbacks
if (!(octx->flags & HTP_OPFLAGS_SKIP_QUANTIZE)) {
const uint32_t n_quant_jobs = MIN(src1_nrows, octx->n_threads);
mmctx->src1_nrows_per_thread = (src1_nrows + n_quant_jobs - 1) / n_quant_jobs;
worker_pool_run_func(octx->ctx->worker_pool, quant_job_func, mmctx, n_quant_jobs);
octx->src1_spad.src = src1;
octx->ctx->prev_src1_spad = octx->src1_spad.data;
} else {
octx->src1_spad.data = octx->ctx->prev_src1_spad;
}
const uint32_t n_matmul_jobs = octx->n_threads;
worker_pool_run_func(octx->ctx->worker_pool, matmul_id_job_func, mmctx, n_matmul_jobs);
if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
const uint32_t n_matmul_jobs = octx->n_threads;
worker_pool_run_func(octx->ctx->worker_pool, matmul_id_job_func, mmctx, n_matmul_jobs);
}
return HTP_STATUS_OK;
}

View File

@@ -12,7 +12,7 @@
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "htp-ctx.h"
#include "htp-ops.h"
#include "htp-msg.h"
#include "htp-ops.h"
struct htp_repeat_context {
@@ -32,8 +32,8 @@ struct htp_repeat_context {
static void repeat_job_per_thread(unsigned int nth, unsigned int ith, void * data) {
const struct htp_repeat_context * rctx = (const struct htp_repeat_context *) data;
struct htp_ops_context * octx = rctx->octx;
const struct htp_tensor * src = octx->src[0];
const struct htp_tensor * dst = octx->dst;
const struct htp_tensor * src = &octx->src0;
const struct htp_tensor * dst = &octx->dst;
const uint32_t ne00 = src->ne[0];
const uint32_t ne01 = src->ne[1];
@@ -98,8 +98,8 @@ static void repeat_job_per_thread(unsigned int nth, unsigned int ith, void * dat
}
int op_repeat(struct htp_ops_context * octx) {
const struct htp_tensor * src0 = octx->src[0];
const struct htp_tensor * dst = octx->dst;
const struct htp_tensor * src0 = &octx->src0;
struct htp_tensor * dst = &octx->dst;
// Validate that dst dims are multiples of src dims
if (dst->ne[0] % src0->ne[0] != 0 ||

View File

@@ -15,7 +15,7 @@
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "htp-ctx.h"
#include "htp-ops.h"
#include "htp-msg.h"
#include "htp-ops.h"
// Redefined the types GGML_ROPE_TYPE_NORMAL & GGML_ROPE_TYPE_NEOX as we can't include ggml.h
@@ -253,10 +253,10 @@ static void rope_job_f32(unsigned int nth, unsigned int ith, void * data) {
struct htp_rope_context * rctx = (struct htp_rope_context *) data;
struct htp_ops_context * octx = rctx->octx;
const struct htp_tensor * src0 = octx->src[0];
const struct htp_tensor * src1 = octx->src[1];
const struct htp_tensor * src2 = octx->src[2];
const struct htp_tensor * dst = octx->dst;
const struct htp_tensor * src0 = &octx->src0;
const struct htp_tensor * src1 = &octx->src1;
const struct htp_tensor * src2 = &octx->src2;
struct htp_tensor * dst = &octx->dst;
htp_rope_preamble;
@@ -284,7 +284,7 @@ static void rope_job_f32(unsigned int nth, unsigned int ith, void * data) {
dma_queue * dma_queue = octx->ctx->dma[ith];
const int32_t * pos = (const int32_t *) src1->data;
const float * freq_factors = src2 ? (const float *) src2->data : NULL;
const float * freq_factors = src2->data ? (const float *) src2->data : NULL;
uint32_t ir = 0;
uint32_t prev_i2 = (uint32_t) -1;
@@ -384,10 +384,10 @@ done:
static int execute_op_rope_f32(struct htp_ops_context * octx) {
int err = HTP_STATUS_OK;
const struct htp_tensor * src0 = octx->src[0];
const struct htp_tensor * src1 = octx->src[1];
const struct htp_tensor * src2 = octx->src[2];
const struct htp_tensor * dst = octx->dst;
const struct htp_tensor * src0 = &octx->src0;
const struct htp_tensor * src1 = &octx->src1;
const struct htp_tensor * src2 = &octx->src2;
struct htp_tensor * dst = &octx->dst;
const char * op_type = "rope-f32";
@@ -424,16 +424,19 @@ static int execute_op_rope_f32(struct htp_ops_context * octx) {
return HTP_STATUS_VTCM_TOO_SMALL;
}
// Assign sizes
octx->src0_spad.size_per_thread = src0_spad_per_thread;
octx->dst_spad.size_per_thread = dst_spad_per_thread;
octx->src0_spad.size = n_threads * src0_spad_per_thread;
octx->dst_spad.size = n_threads * dst_spad_per_thread;
octx->src1_spad.size = 0;
octx->src0_spad.data = octx->ctx->vtcm_base; octx->src0_spad.src = NULL;
octx->src1_spad.data = NULL; octx->src1_spad.src = NULL;
octx->dst_spad.data = octx->src0_spad.data + octx->src0_spad.size; octx->dst_spad.src = NULL;
// Assign pointers
octx->src0_spad.data = octx->ctx->vtcm_base;
octx->src1_spad.data = NULL;
octx->dst_spad.data = octx->src0_spad.data + octx->src0_spad.size;
// Fill context
struct htp_rope_context rctx;
memset(&rctx, 0, sizeof(struct htp_rope_context));
@@ -480,7 +483,7 @@ static int execute_op_rope_f32(struct htp_ops_context * octx) {
int op_rope(struct htp_ops_context * octx) {
int err = HTP_STATUS_OK;
switch (octx->src[0]->type) {
switch (octx->src0.type) {
case HTP_TYPE_F32:
err = execute_op_rope_f32(octx);
break;

View File

@@ -14,37 +14,33 @@
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "htp-ctx.h"
#include "htp-ops.h"
#include "htp-msg.h"
#include "htp-ops.h"
#define set_rows_preamble \
const uint32_t ne00 = octx->src[0]->ne[0]; \
const uint32_t ne01 = octx->src[0]->ne[1]; \
const uint32_t ne02 = octx->src[0]->ne[2]; \
const uint32_t ne03 = octx->src[0]->ne[3]; \
\
const uint32_t ne10 = octx->src[1]->ne[0]; \
const uint32_t ne11 = octx->src[1]->ne[1]; \
const uint32_t ne12 = octx->src[1]->ne[2]; \
const uint32_t ne13 = octx->src[1]->ne[3]; \
\
const uint32_t nb01 = octx->src[0]->nb[1]; \
const uint32_t nb02 = octx->src[0]->nb[2]; \
const uint32_t nb03 = octx->src[0]->nb[3]; \
\
const uint32_t nb10 = octx->src[1]->nb[0]; \
const uint32_t nb11 = octx->src[1]->nb[1]; \
const uint32_t nb12 = octx->src[1]->nb[2]; \
\
const uint32_t nb1 = octx->dst->nb[1]; \
const uint32_t nb2 = octx->dst->nb[2]; \
const uint32_t nb3 = octx->dst->nb[3]; \
\
const uint32_t ne0 = octx->dst->ne[0]; \
const uint32_t ne1 = octx->dst->ne[1]; \
const uint32_t ne2 = octx->dst->ne[2]; \
const uint32_t ne3 = octx->dst->ne[3]; \
\
#define set_rows_preamble \
const uint32_t ne00 = octx->src0.ne[0]; \
const uint32_t ne01 = octx->src0.ne[1]; \
const uint32_t ne02 = octx->src0.ne[2]; \
const uint32_t ne03 = octx->src0.ne[3]; \
\
const uint32_t ne10 = octx->src1.ne[0]; \
const uint32_t ne11 = octx->src1.ne[1]; \
const uint32_t ne12 = octx->src1.ne[2]; \
\
const uint32_t nb01 = octx->src0.nb[1]; \
const uint32_t nb02 = octx->src0.nb[2]; \
const uint32_t nb03 = octx->src0.nb[3]; \
\
const uint32_t nb10 = octx->src1.nb[0]; \
const uint32_t nb11 = octx->src1.nb[1]; \
const uint32_t nb12 = octx->src1.nb[2]; \
\
const uint32_t nb1 = octx->dst.nb[1]; \
const uint32_t nb2 = octx->dst.nb[2]; \
const uint32_t nb3 = octx->dst.nb[3]; \
\
const uint32_t ne1 = octx->dst.ne[1]; \
\
const uint32_t nr = ne01;
struct htp_set_rows_context {
@@ -60,14 +56,12 @@ static void set_rows_thread_f32_f32(unsigned int nth, unsigned int ith, void *da
set_rows_preamble;
uint64_t qt = HAP_perf_get_qtimer_count();
// parallelize by rows of src0
const uint32_t dr = srctx->src0_nrows_per_thread;
const uint32_t ir0 = dr * ith;
const uint32_t ir1 = (ir0 + dr < nr) ? (ir0 + dr) : nr;
const bool is_i32 = (octx->src[1]->type == HTP_TYPE_I32);
const bool is_i32 = (octx->src1.type == HTP_TYPE_I32);
for (uint32_t i03 = 0; i03 < ne03; ++i03) {
for (uint32_t i02 = 0; i02 < ne02; ++i02) {
@@ -76,7 +70,7 @@ static void set_rows_thread_f32_f32(unsigned int nth, unsigned int ith, void *da
const uint32_t i11 = fastmodulo(i02, ne11, &srctx->div_ne11);
const uint32_t i10 = i;
const uintptr_t src1_addr = octx->src[1]->data + i10*nb10 + i11*nb11 + i12*nb12;
const uintptr_t src1_addr = octx->src1.data + i10*nb10 + i11*nb11 + i12*nb12;
uint32_t i1 = is_i32 ? *(int32_t *)src1_addr : *(int64_t *)src1_addr;
if (i1 >= ne1) {
@@ -84,18 +78,14 @@ static void set_rows_thread_f32_f32(unsigned int nth, unsigned int ith, void *da
continue;
}
const uintptr_t src0_ptr = octx->src[0]->data + i*nb01 + i02*nb02 + i03*nb03;
const uintptr_t dst_ptr = octx->dst->data + i1*nb1 + i02*nb2 + i03*nb3;
const uintptr_t src0_ptr = octx->src0.data + i*nb01 + i02*nb02 + i03*nb03;
const uintptr_t dst_ptr = octx->dst.data + i1*nb1 + i02*nb2 + i03*nb3;
// copy row
hvx_copy_f32_uu((uint8_t *)dst_ptr, (const uint8_t *)src0_ptr, ne00);
}
}
}
qt = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - qt);
FARF(HIGH, "set-rows-f32-f32 %d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth,
ne00, ne01, ne02, ne03, ir0, ir1, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, (unsigned) qt);
}
static void set_rows_thread_f16_f32(unsigned int nth, unsigned int ith, void *data) {
@@ -104,14 +94,12 @@ static void set_rows_thread_f16_f32(unsigned int nth, unsigned int ith, void *da
set_rows_preamble;
uint64_t qt = HAP_perf_get_qtimer_count();
// parallelize by rows of src0
const uint32_t dr = srctx->src0_nrows_per_thread;
const uint32_t ir0 = dr * ith;
const uint32_t ir1 = (ir0 + dr < nr) ? (ir0 + dr) : nr;
const bool is_i32 = (octx->src[1]->type == HTP_TYPE_I32);
const bool is_i32 = (octx->src1.type == HTP_TYPE_I32);
for (uint32_t i03 = 0; i03 < ne03; ++i03) {
for (uint32_t i02 = 0; i02 < ne02; ++i02) {
@@ -120,7 +108,7 @@ static void set_rows_thread_f16_f32(unsigned int nth, unsigned int ith, void *da
const uint32_t i11 = fastmodulo(i02, ne11, &srctx->div_ne11);
const uint32_t i10 = i;
const uintptr_t src1_addr = octx->src[1]->data + i10*nb10 + i11*nb11 + i12*nb12;
const uintptr_t src1_addr = octx->src1.data + i10*nb10 + i11*nb11 + i12*nb12;
uint32_t i1 = is_i32 ? *(int32_t *)src1_addr : *(int64_t *)src1_addr;
if (i1 >= ne1) {
@@ -128,17 +116,13 @@ static void set_rows_thread_f16_f32(unsigned int nth, unsigned int ith, void *da
continue;
}
const uint8_t* src0_ptr = (const uint8_t *) octx->src[0]->data + i*nb01 + i02*nb02 + i03*nb03;
uint8_t* dst_ptr = (uint8_t *) octx->dst->data + i1*nb1 + i02*nb2 + i03*nb3;
const uint8_t* src0_ptr = (const uint8_t *) octx->src0.data + i*nb01 + i02*nb02 + i03*nb03;
uint8_t* dst_ptr = (uint8_t *) octx->dst.data + i1*nb1 + i02*nb2 + i03*nb3;
hvx_copy_f16_f32_uu(dst_ptr, src0_ptr, ne00);
}
}
}
qt = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - qt);
FARF(HIGH, "set-rows-f16-f32 %d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth,
ne00, ne01, ne02, ne03, ir0, ir1, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, (unsigned) qt);
}
int op_set_rows(struct htp_ops_context * octx) {
@@ -146,15 +130,15 @@ int op_set_rows(struct htp_ops_context * octx) {
const uint32_t n_threads = MIN(nr, octx->n_threads);
if (octx->src[0]->type != HTP_TYPE_F32) {
if (octx->src0.type != HTP_TYPE_F32) {
return HTP_STATUS_NO_SUPPORT;
}
if (octx->dst->type != HTP_TYPE_F32 && octx->dst->type != HTP_TYPE_F16) {
if (octx->dst.type != HTP_TYPE_F32 && octx->dst.type != HTP_TYPE_F16) {
return HTP_STATUS_NO_SUPPORT;
}
if (octx->src[1]->type != HTP_TYPE_I32 && octx->src[1]->type != HTP_TYPE_I64) {
if (octx->src1.type != HTP_TYPE_I32 && octx->src1.type != HTP_TYPE_I64) {
return HTP_STATUS_NO_SUPPORT;
}
@@ -169,7 +153,7 @@ int op_set_rows(struct htp_ops_context * octx) {
srctx.src0_nrows_per_thread = (nr + n_threads - 1) / n_threads;
switch(octx->dst->type) {
switch(octx->dst.type) {
case HTP_TYPE_F32:
worker_pool_run_func(octx->ctx->worker_pool, set_rows_thread_f32_f32, &srctx, n_threads);
break;

View File

@@ -15,89 +15,68 @@
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "htp-ctx.h"
#include "htp-ops.h"
#include "htp-msg.h"
#include "htp-ops.h"
#define htp_softmax_preamble3 \
const uint32_t ne00 = src0->ne[0]; \
const uint32_t ne01 = src0->ne[1]; \
const uint32_t ne02 = src0->ne[2]; \
const uint32_t ne03 = src0->ne[3]; \
\
const uint32_t nb00 = src0->nb[0]; \
const uint32_t nb01 = src0->nb[1]; \
const uint32_t nb02 = src0->nb[2]; \
const uint32_t nb03 = src0->nb[3]; \
\
const uint32_t ne10 = src1 ? src1->ne[0] : 1; \
const uint32_t ne11 = src1 ? src1->ne[1] : 1; \
const uint32_t ne12 = src1 ? src1->ne[2] : 1; \
const uint32_t ne13 = src1 ? src1->ne[3] : 1; \
\
const uint32_t nb10 = src1 ? src1->nb[0] : 1; \
const uint32_t nb11 = src1 ? src1->nb[1] : 1; \
const uint32_t nb12 = src1 ? src1->nb[2] : 1; \
const uint32_t nb13 = src1 ? src1->nb[3] : 1; \
\
const uint32_t ne0 = dst->ne[0]; \
const uint32_t ne1 = dst->ne[1]; \
const uint32_t ne2 = dst->ne[2]; \
const uint32_t ne3 = dst->ne[3]; \
\
const uint32_t nb0 = dst->nb[0]; \
const uint32_t nb1 = dst->nb[1]; \
const uint32_t nb2 = dst->nb[2]; \
#define htp_softmax_preamble3 \
const uint32_t ne00 = src0->ne[0]; \
const uint32_t ne01 = src0->ne[1]; \
const uint32_t ne02 = src0->ne[2]; \
const uint32_t ne03 = src0->ne[3]; \
\
const uint32_t nb00 = src0->nb[0]; \
const uint32_t nb01 = src0->nb[1]; \
const uint32_t nb02 = src0->nb[2]; \
const uint32_t nb03 = src0->nb[3]; \
\
const uint32_t ne10 = (src1->ne[0]) ? src1->ne[0] : 1; \
const uint32_t ne11 = (src1->ne[0]) ? src1->ne[1] : 1; \
const uint32_t ne12 = (src1->ne[0]) ? src1->ne[2] : 1; \
const uint32_t ne13 = (src1->ne[0]) ? src1->ne[3] : 1; \
\
const uint32_t nb10 = (src1->ne[0]) ? src1->nb[0] : 1; \
const uint32_t nb11 = (src1->ne[0]) ? src1->nb[1] : 1; \
const uint32_t nb12 = (src1->ne[0]) ? src1->nb[2] : 1; \
const uint32_t nb13 = (src1->ne[0]) ? src1->nb[3] : 1; \
\
const uint32_t ne0 = dst->ne[0]; \
const uint32_t ne1 = dst->ne[1]; \
const uint32_t ne2 = dst->ne[2]; \
const uint32_t ne3 = dst->ne[3]; \
\
const uint32_t nb0 = dst->nb[0]; \
const uint32_t nb1 = dst->nb[1]; \
const uint32_t nb2 = dst->nb[2]; \
const uint32_t nb3 = dst->nb[3];
struct htp_softmax_context {
struct htp_ops_context * octx;
bool use_f16;
bool use_src1;
uint32_t n_head;
uint32_t n_head_log2;
float scale;
float max_bias;
float m0;
float m1;
float scale;
float max_bias;
float m0;
float m1;
uint32_t src0_nrows_per_thread;
struct fastdiv_values fastdiv_ne01;
struct fastdiv_values fastdiv_ne02;
struct fastdiv_values fastdiv_ne12; // For mask broadcasting
struct fastdiv_values fastdiv_ne13; // For mask broadcasting
size_t spad_stride;
uint32_t src0_nrows_per_thread;
struct htp_ops_context * octx;
};
static void apply_mask(float * restrict wp0,
const float * restrict mp_f32,
const __fp16 * restrict mp_f16,
uint32_t ne00,
float slope,
bool use_f16) {
if (!mp_f32) {
return;
}
if (use_f16) {
for (uint32_t i = 0; i < ne00; ++i) {
wp0[i] += slope * (float) mp_f16[i];
}
} else {
for (uint32_t i = 0; i < ne00; ++i) {
wp0[i] += slope * mp_f32[i];
}
}
}
static void init_softmax_ctx(struct htp_softmax_context * smctx, struct htp_ops_context * octx) {
const struct htp_tensor * src0 = octx->src[0];
const struct htp_tensor * src1 = octx->src[1];
const struct htp_tensor * src0 = &octx->src0;
const struct htp_tensor * src1 = &octx->src1;
memset(smctx, 0, sizeof(struct htp_softmax_context));
memcpy(&smctx->scale, (float *) octx->op_params, sizeof(float));
memcpy(&smctx->scale, (float *) octx->op_params, sizeof(float));
memcpy(&smctx->max_bias, (float *) octx->op_params + 1, sizeof(float));
smctx->n_head = src0->ne[2];
@@ -106,8 +85,8 @@ static void init_softmax_ctx(struct htp_softmax_context * smctx, struct htp_ops_
smctx->m0 = powf(2.0f, -(smctx->max_bias) / smctx->n_head_log2);
smctx->m1 = powf(2.0f, -(smctx->max_bias / 2.0f) / smctx->n_head_log2);
smctx->use_src1 = (src1 != 0);
smctx->use_f16 = (src1 != 0) && (src1->type == HTP_TYPE_F16);
smctx->use_src1 = (src1->ne[0] != 0);
smctx->use_f16 = (src1->ne[0] != 0) && (src1->type == HTP_TYPE_F16);
smctx->octx = octx;
@@ -118,8 +97,8 @@ static void init_softmax_ctx(struct htp_softmax_context * smctx, struct htp_ops_
if (ne01 > 0) smctx->fastdiv_ne01 = init_fastdiv_values(ne01);
if (ne02 > 0) smctx->fastdiv_ne02 = init_fastdiv_values(ne02);
const uint32_t ne12 = src1 ? src1->ne[2] : 1;
const uint32_t ne13 = src1 ? src1->ne[3] : 1;
const uint32_t ne12 = (src1->ne[0]) ? src1->ne[2] : 1;
const uint32_t ne13 = (src1->ne[0]) ? src1->ne[3] : 1;
if (ne12 > 0) smctx->fastdiv_ne12 = init_fastdiv_values(ne12);
if (ne13 > 0) smctx->fastdiv_ne13 = init_fastdiv_values(ne13);
@@ -160,7 +139,10 @@ static void hvx_fast_softmax_prep_f32(const uint8_t * restrict src,
}
}
static void hvx_fast_softmax_f32(const uint8_t * restrict src, uint8_t * restrict dst, uint8_t * restrict pad, const int num_elems) {
static void hvx_fast_softmax_f32(const uint8_t * restrict src,
uint8_t * restrict dst,
uint8_t * restrict pad,
const int num_elems) {
const HVX_Vector * restrict v_src = (HVX_Vector *) src;
HVX_Vector * restrict v_pad = (HVX_Vector *) pad;
HVX_Vector * restrict v_dst = (HVX_Vector *) dst;
@@ -206,20 +188,27 @@ static void hvx_fast_softmax_f32(const uint8_t * restrict src, uint8_t * restric
}
}
static float hvx_softmax_f32(const uint8_t * restrict src, uint8_t * restrict dst, uint8_t * restrict spad, const int num_elems, const float max) {
static float hvx_softmax_f32(const uint8_t * restrict src,
uint8_t * restrict dst,
uint8_t * restrict spad,
const int num_elems,
const float max) {
hvx_sub_scalar_f32(spad, src, max, num_elems);
hvx_exp_f32(dst, spad, num_elems, false);
return hvx_reduce_sum_f32(dst, num_elems);
float sum = hvx_reduce_sum_f32(dst, num_elems);
return sum;
}
static void softmax_job_f32(unsigned int nth, unsigned int ith, void * data) {
struct htp_softmax_context * smctx = (struct htp_softmax_context *) data;
struct htp_ops_context * octx = smctx->octx;
const struct htp_tensor * src0 = octx->src[0];
const struct htp_tensor * src1 = octx->src[1];
const struct htp_tensor * dst = octx->dst;
const struct htp_tensor * src0 = &octx->src0;
const struct htp_tensor * src1 = &octx->src1;
struct htp_tensor * dst = &octx->dst;
htp_softmax_preamble3;
@@ -234,26 +223,22 @@ static void softmax_job_f32(unsigned int nth, unsigned int ith, void * data) {
return;
}
uint64_t qt = HAP_perf_get_qtimer_count();
uint64_t t1, t2;
t1 = HAP_perf_get_qtimer_count();
int is_aligned = 1;
int opt_path = 0;
if (!hex_is_aligned((void *) src0->data, VLEN) || !hex_is_aligned((void *) dst->data, VLEN)) {
is_aligned = 0;
FARF(HIGH, "softmax-f32: unaligned addresses in elementwise op, possibly slower execution\n");
}
// Only use the fast path when aligned AND row size is multiple of VLEN (128 bytes)
// The fast path (hvx_fast_softmax_f32) doesn't handle tail elements
// The non-opt path uses hvx_softmax_f32 which properly handles all sizes via its helper functions
if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) {
opt_path = 1;
}
uint8_t * src0_spad_data = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread);
uint8_t * src1_spad_data = octx->src1_spad.data + (ith * octx->src1_spad.size_per_thread);
uint8_t * dst_spad_data = octx->dst_spad.data + (ith * octx->dst_spad.size_per_thread);
uint8_t * src0_spad_data = octx->src0_spad.data + (ith * smctx->spad_stride);
uint8_t * src1_spad_data = octx->src1_spad.data + (ith * smctx->spad_stride);
uint8_t * dst_spad_data = octx->dst_spad.data + (ith * smctx->spad_stride);
float * wp0 = (float *) src0_spad_data;
float * wp1 = (float *) src1_spad_data;
@@ -293,29 +278,47 @@ static void softmax_job_f32(unsigned int nth, unsigned int ith, void * data) {
// ALiBi
if (i2 != prev_i2) {
const uint32_t h = i2; // head
slope = (smctx->max_bias > 0.0f) ? h < smctx->n_head_log2 ? powf(smctx->m0, h + 1) : powf(smctx->m1, 2 * (h - smctx->n_head_log2) + 1) : 1.0f;
slope = (smctx->max_bias > 0.0f) ?
h < smctx->n_head_log2 ?
powf(smctx->m0, h + 1) :
powf(smctx->m1, 2 * (h - smctx->n_head_log2) + 1) :
1.0f;
prev_i2 = i2;
}
float * sp = (float *) ((char *) src0->data + i1 * nb01 + i2 * nb02 + i3 * nb03);
float * dp = (float *) ((char *) dst->data + i1 * nb1 + i2 * nb2 + i3 * nb3);
float * sp = (float *) ((char *) octx->src0.data + i1 * nb01 + i2 * nb02 + i3 * nb03);
float * dp = (float *) ((char *) octx->dst.data + i1 * nb1 + i2 * nb2 + i3 * nb3);
// broadcast the mask across rows
__fp16 * mp_f16 = (smctx->use_src1) ? (__fp16 *) ((char *) src1->data + i11 * nb11 + i12 * nb12 + i13 * nb13) : NULL;
float * mp_f32 = (smctx->use_src1) ? (float *) ((char *) src1->data + i11 * nb11 + i12 * nb12 + i13 * nb13) : NULL;
__fp16 * mp_f16 = (smctx->use_src1) ?
(__fp16 *) ((char *) octx->src1.data + i11 * nb11 + i12 * nb12 + i13 * nb13) :
NULL;
float * mp_f32 = (smctx->use_src1) ?
(float *) ((char *) octx->src1.data + i11 * nb11 + i12 * nb12 + i13 * nb13) :
NULL;
if ((1 == opt_path) && (mp_f32) && !(smctx->use_f16)) {
hvx_fast_softmax_prep_f32((const uint8_t *) sp, (uint8_t *) wp0, ne00, smctx->scale, (const uint8_t *) mp_f32, slope);
hvx_fast_softmax_f32((const uint8_t *) wp0, (uint8_t *) dp, (uint8_t *) wp1, ne00);
} else if (1 == opt_path) {
hvx_fast_softmax_prep_f32((const uint8_t *) sp, (uint8_t *) wp0, ne00, smctx->scale,
(const uint8_t *) mp_f32, slope);
} else {
hvx_scale_f32((uint8_t *) wp0, (const uint8_t *) sp, ne00, smctx->scale);
apply_mask(wp0, mp_f32, mp_f16, ne00, slope, smctx->use_f16);
if (mp_f32) {
if (smctx->use_f16) {
for (int i = 0; i < ne00; ++i) {
wp0[i] += slope * (float) mp_f16[i];
}
} else {
for (int i = 0; i < ne00; ++i) {
wp0[i] += slope * mp_f32[i];
}
}
}
}
if (1 == opt_path) {
hvx_fast_softmax_f32((const uint8_t *) wp0, (uint8_t *) dp, (uint8_t *) wp1, ne00);
} else {
// Non-optimized path: uses HVX helper functions that properly handle all tensor sizes
// including non-multiples of 32 (the HVX vector lane count for f32)
hvx_scale_f32((uint8_t *) wp0, (const uint8_t *) sp, ne00, smctx->scale);
apply_mask(wp0, mp_f32, mp_f16, ne00, slope, smctx->use_f16);
float max = hvx_reduce_max_f32((const uint8_t *) wp0, ne00);
float sum = hvx_softmax_f32((const uint8_t *) wp0, (uint8_t *) wp2, (uint8_t *) wp1, ne00, max);
sum = sum > 0.0 ? (1.0 / sum) : 1;
@@ -323,47 +326,54 @@ static void softmax_job_f32(unsigned int nth, unsigned int ith, void * data) {
}
}
qt = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - qt);
FARF(HIGH, "softmax-f32 %d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u : opt %u f16 %u usec %u\n", ith, nth,
ne00, ne01, ne02, ne03, src0_start_row, src0_end_row, ne10, ne11, ne12, ne13,
ne0, ne1, ne2, ne3, opt_path, smctx->use_f16, (unsigned) qt);
t2 = HAP_perf_get_qtimer_count();
FARF(HIGH, "softmax-f32 %d/%d/%d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth,
smctx->use_f16, opt_path, ne00, ne01, ne02, ne03, src0_start_row, src0_end_row, ne10, ne11, ne12, ne13,
ne0, ne1, ne2, ne3, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}
static int execute_op_softmax_f32(struct htp_ops_context * octx) {
int err = HTP_STATUS_OK;
const struct htp_tensor * src0 = octx->src[0];
const struct htp_tensor * src1 = octx->src[1];
const struct htp_tensor * dst = octx->dst;
const struct htp_tensor * src0 = &octx->src0;
const struct htp_tensor * src1 = &octx->src1;
struct htp_tensor * dst = &octx->dst;
struct htp_softmax_context smctx;
const char * op_type = "softmax-f32";
init_softmax_ctx(&smctx, octx);
switch (octx->op) {
case HTP_OP_SOFTMAX:
init_softmax_ctx(&smctx, octx);
break;
default:
FARF(ERROR, "Unsupported Op %u\n", octx->op);
return HTP_STATUS_NO_SUPPORT;
}
const uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3];
const uint32_t n_threads = MIN(octx->n_threads, src0_nrows);
smctx.src0_nrows_per_thread = (src0_nrows + n_threads - 1) / n_threads;
const size_t src0_row_size = src0->nb[1];
const size_t src1_row_size = src0_row_size;
const size_t dst_row_size = dst->nb[1];
// VTCM scratchpads for all tensors
// 4 rows per thread, padded to HVX vector size
octx->src0_spad.size_per_thread = hex_round_up(4 * src0_row_size, 128);
octx->src1_spad.size_per_thread = hex_round_up(4 * src1_row_size, 128);
octx->dst_spad.size_per_thread = hex_round_up(4 * dst_row_size, 128);
// N rows per thread, padded to HVX vector size
octx->dst_spad.size = hex_round_up(dst_row_size, 128) * n_threads;
octx->src0_spad.size = hex_round_up(src0_row_size, 128) * n_threads;
octx->src1_spad.size = hex_round_up(src1_row_size, 128) * n_threads;
octx->src0_spad.size = octx->src0_spad.size_per_thread * n_threads;
octx->src1_spad.size = octx->src1_spad.size_per_thread * n_threads;
octx->dst_spad.size = octx->dst_spad.size_per_thread * n_threads;
// Use stride for calculating offset
smctx.spad_stride = hex_round_up(src0_row_size, 128);
size_t spad_size = octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size;
if (src1) {
FARF(HIGH, "%s: %ux%ux%ux%u x %ux%ux%ux%u -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n",
if (src1->ne[0]) {
FARF(HIGH,
"%s: %ux%ux%ux%u x %ux%ux%ux%u -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n",
op_type, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2],
src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], octx->src0_spad.size, octx->src1_spad.size,
octx->dst_spad.size);
@@ -375,17 +385,19 @@ static int execute_op_softmax_f32(struct htp_ops_context * octx) {
// Make sure the reserved vtcm size is sufficient
if (octx->ctx->vtcm_size < spad_size) {
FARF(ERROR, "%s : current VTCM reservation %zu is too small, needed %zu\n", op_type, octx->ctx->vtcm_size, spad_size);
FARF(ERROR, "%s : current VTCM reservation %zu is too small, needed %zu\n", op_type, octx->ctx->vtcm_size,
spad_size);
return HTP_STATUS_VTCM_TOO_SMALL;
}
octx->src0_spad.data = octx->ctx->vtcm_base; octx->src0_spad.src = NULL;
octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size; octx->src1_spad.src = NULL;
octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size; octx->dst_spad.src = NULL;
octx->src0_spad.data = octx->ctx->vtcm_base;
octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size;
if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) return err;
worker_pool_run_func(octx->ctx->worker_pool, softmax_job_f32, &smctx, n_threads);
if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
smctx.src0_nrows_per_thread = (src0_nrows + n_threads - 1) / n_threads;
worker_pool_run_func(octx->ctx->worker_pool, softmax_job_f32, &smctx, n_threads);
}
return err;
}
@@ -393,7 +405,7 @@ static int execute_op_softmax_f32(struct htp_ops_context * octx) {
int op_softmax(struct htp_ops_context * octx) {
int err = HTP_STATUS_OK;
switch (octx->src[0]->type) {
switch (octx->src0.type) {
case HTP_TYPE_F32:
err = execute_op_softmax_f32(octx);
break;

View File

@@ -16,14 +16,14 @@
#include "ggml-common.h"
#include "htp-ctx.h"
#include "hex-dma.h"
#include "htp-ops.h"
#include "htp-msg.h"
#include "htp-ops.h"
#include "hvx-utils.h"
#define htp_ssm_conv_tensors_preamble \
const struct htp_tensor * restrict src0 = octx->src[0]; \
const struct htp_tensor * restrict src1 = octx->src[1]; \
const struct htp_tensor * restrict dst = octx->dst; \
#define htp_ssm_conv_tensors_preamble \
struct htp_tensor * restrict src0 = &octx->src0; \
struct htp_tensor * restrict src1 = &octx->src1; \
struct htp_tensor * restrict dst = &octx->dst; \
struct htp_spad * restrict src0_spad = &octx->src0_spad; \
struct htp_spad * restrict src1_spad = &octx->src1_spad; \
struct htp_spad * restrict dst_spad = &octx->dst_spad; \
@@ -289,9 +289,9 @@ int op_ssm_conv_f32(struct htp_ops_context * octx) {
// Compute gather scratchpad size for src0 and src1
const size_t gather_spad_size = n_threads * VLEN * 2;
octx->src0_spad.data = octx->ctx->vtcm_base + gather_spad_size; octx->src0_spad.src = NULL;
octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size; octx->src1_spad.src = NULL;
octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size; octx->dst_spad.src = NULL;
octx->src0_spad.data = octx->ctx->vtcm_base + gather_spad_size;
octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size;
FARF(HIGH, "ssm_conv-f32: gather-spad:%zu spad-per-thread:(%u:%u:%u) spad-sizes:(%u:%u:%u) spad-data:(%p:%p:%p)\n",
gather_spad_size, octx->src0_spad.size_per_thread, octx->src1_spad.size_per_thread,
@@ -323,9 +323,8 @@ int op_ssm_conv_f32(struct htp_ops_context * octx) {
}
int op_ssm_conv(struct htp_ops_context * octx) {
const struct htp_tensor * dst = octx->dst;
int err = HTP_STATUS_OK;
int err = HTP_STATUS_OK;
struct htp_tensor * dst = &octx->dst;
switch (dst->type) {
case HTP_TYPE_F32:

View File

@@ -14,13 +14,13 @@
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "htp-ctx.h"
#include "htp-ops.h"
#include "htp-msg.h"
#include "htp-ops.h"
#define sum_rows_preamble \
const struct htp_tensor *src0 = octx->src[0]; \
const struct htp_tensor *dst = octx->dst; \
\
#define sum_rows_preamble \
struct htp_tensor *src0 = &octx->src0;\
struct htp_tensor *dst = &octx->dst; \
\
const uint32_t ne00 = src0->ne[0]; \
const uint32_t ne01 = src0->ne[1]; \
const uint32_t ne02 = src0->ne[2]; \
@@ -94,7 +94,7 @@ static void sum_rows_thread_f32(unsigned int nth, unsigned int ith, void *data)
int op_sum_rows(struct htp_ops_context * octx) {
sum_rows_preamble;
if (octx->src[0]->type != HTP_TYPE_F32) {
if (octx->src0.type != HTP_TYPE_F32) {
return HTP_STATUS_NO_SUPPORT;
}

View File

@@ -16,7 +16,7 @@
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "htp-ctx.h"
#include "htp-ops.h"
#include "htp-msg.h"
#include "htp-ops.h"
struct htp_unary_context {
@@ -267,8 +267,8 @@ static void softplus_f32(const float * restrict src,
static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void * data) {
const struct htp_unary_context * uctx = (const struct htp_unary_context *) data;
struct htp_ops_context * octx = uctx->octx;
const struct htp_tensor * src = octx->src[0];
const struct htp_tensor * dst = octx->dst;
const struct htp_tensor * src = &octx->src0;
const struct htp_tensor * dst = &octx->dst;
htp_unary_preamble;
@@ -387,8 +387,8 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void *
static int execute_op_unary_f32(struct htp_ops_context * octx) {
int err = HTP_STATUS_OK;
const struct htp_tensor * src0 = octx->src[0];
const struct htp_tensor * dst = octx->dst;
const struct htp_tensor * src0 = &octx->src0;
struct htp_tensor * dst = &octx->dst;
const char * op_type = NULL;
@@ -490,7 +490,7 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) {
int op_unary(struct htp_ops_context * octx) {
int err = HTP_STATUS_OK;
switch (octx->src[0]->type) {
switch (octx->src0.type) {
case HTP_TYPE_F32:
err = execute_op_unary_f32(octx);
break;

View File

@@ -47,10 +47,6 @@ find_package(hip REQUIRED)
find_package(hipblas REQUIRED)
find_package(rocblas REQUIRED)
if (GGML_HIP_RCCL)
find_package(rccl REQUIRED)
endif()
if (${hip_VERSION} VERSION_LESS 6.1)
message(FATAL_ERROR "At least ROCM/HIP V6.1 is required")
endif()
@@ -122,10 +118,6 @@ if (NOT GGML_HIP_MMQ_MFMA)
add_compile_definitions(GGML_HIP_NO_MMQ_MFMA)
endif()
if (GGML_HIP_RCCL)
add_compile_definitions(GGML_USE_NCCL) # RCCL has the same interface as NCCL.
endif()
if (GGML_HIP_EXPORT_METRICS)
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Rpass-analysis=kernel-resource-usage --save-temps")
endif()
@@ -150,8 +142,4 @@ if (GGML_STATIC)
message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
endif()
if (GGML_HIP_RCCL)
target_link_libraries(ggml-hip PRIVATE ggml-base roc::rccl)
endif()
target_link_libraries(ggml-hip PRIVATE ggml-base hip::host roc::rocblas roc::hipblas)

View File

@@ -736,11 +736,6 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv(ggml_meta
suffix = ne00 % 4 == 0 ? "_4" : "";
}
} break;
case GGML_TYPE_Q1_0:
{
nsg = N_SG_Q1_0;
nr0 = N_R0_Q1_0;
} break;
case GGML_TYPE_Q4_0:
{
nsg = N_SG_Q4_0;
@@ -953,11 +948,6 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv_id(ggml_m
smem = 32*sizeof(float)*nr0;
suffix = ne00 % 4 == 0 ? "_4" : "";
} break;
case GGML_TYPE_Q1_0:
{
nsg = N_SG_Q1_0;
nr0 = N_R0_Q1_0;
} break;
case GGML_TYPE_Q4_0:
{
nsg = N_SG_Q4_0;

View File

@@ -1184,7 +1184,6 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
case GGML_TYPE_F16:
case GGML_TYPE_BF16:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q1_0:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
@@ -1211,7 +1210,6 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
default:
return false;
}
case GGML_TYPE_Q1_0:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:

View File

@@ -8,9 +8,6 @@
//
// TODO: for optimal performance, become function of the device and work size
#define N_R0_Q1_0 8
#define N_SG_Q1_0 2
#define N_R0_Q4_0 4
#define N_SG_Q4_0 2

View File

@@ -2047,7 +2047,6 @@ int ggml_metal_op_mul_mat(ggml_metal_op_t ctx, int idx) {
op->src[0]->type == GGML_TYPE_F32 || // TODO: helper function
op->src[0]->type == GGML_TYPE_F16 ||
op->src[0]->type == GGML_TYPE_BF16 ||
op->src[0]->type == GGML_TYPE_Q1_0 ||
op->src[0]->type == GGML_TYPE_Q4_0 ||
op->src[0]->type == GGML_TYPE_Q4_1 ||
op->src[0]->type == GGML_TYPE_Q5_0 ||

View File

@@ -90,8 +90,6 @@ static ggml_backend_buffer_i ggml_backend_metal_buffer_shared_i = {
/* .memset_tensor = */ ggml_backend_metal_buffer_shared_memset_tensor,
/* .set_tensor = */ ggml_backend_metal_buffer_shared_set_tensor,
/* .get_tensor = */ ggml_backend_metal_buffer_shared_get_tensor,
/* .set_tensor_2d = */ NULL,
/* .get_tensor_2d = */ NULL,
/* .cpy_tensor = */ ggml_backend_metal_buffer_shared_cpy_tensor,
/* .clear = */ ggml_backend_metal_buffer_shared_clear,
/* .reset = */ NULL,
@@ -160,17 +158,15 @@ static void ggml_backend_metal_buffer_private_clear(ggml_backend_buffer_t buffer
}
static ggml_backend_buffer_i ggml_backend_metal_buffer_private_i = {
/* .free_buffer = */ ggml_backend_metal_buffer_private_free_buffer,
/* .get_base = */ ggml_backend_metal_buffer_private_get_base,
/* .init_tensor = */ NULL,
/* .memset_tensor = */ ggml_backend_metal_buffer_private_memset_tensor,
/* .set_tensor = */ ggml_backend_metal_buffer_private_set_tensor,
/* .get_tensor = */ ggml_backend_metal_buffer_private_get_tensor,
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .cpy_tensor = */ ggml_backend_metal_buffer_private_cpy_tensor,
/* .clear = */ ggml_backend_metal_buffer_private_clear,
/* .reset = */ NULL,
/* .free_buffer = */ ggml_backend_metal_buffer_private_free_buffer,
/* .get_base = */ ggml_backend_metal_buffer_private_get_base,
/* .init_tensor = */ NULL,
/* .memset_tensor = */ ggml_backend_metal_buffer_private_memset_tensor,
/* .set_tensor = */ ggml_backend_metal_buffer_private_set_tensor,
/* .get_tensor = */ ggml_backend_metal_buffer_private_get_tensor,
/* .cpy_tensor = */ ggml_backend_metal_buffer_private_cpy_tensor,
/* .clear = */ ggml_backend_metal_buffer_private_clear,
/* .reset = */ NULL,
};
static bool ggml_backend_buffer_is_metal(ggml_backend_buffer_t buffer) {
@@ -567,8 +563,6 @@ static ggml_backend_i ggml_backend_metal_i = {
/* .free = */ ggml_backend_metal_free,
/* .set_tensor_async = */ ggml_backend_metal_set_tensor_async,
/* .get_tensor_async = */ ggml_backend_metal_get_tensor_async,
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ ggml_backend_metal_cpy_tensor_async, // only needed for multi-GPU setups
/* .synchronize = */ ggml_backend_metal_synchronize,
/* .graph_plan_create = */ NULL,

View File

@@ -118,56 +118,6 @@ void dequantize_bf16_t4(device const bfloat4 * src, short il, thread type4 & reg
}
#endif
template <typename type4x4>
void dequantize_q1_0(device const block_q1_0 * xb, short il, thread type4x4 & reg) {
device const uint8_t * qs = xb->qs;
const float d = xb->d;
const float neg_d = -d;
const int byte_offset = il * 2; // il*16 bits = il*2 bytes
const uint8_t b0 = qs[byte_offset];
const uint8_t b1 = qs[byte_offset + 1];
float4x4 reg_f;
reg_f[0][0] = select(neg_d, d, bool(b0 & 0x01));
reg_f[0][1] = select(neg_d, d, bool(b0 & 0x02));
reg_f[0][2] = select(neg_d, d, bool(b0 & 0x04));
reg_f[0][3] = select(neg_d, d, bool(b0 & 0x08));
reg_f[1][0] = select(neg_d, d, bool(b0 & 0x10));
reg_f[1][1] = select(neg_d, d, bool(b0 & 0x20));
reg_f[1][2] = select(neg_d, d, bool(b0 & 0x40));
reg_f[1][3] = select(neg_d, d, bool(b0 & 0x80));
reg_f[2][0] = select(neg_d, d, bool(b1 & 0x01));
reg_f[2][1] = select(neg_d, d, bool(b1 & 0x02));
reg_f[2][2] = select(neg_d, d, bool(b1 & 0x04));
reg_f[2][3] = select(neg_d, d, bool(b1 & 0x08));
reg_f[3][0] = select(neg_d, d, bool(b1 & 0x10));
reg_f[3][1] = select(neg_d, d, bool(b1 & 0x20));
reg_f[3][2] = select(neg_d, d, bool(b1 & 0x40));
reg_f[3][3] = select(neg_d, d, bool(b1 & 0x80));
reg = (type4x4) reg_f;
}
template <typename type4>
void dequantize_q1_0_t4(device const block_q1_0 * xb, short il, thread type4 & reg) {
const float d = xb->d;
const float neg_d = -d;
const int base = il * 4;
const uint8_t byte = xb->qs[base / 8];
const int s = base % 8;
float4 reg_f;
reg_f[0] = select(neg_d, d, bool((byte >> (s )) & 1));
reg_f[1] = select(neg_d, d, bool((byte >> (s + 1)) & 1));
reg_f[2] = select(neg_d, d, bool((byte >> (s + 2)) & 1));
reg_f[3] = select(neg_d, d, bool((byte >> (s + 3)) & 1));
reg = (type4) reg_f;
}
template <typename type4x4>
void dequantize_q4_0(device const block_q4_0 * xb, short il, thread type4x4 & reg) {
device const uint16_t * qs = ((device const uint16_t *)xb + 1);
@@ -202,23 +152,6 @@ void dequantize_q4_0_t4(device const block_q4_0 * xb, short il, thread type4 & r
}
}
void quantize_q1_0(device const float * src, device block_q1_0 & dst) {
float sum_abs = 0.0f;
for (int j = 0; j < QK1_0; j++) {
sum_abs += fabs(src[j]);
}
dst.d = sum_abs / QK1_0;
for (int j = 0; j < QK1_0 / 8; j++) {
dst.qs[j] = 0;
}
for (int j = 0; j < QK1_0; j++) {
if (src[j] >= 0.0f) {
dst.qs[j / 8] |= (1 << (j % 8));
}
}
}
void quantize_q4_0(device const float * src, device block_q4_0 & dst) {
#pragma METAL fp math_mode(safe)
float amax = 0.0f; // absolute max
@@ -3183,35 +3116,6 @@ kernel void kernel_group_norm_f32(
}
}
// Q1_0 dot product: dot = d * (2 * Σ(yl[i] where bit=1) - sumy)
inline float block_q_n_dot_y(device const block_q1_0 * qb_curr, float sumy, thread float * yl, int il) {
device const uint8_t * qs = qb_curr->qs + il / 8;
const uint8_t b0 = qs[0];
const uint8_t b1 = qs[1];
float acc = 0.0f;
acc += select(0.0f, yl[ 0], bool(b0 & 0x01));
acc += select(0.0f, yl[ 1], bool(b0 & 0x02));
acc += select(0.0f, yl[ 2], bool(b0 & 0x04));
acc += select(0.0f, yl[ 3], bool(b0 & 0x08));
acc += select(0.0f, yl[ 4], bool(b0 & 0x10));
acc += select(0.0f, yl[ 5], bool(b0 & 0x20));
acc += select(0.0f, yl[ 6], bool(b0 & 0x40));
acc += select(0.0f, yl[ 7], bool(b0 & 0x80));
acc += select(0.0f, yl[ 8], bool(b1 & 0x01));
acc += select(0.0f, yl[ 9], bool(b1 & 0x02));
acc += select(0.0f, yl[10], bool(b1 & 0x04));
acc += select(0.0f, yl[11], bool(b1 & 0x08));
acc += select(0.0f, yl[12], bool(b1 & 0x10));
acc += select(0.0f, yl[13], bool(b1 & 0x20));
acc += select(0.0f, yl[14], bool(b1 & 0x40));
acc += select(0.0f, yl[15], bool(b1 & 0x80));
return qb_curr->d * (2.0f * acc - sumy);
}
// function for calculate inner product between half a q4_0 block and 16 floats (yl), sumy is SUM(yl[i])
// il indicates where the q4 quants begin (0 or QK4_0/4)
// we assume that the yl's have been multiplied with the appropriate scale factor
@@ -3433,85 +3337,6 @@ void mul_vec_q_n_f32_impl(
}
}
template<int nr0, typename args_t>
void kernel_mul_mv_q1_0_f32_impl(
args_t args,
device const char * src0,
device const char * src1,
device char * dst,
threadgroup char * shmem,
uint3 tgpig,
ushort tiisg,
ushort sgitg) {
const short NSG = FC_mul_mv_nsg;
const int nb = args.ne00/QK1_0;
const int r0 = tgpig.x;
const int r1 = tgpig.y;
const int im = tgpig.z;
const int first_row = (r0 * NSG + sgitg) * nr0;
const uint i12 = im%args.ne12;
const uint i13 = im/args.ne12;
const uint64_t offset1 = r1*args.nb11 + (i12)*args.nb12 + (i13)*args.nb13;
device const float * y = (device const float *) (src1 + offset1);
device const block_q1_0 * ax[nr0];
for (int row = 0; row < nr0; ++row) {
const uint64_t offset0 = (first_row + row)*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
ax[row] = (device const block_q1_0 *) ((device char *) src0 + offset0);
}
float yl[16];
float sumf[nr0] = {0.f};
const short ix = (tiisg/8);
const short il = (tiisg%8)*16;
device const float * yb = y + ix*QK1_0 + il;
for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/8) {
float sumy = 0.f;
FOR_UNROLL (short i = 0; i < 16; i++) {
yl[i] = yb[i];
sumy += yb[i];
}
FOR_UNROLL (short row = 0; row < nr0; row++) {
sumf[row] += block_q_n_dot_y(ax[row] + ib, sumy, yl, il);
}
yb += QK1_0 * (N_SIMDWIDTH/8);
}
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
for (int row = 0; row < nr0; ++row) {
const float tot = simd_sum(sumf[row]);
if (tiisg == 0 && first_row + row < args.ne01) {
dst_f32[first_row + row] = tot;
}
}
}
[[host_name("kernel_mul_mv_q1_0_f32")]]
kernel void kernel_mul_mv_q1_0_f32(
constant ggml_metal_kargs_mul_mv & args,
device const char * src0,
device const char * src1,
device char * dst,
uint3 tgpig[[threadgroup_position_in_grid]],
ushort tiisg[[thread_index_in_simdgroup]],
ushort sgitg[[simdgroup_index_in_threadgroup]]) {
kernel_mul_mv_q1_0_f32_impl<N_R0_Q1_0, constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg);
}
kernel void kernel_mul_mv_q4_0_f32(
constant ggml_metal_kargs_mul_mv & args,
device const char * src0,
@@ -3904,11 +3729,6 @@ template [[host_name("kernel_mul_mv_ext_bf16_f32_r1_4")]] kernel mul_mv_ext_q4
template [[host_name("kernel_mul_mv_ext_bf16_f32_r1_5")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, bfloat4, 4, dequantize_bf16_t4>;
#endif
template [[host_name("kernel_mul_mv_ext_q1_0_f32_r1_2")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, block_q1_0, 128, dequantize_q1_0_t4>;
template [[host_name("kernel_mul_mv_ext_q1_0_f32_r1_3")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, block_q1_0, 128, dequantize_q1_0_t4>;
template [[host_name("kernel_mul_mv_ext_q1_0_f32_r1_4")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, block_q1_0, 128, dequantize_q1_0_t4>;
template [[host_name("kernel_mul_mv_ext_q1_0_f32_r1_5")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, block_q1_0, 128, dequantize_q1_0_t4>;
template [[host_name("kernel_mul_mv_ext_q4_0_f32_r1_2")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, block_q4_0, 32, dequantize_q4_0_t4>;
template [[host_name("kernel_mul_mv_ext_q4_0_f32_r1_3")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, block_q4_0, 32, dequantize_q4_0_t4>;
template [[host_name("kernel_mul_mv_ext_q4_0_f32_r1_4")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, block_q4_0, 32, dequantize_q4_0_t4>;
@@ -7313,7 +7133,6 @@ kernel void kernel_cpy_f32_q(
typedef decltype(kernel_cpy_f32_q<QK8_0, block_q8_0, quantize_q8_0>) cpy_f_q_t;
template [[host_name("kernel_cpy_f32_q8_0")]] kernel cpy_f_q_t kernel_cpy_f32_q<QK8_0, block_q8_0, quantize_q8_0>;
template [[host_name("kernel_cpy_f32_q1_0")]] kernel cpy_f_q_t kernel_cpy_f32_q<QK1_0, block_q1_0, quantize_q1_0>;
template [[host_name("kernel_cpy_f32_q4_0")]] kernel cpy_f_q_t kernel_cpy_f32_q<QK4_0, block_q4_0, quantize_q4_0>;
template [[host_name("kernel_cpy_f32_q4_1")]] kernel cpy_f_q_t kernel_cpy_f32_q<QK4_1, block_q4_1, quantize_q4_1>;
template [[host_name("kernel_cpy_f32_q5_0")]] kernel cpy_f_q_t kernel_cpy_f32_q<QK5_0, block_q5_0, quantize_q5_0>;
@@ -7354,14 +7173,12 @@ kernel void kernel_cpy_q_f32(
typedef decltype(kernel_cpy_q_f32<float4x4, block_q4_0, 2, dequantize_q4_0>) cpy_q_f_t;
template [[host_name("kernel_cpy_q1_0_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32<float4x4, block_q1_0, 8, dequantize_q1_0>;
template [[host_name("kernel_cpy_q4_0_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32<float4x4, block_q4_0, 2, dequantize_q4_0>;
template [[host_name("kernel_cpy_q4_1_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32<float4x4, block_q4_1, 2, dequantize_q4_1>;
template [[host_name("kernel_cpy_q5_0_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32<float4x4, block_q5_0, 2, dequantize_q5_0>;
template [[host_name("kernel_cpy_q5_1_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32<float4x4, block_q5_1, 2, dequantize_q5_1>;
template [[host_name("kernel_cpy_q8_0_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32<float4x4, block_q8_0, 2, dequantize_q8_0>;
template [[host_name("kernel_cpy_q1_0_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<half4x4, block_q1_0, 8, dequantize_q1_0>;
template [[host_name("kernel_cpy_q4_0_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<half4x4, block_q4_0, 2, dequantize_q4_0>;
template [[host_name("kernel_cpy_q4_1_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<half4x4, block_q4_1, 2, dequantize_q4_1>;
template [[host_name("kernel_cpy_q5_0_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<half4x4, block_q5_0, 2, dequantize_q5_0>;
@@ -9959,7 +9776,6 @@ template [[host_name("kernel_get_rows_bf16")]] kernel get_rows_f_t kernel_get_ro
typedef decltype(kernel_get_rows_q<block_q4_0, 2, dequantize_q4_0>) get_rows_q_t;
template [[host_name("kernel_get_rows_q1_0")]] kernel get_rows_q_t kernel_get_rows_q<block_q1_0, 8, dequantize_q1_0>;
template [[host_name("kernel_get_rows_q4_0")]] kernel get_rows_q_t kernel_get_rows_q<block_q4_0, 2, dequantize_q4_0>;
template [[host_name("kernel_get_rows_q4_1")]] kernel get_rows_q_t kernel_get_rows_q<block_q4_1, 2, dequantize_q4_1>;
template [[host_name("kernel_get_rows_q5_0")]] kernel get_rows_q_t kernel_get_rows_q<block_q5_0, 2, dequantize_q5_0>;
@@ -10022,7 +9838,6 @@ template [[host_name("kernel_mul_mm_f16_f32")]] kernel mul_mm_t kernel_mul_m
#if defined(GGML_METAL_HAS_BF16)
template [[host_name("kernel_mul_mm_bf16_f32")]] kernel mul_mm_t kernel_mul_mm<bfloat, bfloat4x4, simdgroup_bfloat8x8, bfloat, bfloat2x4, simdgroup_bfloat8x8, bfloat4x4, 1, dequantize_bf16, bfloat, bfloat4x4, float, float2x4>;
#endif
template [[host_name("kernel_mul_mm_q1_0_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q1_0, 8, dequantize_q1_0, float, float4x4, float, float2x4>;
template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q4_0, 2, dequantize_q4_0, float, float4x4, float, float2x4>;
template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q4_1, 2, dequantize_q4_1, float, float4x4, float, float2x4>;
template [[host_name("kernel_mul_mm_q5_0_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q5_0, 2, dequantize_q5_0, float, float4x4, float, float2x4>;
@@ -10046,7 +9861,6 @@ template [[host_name("kernel_mul_mm_iq4_xs_f32")]] kernel mul_mm_t kernel_mul_m
template [[host_name("kernel_mul_mm_f32_f16")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, float4x4, 1, dequantize_f32, float, float4x4, half, half2x4>;
template [[host_name("kernel_mul_mm_f16_f16")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, half4x4, 1, dequantize_f16, half, half4x4, half, half2x4>;
template [[host_name("kernel_mul_mm_q1_0_f16")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q1_0, 8, dequantize_q1_0, float, float4x4, half, half2x4>;
template [[host_name("kernel_mul_mm_q4_0_f16")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q4_0, 2, dequantize_q4_0, float, float4x4, half, half2x4>;
template [[host_name("kernel_mul_mm_q4_1_f16")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q4_1, 2, dequantize_q4_1, float, float4x4, half, half2x4>;
template [[host_name("kernel_mul_mm_q5_0_f16")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q5_0, 2, dequantize_q5_0, float, float4x4, half, half2x4>;
@@ -10079,7 +9893,6 @@ template [[host_name("kernel_mul_mm_id_f16_f32")]] kernel mul_mm_id kernel_m
#if defined(GGML_METAL_HAS_BF16)
template [[host_name("kernel_mul_mm_id_bf16_f32")]] kernel mul_mm_id kernel_mul_mm_id<bfloat, bfloat4x4, simdgroup_bfloat8x8, bfloat, bfloat2x4, simdgroup_bfloat8x8, bfloat4x4, 1, dequantize_bf16, bfloat, bfloat4x4, float, float2x4>;
#endif
template [[host_name("kernel_mul_mm_id_q1_0_f32")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q1_0, 8, dequantize_q1_0, float, float4x4, float, float2x4>;
template [[host_name("kernel_mul_mm_id_q4_0_f32")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q4_0, 2, dequantize_q4_0, float, float4x4, float, float2x4>;
template [[host_name("kernel_mul_mm_id_q4_1_f32")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q4_1, 2, dequantize_q4_1, float, float4x4, float, float2x4>;
template [[host_name("kernel_mul_mm_id_q5_0_f32")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q5_0, 2, dequantize_q5_0, float, float4x4, float, float2x4>;
@@ -10103,7 +9916,6 @@ template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]] kernel mul_mm_id kernel_m
template [[host_name("kernel_mul_mm_id_f32_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, float4x4, 1, dequantize_f32, float, float4x4, half, half2x4>;
template [[host_name("kernel_mul_mm_id_f16_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, half4x4, 1, dequantize_f16, half, half4x4, half, half2x4>;
template [[host_name("kernel_mul_mm_id_q1_0_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q1_0, 8, dequantize_q1_0, float, float4x4, half, half2x4>;
template [[host_name("kernel_mul_mm_id_q4_0_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q4_0, 2, dequantize_q4_0, float, float4x4, half, half2x4>;
template [[host_name("kernel_mul_mm_id_q4_1_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q4_1, 2, dequantize_q4_1, float, float4x4, half, half2x4>;
template [[host_name("kernel_mul_mm_id_q5_0_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q5_0, 2, dequantize_q5_0, float, float4x4, half, half2x4>;
@@ -10258,7 +10070,6 @@ template [[host_name("kernel_mul_mv_id_bf16_f32_4")]] kernel kernel_mul_mv_id_4
template [[host_name("kernel_mul_mv_id_q8_0_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_q8_0_f32_impl<N_R0_Q8_0>>>;
template [[host_name("kernel_mul_mv_id_q1_0_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_q1_0_f32_impl<N_R0_Q1_0>>>;
template [[host_name("kernel_mul_mv_id_q4_0_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<mul_vec_q_n_f32_impl<block_q4_0, N_R0_Q4_0>>>;
template [[host_name("kernel_mul_mv_id_q4_1_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<mul_vec_q_n_f32_impl<block_q4_1, N_R0_Q4_1>>>;
template [[host_name("kernel_mul_mv_id_q5_0_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<mul_vec_q_n_f32_impl<block_q5_0, N_R0_Q5_0>>>;

View File

@@ -90,8 +90,6 @@ set(GGML_OPENCL_KERNELS
mul_mv_q4_1_f32_flat
mul_mv_q4_k_f32
mul_mv_q4_k_f32_flat
mul_mv_q5_k_f32
mul_mv_q5_k_f32_flat
mul_mv_q6_k_f32
mul_mv_q6_k_f32_flat
mul_mv_q8_0_f32
@@ -111,7 +109,6 @@ set(GGML_OPENCL_KERNELS
mul_mm_q4_1_f32_l4_lm
mul_mm_q8_0_f32_l4_lm
mul_mm_q4_k_f32_l4_lm
mul_mm_q5_k_f32_l4_lm
mul_mm_q6_k_f32_l4_lm
mul_mm_q8_0_f32_8x4
gemv_noshuffle_q4_1_f32

View File

@@ -541,15 +541,12 @@ struct ggml_backend_opencl_context {
cl_kernel kernel_convert_block_q4_K_noshuffle;
cl_kernel kernel_restore_block_q4_K_noshuffle;
cl_kernel kernel_convert_block_q4_K, kernel_restore_block_q4_K;
cl_kernel kernel_convert_block_q5_K, kernel_restore_block_q5_K;
cl_kernel kernel_convert_block_q6_K, kernel_restore_block_q6_K;
cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
cl_kernel kernel_mul_mv_q4_1_f32;
cl_kernel kernel_mul_mv_q4_1_f32_flat;
cl_kernel kernel_mul_mv_q4_K_f32;
cl_kernel kernel_mul_mv_q4_K_f32_flat;
cl_kernel kernel_mul_mv_q5_K_f32;
cl_kernel kernel_mul_mv_q5_K_f32_flat;
cl_kernel kernel_mul_mv_q6_K_f32;
cl_kernel kernel_mul_mv_q6_K_f32_flat;
cl_kernel kernel_mul_mv_mxfp4_f32, kernel_mul_mv_mxfp4_f32_flat;
@@ -590,7 +587,6 @@ struct ggml_backend_opencl_context {
cl_kernel kernel_mul_mm_q4_1_f32_l4_lm;
cl_kernel kernel_mul_mm_q8_0_f32_l4_lm;
cl_kernel kernel_mul_mm_q4_k_f32_l4_lm;
cl_kernel kernel_mul_mm_q5_k_f32_l4_lm;
cl_kernel kernel_mul_mm_q6_k_f32_l4_lm;
std::vector<ProfilingInfo> profiling_info;
@@ -942,8 +938,6 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
CL_CHECK((backend_ctx->kernel_restore_block_q4_K = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_K", &err), err));
CL_CHECK((backend_ctx->kernel_convert_block_q4_K_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_K_noshuffle", &err), err));
CL_CHECK((backend_ctx->kernel_restore_block_q4_K_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_K_noshuffle", &err), err));
CL_CHECK((backend_ctx->kernel_convert_block_q5_K = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q5_K", &err), err));
CL_CHECK((backend_ctx->kernel_restore_block_q5_K = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q5_K", &err), err));
CL_CHECK((backend_ctx->kernel_convert_block_q6_K = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q6_K", &err), err));
CL_CHECK((backend_ctx->kernel_restore_block_q6_K = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q6_K", &err), err));
CL_CHECK((backend_ctx->kernel_convert_block_q6_K_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q6_K_noshuffle", &err), err));
@@ -1255,39 +1249,6 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
GGML_LOG_CONT(".");
}
// mul_mv_q5_k_f32
{
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src {
#include "mul_mv_q5_k_f32.cl.h"
};
#else
const std::string kernel_src = read_file("mul_mv_q5_k_f32.cl");
#endif
cl_program prog =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_mul_mv_q5_K_f32 = clCreateKernel(prog, "kernel_mul_mv_q5_K_f32", &err), err));
CL_CHECK(clReleaseProgram(prog));
GGML_LOG_CONT(".");
}
// mul_mv_q5_k_f32_flat
{
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src {
#include "mul_mv_q5_k_f32_flat.cl.h"
};
#else
const std::string kernel_src = read_file("mul_mv_q5_k_f32_flat.cl");
#endif
cl_program prog =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_mul_mv_q5_K_f32_flat = clCreateKernel(prog, "kernel_mul_mv_q5_K_f32_flat", &err), err));
CL_CHECK(clReleaseProgram(prog));
}
// mul_mv_q6_k_f32
{
#ifdef GGML_OPENCL_EMBED_KERNELS
@@ -1595,23 +1556,6 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
GGML_LOG_CONT(".");
}
// mul_mm_q5_k_f32_l4_lm
{
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src {
#include "mul_mm_q5_k_f32_l4_lm.cl.h"
};
#else
const std::string kernel_src = read_file("mul_mm_q5_k_f32_l4_lm.cl");
#endif
cl_program prog =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_mul_mm_q5_k_f32_l4_lm = clCreateKernel(prog, "kernel_mul_mm_q5_k_f32_l4_lm", &err), err));
CL_CHECK(clReleaseProgram(prog));
GGML_LOG_CONT(".");
}
// mul_mm_f16_f32_kq_kqv
{
#ifdef GGML_OPENCL_EMBED_KERNELS
@@ -3586,58 +3530,6 @@ struct ggml_tensor_extra_cl_q4_K {
}
};
struct ggml_tensor_extra_cl_q5_K {
// Lower 4 bits of quantized weights.
cl_mem q = nullptr;
// Upper 1 bit of quantized weights.
cl_mem qh = nullptr;
// Scales for each block.
cl_mem s = nullptr;
// Scales for each super block.
cl_mem d = nullptr;
// Min for each super block.
cl_mem dm = nullptr;
size_t size_q = 0;
size_t size_qh = 0;
size_t size_s = 0;
size_t size_d = 0;
size_t size_dm = 0;
~ggml_tensor_extra_cl_q5_K() {
reset();
}
void reset() {
if (q != nullptr) {
CL_CHECK(clReleaseMemObject(q));
q = nullptr;
}
if (qh != nullptr) {
CL_CHECK(clReleaseMemObject(qh));
qh = nullptr;
}
if (s != nullptr) {
CL_CHECK(clReleaseMemObject(s));
s = nullptr;
}
if (d != nullptr) {
CL_CHECK(clReleaseMemObject(d));
d = nullptr;
}
if (dm != nullptr) {
CL_CHECK(clReleaseMemObject(dm));
dm = nullptr;
}
size_q = 0;
size_qh = 0;
size_s = 0;
size_d = 0;
size_dm = 0;
}
};
struct ggml_tensor_extra_cl_q6_K {
// Lower 4 bits of quantized weights.
cl_mem ql = nullptr;
@@ -4053,7 +3945,6 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
} else if (op->src[0]->type == GGML_TYPE_Q4_0 || op->src[0]->type == GGML_TYPE_Q4_1 ||
op->src[0]->type == GGML_TYPE_MXFP4 ||
op->src[0]->type == GGML_TYPE_Q4_K ||
op->src[0]->type == GGML_TYPE_Q5_K ||
op->src[0]->type == GGML_TYPE_Q6_K) {
return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
} else if (op->src[0]->type == GGML_TYPE_Q8_0) {
@@ -4172,8 +4063,6 @@ static ggml_backend_i ggml_backend_opencl_i = {
/* .set_tensor_async = */ NULL, /* ggml_backend_opencl_set_tensor_async */
/* .get_tensor_async = */ NULL, /* ggml_backend_opencl_get_tensor_async */
/* .cpy_tensor_async = */ NULL, /* ggml_backend_opencl_cpy_tensor_async */
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .synchronize = */ ggml_backend_opencl_synchronize,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,
@@ -4262,12 +4151,6 @@ struct ggml_backend_opencl_buffer_context {
for (ggml_tensor_extra_cl_q6_K * e : temp_tensor_extras_q6_K_in_use) {
delete e;
}
for (ggml_tensor_extra_cl_q5_K * e : temp_tensor_extras_q5_K) {
delete e;
}
for (ggml_tensor_extra_cl_q5_K * e : temp_tensor_extras_q5_K_in_use) {
delete e;
}
}
ggml_tensor_extra_cl * ggml_opencl_alloc_temp_tensor_extra() {
@@ -4360,21 +4243,6 @@ struct ggml_backend_opencl_buffer_context {
return extra;
}
ggml_tensor_extra_cl_q5_K * ggml_opencl_alloc_temp_tensor_extra_q5_K() {
ggml_tensor_extra_cl_q5_K * extra;
if (temp_tensor_extras_q5_K.empty()) {
extra = new ggml_tensor_extra_cl_q5_K();
} else {
extra = temp_tensor_extras_q5_K.back();
temp_tensor_extras_q5_K.pop_back();
}
temp_tensor_extras_q5_K_in_use.push_back(extra);
extra->reset();
return extra;
}
ggml_tensor_extra_cl_q6_K * ggml_opencl_alloc_temp_tensor_extra_q6_K() {
ggml_tensor_extra_cl_q6_K * extra;
if (temp_tensor_extras_q6_K.empty()) {
@@ -4421,11 +4289,6 @@ struct ggml_backend_opencl_buffer_context {
}
temp_tensor_extras_q4_K_in_use.clear();
for (ggml_tensor_extra_cl_q5_K * e : temp_tensor_extras_q5_K_in_use) {
temp_tensor_extras_q5_K.push_back(e);
}
temp_tensor_extras_q5_K_in_use.clear();
for (ggml_tensor_extra_cl_q6_K * e : temp_tensor_extras_q6_K_in_use) {
temp_tensor_extras_q6_K.push_back(e);
}
@@ -4449,8 +4312,6 @@ struct ggml_backend_opencl_buffer_context {
std::vector<ggml_tensor_extra_cl_q8_0 *> temp_tensor_extras_q8_0_in_use;
std::vector<ggml_tensor_extra_cl_q4_K *> temp_tensor_extras_q4_K;
std::vector<ggml_tensor_extra_cl_q4_K *> temp_tensor_extras_q4_K_in_use;
std::vector<ggml_tensor_extra_cl_q5_K *> temp_tensor_extras_q5_K;
std::vector<ggml_tensor_extra_cl_q5_K *> temp_tensor_extras_q5_K_in_use;
std::vector<ggml_tensor_extra_cl_q6_K *> temp_tensor_extras_q6_K;
std::vector<ggml_tensor_extra_cl_q6_K *> temp_tensor_extras_q6_K_in_use;
@@ -5289,97 +5150,6 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
return;
}
if (tensor->type == GGML_TYPE_Q5_K) {
ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
GGML_ASSERT(extra_orig && "Tesnors in OpenCL backend should have been allocated and initialized");
// Allocate the new extra and create aliases from the original.
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
ggml_tensor_extra_cl_q5_K * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_q5_K();
size_t size_q = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/2;
size_t size_qh = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/8;
size_t size_s = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*(3*ggml_blck_size(tensor->type)/64);
size_t size_d = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
size_t size_dm = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
GGML_ASSERT(size_q + size_qh + size_s + size_d + size_dm == ggml_nbytes(tensor) &&
"Incorrect tensor size");
cl_int err;
cl_mem data_device;
CL_CHECK((data_device = clCreateBuffer(context, CL_MEM_READ_WRITE, ggml_nbytes(tensor), NULL, &err), err));
CL_CHECK(clEnqueueWriteBuffer(queue, data_device, CL_TRUE, 0, ggml_nbytes(tensor), data, 0, NULL, NULL));
cl_buffer_region region;
// Create subbuffer for d.
region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
region.size = size_d;
extra->d = clCreateSubBuffer(
extra_orig->data_device, CL_MEM_READ_WRITE,
CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
CL_CHECK(err);
auto previous_origin = region.origin;
// Create subbuffer for dm.
region.origin = align_to(previous_origin + size_d, backend_ctx->alignment);
region.size = size_dm;
extra->dm = clCreateSubBuffer(
extra_orig->data_device, CL_MEM_READ_WRITE,
CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
CL_CHECK(err);
previous_origin = region.origin;
// Create subbuffer for s.
region.origin = align_to(previous_origin + size_dm, backend_ctx->alignment);
region.size = size_s;
extra->s = clCreateSubBuffer(
extra_orig->data_device, CL_MEM_READ_WRITE,
CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
CL_CHECK(err);
previous_origin = region.origin;
// Create subbuffer for q (lower 4 bits)
region.origin = align_to(previous_origin + size_s, backend_ctx->alignment);
region.size = size_q;
extra->q = clCreateSubBuffer(
extra_orig->data_device, CL_MEM_READ_WRITE,
CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
CL_CHECK(err);
previous_origin = region.origin;
// Create subbuffer for qh (upper 1 bit)
region.origin = align_to(previous_origin + size_q, backend_ctx->alignment);
region.size = size_qh;
CL_CHECK((extra->qh = clCreateSubBuffer(extra_orig->data_device, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, &region, &err), err));
CL_CHECK(err);
cl_kernel kernel = backend_ctx->kernel_convert_block_q5_K;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->qh));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->s));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra->d));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), &extra->dm));
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
size_t local_work_size[] = {64, 1, 1};
cl_event evt;
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
CL_CHECK(clWaitForEvents(1, &evt));
CL_CHECK(clReleaseMemObject(data_device));
extra->size_q = size_q;
extra->size_qh = size_qh;
extra->size_s = size_s;
extra->size_d = size_d;
extra->size_dm = size_dm;
tensor->extra = extra;
return;
}
if (tensor->type == GGML_TYPE_Q6_K) {
ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
GGML_ASSERT(extra_orig && "Tesnors in OpenCL backend should have been allocated and initialized");
@@ -5886,35 +5656,6 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
CL_CHECK(clReleaseMemObject(data_device));
return;
}
if (tensor->type == GGML_TYPE_Q5_K) {
ggml_tensor_extra_cl_q5_K * extra = (ggml_tensor_extra_cl_q5_K *)tensor->extra;
cl_int err;
cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
ggml_nbytes(tensor), NULL, &err);
CL_CHECK(err);
cl_kernel kernel = backend_ctx->kernel_restore_block_q5_K;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->qh));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->s));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra->dm));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), &data_device));
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
size_t local_work_size[] = {1, 1, 1};
cl_event evt;
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
global_work_size, local_work_size, 0, NULL, &evt));
CL_CHECK(clWaitForEvents(1, &evt));
CL_CHECK(clEnqueueReadBuffer(
queue, data_device, CL_TRUE, offset,
size, data, 0, NULL, NULL));
CL_CHECK(clReleaseMemObject(data_device));
return;
}
if (tensor->type == GGML_TYPE_Q6_K) {
ggml_tensor_extra_cl_q6_K * extra = (ggml_tensor_extra_cl_q6_K *)tensor->extra;
@@ -6037,8 +5778,6 @@ static ggml_backend_buffer_i ggml_backend_opencl_buffer_interface = {
/* .memset_tensor = */ NULL,
/* .set_tensor = */ ggml_backend_opencl_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_opencl_buffer_get_tensor,
/* .set_tensor_2d = */ NULL,
/* .get_tensor_2d = */ NULL,
/* .cpy_tensor = */ NULL,
/* .clear = */ ggml_backend_opencl_buffer_clear,
/* .reset = */ ggml_backend_opencl_buffer_reset,
@@ -10478,7 +10217,6 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
ggml_tensor_extra_cl_mxfp4 * extra0_mxfp4 = (ggml_tensor_extra_cl_mxfp4 *)src0->extra;
ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra;
ggml_tensor_extra_cl_q4_K * extra0_q4_K = (ggml_tensor_extra_cl_q4_K *)src0->extra;
ggml_tensor_extra_cl_q5_K * extra0_q5_K = (ggml_tensor_extra_cl_q5_K *)src0->extra;
ggml_tensor_extra_cl_q6_K * extra0_q6_K = (ggml_tensor_extra_cl_q6_K *)src0->extra;
#endif
@@ -11183,51 +10921,6 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
return;
}
case GGML_TYPE_Q5_K: {
if (ne11 < 32) {
break;
}
if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1)) {
break;
}
kernel = backend_ctx->kernel_mul_mm_q5_k_f32_l4_lm;
nth0 = 128; // calculated as (BM*BN)/(TM*TN)
int batch_stride_a = ne00*ne01;
int batch_stride_b = ne10*ne11;
int batch_stride_d = ne0*ne1;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q5_K->q));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q5_K->qh));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q5_K->s));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra0_q5_K->d));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra0_q5_K->dm));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), &extra1->data_device));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &offset1));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne01));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne02));
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne11));
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne12));
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne10)); // stride_a
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne10)); // stride_b
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne01)); // stride_d
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &batch_stride_a));
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &batch_stride_b));
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &batch_stride_d));
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &r2));
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &r3));
// 64 is block tile size BM and BN - change here when BM and BN in the kernel are changed.
size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13};
size_t local_work_size[] = {(size_t)nth0, 1, 1};
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
return;
}
case GGML_TYPE_Q6_K: {
if (ne11 < 32) {
break;
@@ -11745,81 +11438,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
#endif // GGML_OPENCL_SOA_Q
break;
}
case GGML_TYPE_Q5_K: {
#ifdef GGML_OPENCL_SOA_Q
kernel = backend_ctx->kernel_mul_mv_q5_K_f32_flat;
if (backend_ctx->gpu_family == INTEL) {
nth0 = 16;
nth1 = 1;
ndst = 4;
} else if (backend_ctx->gpu_family == ADRENO) {
nth0 = 64;
nth1 = 2;
ndst = 16;
} else {
GGML_ASSERT(false && "TODO: Unknown GPU");
}
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q5_K->q));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q5_K->qh));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q5_K->s));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra0_q5_K->d));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra0_q5_K->dm));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), &extra1->data_device));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &offset1));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne01));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01));
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03));
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne12));
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11));
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12));
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13));
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne0));
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &ne1));
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &r2));
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &r3));
#else
kernel = backend_ctx->kernel_mul_mv_q5_K_f32;
if (backend_ctx->gpu_family == INTEL) {
nth0 = 16;
nth1 = 1;
ndst = 4;
} else if (backend_ctx->gpu_family == ADRENO) {
nth0 = 64;
nth1 = 1;
ndst = 4;
} else {
GGML_ASSERT(false && "TODO: Unknown GPU");
}
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offset1));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne12));
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb11));
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb12));
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb13));
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne0));
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne1));
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r2));
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r3));
#endif // GGML_OPENCL_SOA_Q
break;
}
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
#ifdef GGML_OPENCL_SOA_Q
kernel = backend_ctx->kernel_mul_mv_q6_K_f32_flat;
@@ -11987,10 +11606,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
} else if (src0t == GGML_TYPE_Q3_K) {
GGML_ASSERT(false && "not implemented");
} else if (src0t == GGML_TYPE_Q5_K) {
size_t global_work_size[] = {(size_t)(ne01+ndst*nth1-1)/(ndst*nth1)*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
GGML_ASSERT(false && "not implemented");
} else if (src0t == GGML_TYPE_Q6_K) {
size_t global_work_size[] = {(size_t)(ne01+ndst*nth1-1)/(ndst*nth1)*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};

View File

@@ -66,17 +66,6 @@ struct block_q4_K {
uchar q[QK_K / 2]; // nibbles / quants
};
//------------------------------------------------------------------------------
// block_q5_k
//------------------------------------------------------------------------------
struct block_q5_K {
half d; // delta
half dm; // min
uchar s[K_SCALE_SIZE];
uchar qh[QK_K / 8];
uchar qs[QK_K / 2]; // nibbles / quants
};
//------------------------------------------------------------------------------
// block_q6_K
//------------------------------------------------------------------------------
@@ -557,71 +546,6 @@ kernel void kernel_restore_block_q4_K_noshuffle(
}
}
//------------------------------------------------------------------------------
// kernel_convert_block_q5_K
// Convert the block_q5_K format to 5 separate arrays (AOS -> SOA).
// Each thread processes a super block.
//------------------------------------------------------------------------------
kernel void kernel_convert_block_q5_K(
global struct block_q5_K * src0,
global uchar * dst_q,
global uchar * dst_qh,
global uchar * dst_s,
global half * dst_d,
global half * dst_dm
) {
global struct block_q5_K * b = (global struct block_q5_K *) src0 + get_global_id(0);
global uchar * q = (global uchar *) dst_q + QK_K/2*get_global_id(0);
global uchar * qh = (global uchar *) dst_qh + QK_K/8*get_global_id(0);
global uchar * s = (global uchar *) dst_s + K_SCALE_SIZE*get_global_id(0);
global half * d = (global half *) dst_d + get_global_id(0);
global half * dm = (global half *) dst_dm + get_global_id(0);
*d = b->d;
*dm = b->dm;
for (int i = 0; i < QK_K/2; ++i) {
q[i] = b->qs[i];
}
for (int i = 0; i < QK_K/8; ++i) {
qh[i] = b->qh[i];
}
for (int i = 0; i < K_SCALE_SIZE; ++i) {
s[i] = b->s[i];
}
}
// Restore block_q5_K from flattened arrays.
// Each thread processes a super block.
kernel void kernel_restore_block_q5_K(
global uchar * src_q,
global uchar * src_qh,
global uchar * src_s,
global half * src_d,
global half * src_dm,
global struct block_q5_K * dst
) {
global struct block_q5_K * b = (global struct block_q5_K *) dst + get_global_id(0);
global uchar * q = (global uchar *) src_q + QK_K/2*get_global_id(0);
global uchar * qh = (global uchar *) src_qh + QK_K/8*get_global_id(0);
global uchar * s = (global uchar *) src_s + K_SCALE_SIZE*get_global_id(0);
global half * d = (global half *) src_d + get_global_id(0);
global half * dm = (global half *) src_dm + get_global_id(0);
b->d = *d;
b->dm = *dm;
for (int i = 0; i < QK_K/2; ++i) {
b->qs[i] = q[i];
}
for (int i = 0; i < QK_K/8; ++i) {
b->qh[i] = qh[i];
}
for (int i = 0; i < K_SCALE_SIZE; ++i) {
b->s[i] = s[i];
}
}
//------------------------------------------------------------------------------
// kernel_convert_block_q6_K
// Convert the block_q6_K format to 3 separate arrays (AOS -> SOA).

View File

@@ -1,192 +0,0 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#define LOAD_VEC_A 4
#define LOAD_VEC_B 4
#define BM 64
#define BN 64
#define BK 32
#define TM 4
#define TN 8
kernel void kernel_mul_mm_q5_k_f32_l4_lm(
global uchar4 * src0_q,
global uchar * src0_qh,
global uchar * src0_s,
global half * src0_d,
global half * src0_dm,
global float4 * src1,
ulong offset1,
global float * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
int ne11,
int ne12,
int stride_a,
int stride_b,
int stride_d,
int batch_stride_a,
int batch_stride_b,
int batch_stride_d,
int r2,
int r3
) {
src1 = (global float4*)((global char*)src1 + offset1);
dst = (global float *)((global char*)dst + offsetd);
local float buf_a[BM * BK];
local float buf_b[BN * BK];
const int batch_idx = get_global_id(2);
const int i13 = batch_idx / ne12;
const int i12 = batch_idx % ne12;
const int i03 = i13 / r3;
const int i02 = i12 / r2;
const int batch_idx_a = i03 * ne02 + i02;
const int ir = get_group_id(0);
const int ic = get_group_id(1);
const int tid = get_local_id(0);
const int th_r = tid % (BM / TM);
const int th_c = tid / (BM / TM);
const int loadr_a = get_local_id(0) % (BK / LOAD_VEC_A);
const int loadc_a = get_local_id(0) / (BK / LOAD_VEC_A);
const int loadr_b = get_local_id(0) % (BK / LOAD_VEC_B);
const int loadc_b = get_local_id(0) / (BK / LOAD_VEC_B);
const int loadstride_a = get_local_size(0) * LOAD_VEC_A / BK;
const int loadstride_b = get_local_size(0) * LOAD_VEC_B / BK;
int pos_a = (batch_idx_a * batch_stride_a + ir * BM * stride_a) / LOAD_VEC_A;
int pos_b = (batch_idx * batch_stride_b + ic * BN * stride_b) / LOAD_VEC_B;
float sums[TM * TN];
float cache_a[TM];
float cache_b[TN];
for (int i = 0; i < TM * TN; i++) {
sums[i] = 0.0f;
}
for (int block = 0; block < ne00; block += BK) {
for (int l = 0; l < BM; l += loadstride_a) {
if (ir*BM + loadc_a + l < ne01) {
int idx = pos_a + (loadc_a + l) * stride_a / LOAD_VEC_A + loadr_a;
int ib = idx / 64;
int iqs = (idx % 64) * 2;
int n = iqs / 32;
int b = (iqs % 32) / 16;
int is = 2 * n + b;
int qsi = n * 32 + (iqs % 16) * 2;
global uchar * scales = src0_s + ib * 12;
int scidx0 = (is < 4) ? is : (is + 4);
int scidx1 = (is < 4) ? is : (is - 4);
int scidxmask1 = (is < 4) ? 0x30 : 0xC0;
int scidxshift1 = (is < 4) ? 0 : 2;
int mbidx0 = is + 4;
int mbidx1 = (is < 4) ? is + 4 : is;
int mbidxmask0 = (is < 4) ? 0xF : 0xF0;
int mbidxshift0 = (is < 4) ? 0 : 4;
int mbidxmask1 = (is < 4) ? 0x30 : 0xC0;
int mbidxshift1 = (is < 4) ? 0 : 2;
uchar sc = (scales[scidx0] & 0xF) | ((scales[scidx1] & scidxmask1) >> scidxshift1);
uchar mbyte = ((scales[mbidx0] & mbidxmask0) >> mbidxshift0) | ((scales[mbidx1] & mbidxmask1) >> mbidxshift1);
float d = (float)src0_d[ib] * (float)sc;
float m = -(float)src0_dm[ib] * (float)mbyte;
int qh_base = (iqs % 16) * 2;
int bit_pos = 2*n + b;
uchar h0 = (src0_qh[ib*32 + qh_base + 0] >> bit_pos) & 1;
uchar h1 = (src0_qh[ib*32 + qh_base + 1] >> bit_pos) & 1;
uchar h2 = (src0_qh[ib*32 + qh_base + 2] >> bit_pos) & 1;
uchar h3 = (src0_qh[ib*32 + qh_base + 3] >> bit_pos) & 1;
global uchar4 * qs = src0_q + ib*32 + (qsi >> 2);
uchar4 q = *qs;
float4 v1 = (convert_float4((uchar4)(
((q.s0 >> (b * 4))&0x0F) | (h0 << 4),
((q.s1 >> (b * 4))&0x0F) | (h1 << 4),
((q.s2 >> (b * 4))&0x0F) | (h2 << 4),
((q.s3 >> (b * 4))&0x0F) | (h3 << 4)
)))*d + m;
buf_a[(loadr_a * LOAD_VEC_A + 0) * BM + loadc_a + l] = v1.s0;
buf_a[(loadr_a * LOAD_VEC_A + 1) * BM + loadc_a + l] = v1.s1;
buf_a[(loadr_a * LOAD_VEC_A + 2) * BM + loadc_a + l] = v1.s2;
buf_a[(loadr_a * LOAD_VEC_A + 3) * BM + loadc_a + l] = v1.s3;
} else {
buf_a[(loadr_a * LOAD_VEC_A + 0) * BM + loadc_a + l] = 0.0f;
buf_a[(loadr_a * LOAD_VEC_A + 1) * BM + loadc_a + l] = 0.0f;
buf_a[(loadr_a * LOAD_VEC_A + 2) * BM + loadc_a + l] = 0.0f;
buf_a[(loadr_a * LOAD_VEC_A + 3) * BM + loadc_a + l] = 0.0f;
}
}
for (int l = 0; l < BN; l += loadstride_b) {
if (ic*BN + loadc_b + l < ne11) {
int idx = pos_b + (loadc_b + l) * stride_b / LOAD_VEC_B + loadr_b;
buf_b[(loadr_b * LOAD_VEC_B + 0) * BN + loadc_b + l] = src1[idx].s0;
buf_b[(loadr_b * LOAD_VEC_B + 1) * BN + loadc_b + l] = src1[idx].s1;
buf_b[(loadr_b * LOAD_VEC_B + 2) * BN + loadc_b + l] = src1[idx].s2;
buf_b[(loadr_b * LOAD_VEC_B + 3) * BN + loadc_b + l] = src1[idx].s3;
} else {
buf_b[(loadr_b * LOAD_VEC_B + 0) * BN + loadc_b + l] = 0.0f;
buf_b[(loadr_b * LOAD_VEC_B + 1) * BN + loadc_b + l] = 0.0f;
buf_b[(loadr_b * LOAD_VEC_B + 2) * BN + loadc_b + l] = 0.0f;
buf_b[(loadr_b * LOAD_VEC_B + 3) * BN + loadc_b + l] = 0.0f;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
pos_a += BK / LOAD_VEC_A;
pos_b += BK / LOAD_VEC_B;
for (int i = 0; i < BK; i++) {
for (int j = 0; j < TM; j++) {
cache_a[j] = buf_a[(i) * BM + th_r * TM + j];
}
for (int j = 0; j < TN; j++) {
cache_b[j] = buf_b[(i) * BN + th_c * TN + j];
}
for (int cc = 0; cc < TN; cc++) {
for (int cr = 0; cr < TM; cr++) {
const int sums_idx = cc*TM + cr;
sums[sums_idx] = mad(cache_a[cr], cache_b[cc], sums[sums_idx]);
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
const int dr = ir * BM + th_r * TM;
const int dc = ic * BN + th_c * TN;
const int offsets = batch_idx * batch_stride_d;
for (int cc = 0; cc < TN; cc++) {
for (int cr = 0; cr < TM; cr++) {
if (dr + cr < ne01 && dc + cc < ne11) {
dst[offsets + (dc + cc) * stride_d + dr + cr] = sums[cc * TM + cr];
}
}
}
}

View File

@@ -1,187 +0,0 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#ifdef cl_intel_subgroups
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif
#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif
#define QK_K 256
#define K_SCALE_SIZE 12
typedef struct {
half d; // super-block scale for quantized scales
half dmin; // super-block scale for quantized mins
uchar scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
uchar qh[QK_K/8]; // quants, high bit (1 bit per value, packed 8 per byte)
uchar qs[QK_K/2]; // quants, low 4 bits (2 values per byte)
} block_q5_K;
#undef N_DST
#undef N_SIMDGROUP
#undef N_SIMDWIDTH
#ifdef INTEL_GPU
#define N_DST 4
#define N_SIMDGROUP 1
#define N_SIMDWIDTH 16
#elif defined(ADRENO_GPU)
#define N_DST 4
#define N_SIMDGROUP 1
#define N_SIMDWIDTH 64
#endif
#define BLOCK_STRIDE (N_SIMDWIDTH/8)
#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_16
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_mul_mv_q5_K_f32(
global char * src0,
int offset0,
global char * src1,
int offset1,
global char * dst,
int offsetd,
int ne00,
int ne01,
ulong nb01,
ulong nb02,
ulong nb03,
int ne12,
ulong nb11,
ulong nb12,
ulong nb13,
int ne0,
int ne1,
int r2,
int r3
) {
src0 = src0 + offset0;
src1 = src1 + offset1;
dst = dst + offsetd;
ushort kmask1 = 0x3f3f;
ushort kmask2 = 0x0f0f;
ushort kmask3 = 0xc0c0;
int ix = get_sub_group_local_id()/8; // super block index
int it = get_sub_group_local_id()%8; // block index (inside super block)
int iq = it/4; // 0 or 1 - first or second half of the super block
int ir = it%4; // 0...3 - block index in the half super block
int nb = ne00/QK_K;
int r0 = get_group_id(0);
int r1 = get_group_id(1);
int im = get_group_id(2);
int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
int i12 = im%ne12;
int i13 = im/ne12;
int offset_src0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
int offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
global block_q5_K * x = (global block_q5_K *) (src0 + offset_src0);
global float * y = (global float *) (src1 + offset_src1);
float yl[16];
float yh[16];
float sumf[N_DST] = {0.f};
float all_sum;
global float * y4 = y + ix * QK_K + 64 * iq + 8 * ir;
uchar u1_lo = (uchar)(1 << (2*iq));
uchar u2_lo = (uchar)(2 << (2*iq));
uchar u1_hi = (uchar)(1 << (2*iq + 4));
uchar u2_hi = (uchar)(2 << (2*iq + 4));
ushort sc16[4];
uchar * sc8 = (uchar *)sc16;
for (int ib = ix; ib < nb; ib += BLOCK_STRIDE) {
float4 sumy = {0.f, 0.f, 0.f, 0.f};
for (int i = 0; i < 8; ++i) {
yl[i+0] = y4[i+0];
sumy.s0 += yl[i+0];
yl[i+8] = y4[i+32];
sumy.s1 += yl[i+8];
yh[i+0] = y4[i+128];
sumy.s2 += yh[i+0];
yh[i+8] = y4[i+160];
sumy.s3 += yh[i+8];
}
global ushort * sc = (global ushort *)x[ib].scales + iq;
global ushort * q1 = (global ushort *)x[ib].qs + 16 * iq + 4 * ir;
global uchar * qh = x[ib].qh + 8 * ir;
global half * dh = &x[ib].d;
for (int row = 0; row < N_DST; row++) {
sc16[0] = sc[0] & kmask1;
sc16[1] = sc[2] & kmask1;
sc16[2] = ((sc[4] >> 0) & kmask2) | ((sc[0] & kmask3) >> 2);
sc16[3] = ((sc[4] >> 4) & kmask2) | ((sc[2] & kmask3) >> 2);
global ushort * q2 = q1 + 32;
float4 acc1 = {0.f, 0.f, 0.f, 0.f};
float4 acc2 = {0.f, 0.f, 0.f, 0.f};
for (int i = 0; i < 8; i += 2) {
acc1.s0 += yl[i+0] * ((q1[i/2] & 0x000F) + (qh[i+0] & u1_lo ? 16.f : 0.f));
acc1.s1 += yl[i+1] * ((q1[i/2] & 0x0F00) + (qh[i+1] & u1_lo ? 16.f*256.f : 0.f));
acc1.s2 += yl[i+8] * ((q1[i/2] & 0x00F0) + (qh[i+0] & u2_lo ? 16.f*16.f : 0.f));
acc1.s3 += yl[i+9] * ((q1[i/2] & 0xF000) + (qh[i+1] & u2_lo ? 16.f*4096.f: 0.f));
acc2.s0 += yh[i+0] * ((q2[i/2] & 0x000F) + (qh[i+0] & u1_hi ? 16.f : 0.f));
acc2.s1 += yh[i+1] * ((q2[i/2] & 0x0F00) + (qh[i+1] & u1_hi ? 16.f*256.f : 0.f));
acc2.s2 += yh[i+8] * ((q2[i/2] & 0x00F0) + (qh[i+0] & u2_hi ? 16.f*16.f : 0.f));
acc2.s3 += yh[i+9] * ((q2[i/2] & 0xF000) + (qh[i+1] & u2_hi ? 16.f*4096.f: 0.f));
}
float dall = dh[0];
float dmin = dh[1];
sumf[row] += dall * ((acc1.s0 + 1.f/256.f * acc1.s1) * sc8[0] +
(acc1.s2 + 1.f/256.f * acc1.s3) * sc8[1] * 1.f/16.f +
(acc2.s0 + 1.f/256.f * acc2.s1) * sc8[4] +
(acc2.s2 + 1.f/256.f * acc2.s3) * sc8[5] * 1.f/16.f) -
dmin * (sumy.s0 * sc8[2] + sumy.s1 * sc8[3] + sumy.s2 * sc8[6] + sumy.s3 * sc8[7]);
q1 += nb01/2;
sc += nb01/2;
dh += nb01/2;
qh += nb01;
}
y4 += BLOCK_STRIDE * QK_K;
}
global float * dst_f32 = (global float *) dst + im*ne0*ne1 + r1*ne0;
for (int row = 0; row < N_DST; ++row) {
all_sum = sub_group_reduce_add(sumf[row]);
if (first_row + row < ne01) {
if (get_sub_group_local_id() == 0) {
dst_f32[first_row + row] = all_sum;
}
}
}
}

View File

@@ -1,203 +0,0 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#ifdef cl_intel_subgroups
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif
#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif
//------------------------------------------------------------------------------
// block_q5_K
//------------------------------------------------------------------------------
#define QK_K 256
#define BLOCK_Q5K_SIZE 176
#define K_SCALE_SIZE 12
typedef struct {
half d; // super-block scale for quantized scales
half dmin; // super-block scale for quantized mins
uchar scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
uchar qh[QK_K/8]; // quants, high bit (1 bit per value, packed 8 per byte)
uchar qs[QK_K/2]; // quants, low 4 bits (2 values per byte)
} block_q5_K;
#undef N_DST
#undef N_SIMDGROUP
#undef N_SIMDWIDTH
#ifdef INTEL_GPU
#define N_DST 4
#define N_SIMDGROUP 1
#define N_SIMDWIDTH 16
#elif defined(ADRENO_GPU)
#define N_DST 16
#define N_SIMDGROUP 2
#define N_SIMDWIDTH 64
#endif
#undef BLOCK_STRIDE
// number of (super) blocks each subgroup processes
// each thread in a subgroup processes a block (32 weights)
#define BLOCK_STRIDE (N_SIMDWIDTH/8)
#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_16
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_mul_mv_q5_K_f32_flat(
global uchar * src0_q,
global uchar * src0_qh,
global uchar * src0_s,
global half * src0_d,
global half * src0_dm,
global char * src1,
int offset1,
global char * dst,
int offsetd,
int ne00,
int ne01,
ulong nb01,
ulong nb02,
ulong nb03,
int ne12,
ulong nb11,
ulong nb12,
ulong nb13,
int ne0,
int ne1,
int r2,
int r3
) {
src1 = src1 + offset1;
dst = dst + offsetd;
ushort kmask1 = 0x3f3f;
ushort kmask2 = 0x0f0f;
ushort kmask3 = 0xc0c0;
int ix = get_sub_group_local_id()/8;
int it = get_sub_group_local_id()%8;
int iq = it/4;
int ir = it%4;
int nb = ne00/QK_K;
int r0 = get_group_id(0);
int r1 = get_group_id(1);
int im = get_group_id(2);
int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
int i12 = im%ne12;
int i13 = im/ne12;
int offset_src0 = (first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03)/BLOCK_Q5K_SIZE;
uint blk = nb01 / BLOCK_Q5K_SIZE;
global uchar * blk_q = (global uchar *)src0_q + offset_src0*(QK_K/2);
global uchar * blk_qh = (global uchar *)src0_qh + offset_src0*(QK_K/8);
global uchar * blk_s = (global uchar *)src0_s + offset_src0*K_SCALE_SIZE;
global half * blk_d = (global half *)src0_d + offset_src0;
global half * blk_dm = (global half *)src0_dm + offset_src0;
int offset_src1 = r1*nb11 + (i12)*nb12 + (i13)*nb13;
global float * y = (global float *)(src1 + offset_src1);
float yl[16];
float yh[16];
float sumf[N_DST] = {0.f};
float all_sum;
global float * y4 = y + ix * QK_K + 64 * iq + 8 * ir;
uchar u1_lo = (uchar)(1 << (2*iq));
uchar u2_lo = (uchar)(2 << (2*iq));
uchar u1_hi = (uchar)(1 << (2*iq + 4));
uchar u2_hi = (uchar)(2 << (2*iq + 4));
ushort sc16[4];
uchar * sc8 = (uchar *)sc16;
for (int ib = ix; ib < nb; ib += BLOCK_STRIDE) {
float4 sumy = {0.f, 0.f, 0.f, 0.f};
for (int i = 0; i < 8; ++i) {
yl[i+0] = y4[i+0];
sumy.s0 += yl[i+0];
yl[i+8] = y4[i+32];
sumy.s1 += yl[i+8];
yh[i+0] = y4[i+128];
sumy.s2 += yh[i+0];
yh[i+8] = y4[i+160];
sumy.s3 += yh[i+8];
}
global ushort * q1 = (global ushort *)(blk_q + ib * (QK_K/2)) + (16 * iq + 4 * ir);
global uchar * qh = (global uchar *)(blk_qh + ib * (QK_K/8)) + 8 * ir;
global ushort * sc = (global ushort *)(blk_s + ib * K_SCALE_SIZE) + iq;
global half * d = blk_d + ib;
global half * dm = blk_dm + ib;
for (int row = 0; row < N_DST; row++) {
sc16[0] = sc[0] & kmask1;
sc16[1] = sc[2] & kmask1;
sc16[2] = ((sc[4] >> 0) & kmask2) | ((sc[0] & kmask3) >> 2);
sc16[3] = ((sc[4] >> 4) & kmask2) | ((sc[2] & kmask3) >> 2);
global ushort * q2 = q1 + 32;
float4 acc1 = {0.f, 0.f, 0.f, 0.f};
float4 acc2 = {0.f, 0.f, 0.f, 0.f};
for (int i = 0; i < 8; i += 2) {
acc1.s0 += yl[i+0] * ((q1[i/2] & 0x000F) + (qh[i+0] & u1_lo ? 16.f : 0.f));
acc1.s1 += yl[i+1] * ((q1[i/2] & 0x0F00) + (qh[i+1] & u1_lo ? 16.f*256.f : 0.f));
acc1.s2 += yl[i+8] * ((q1[i/2] & 0x00F0) + (qh[i+0] & u2_lo ? 16.f*16.f : 0.f));
acc1.s3 += yl[i+9] * ((q1[i/2] & 0xF000) + (qh[i+1] & u2_lo ? 16.f*4096.f: 0.f));
acc2.s0 += yh[i+0] * ((q2[i/2] & 0x000F) + (qh[i+0] & u1_hi ? 16.f : 0.f));
acc2.s1 += yh[i+1] * ((q2[i/2] & 0x0F00) + (qh[i+1] & u1_hi ? 16.f*256.f : 0.f));
acc2.s2 += yh[i+8] * ((q2[i/2] & 0x00F0) + (qh[i+0] & u2_hi ? 16.f*16.f : 0.f));
acc2.s3 += yh[i+9] * ((q2[i/2] & 0xF000) + (qh[i+1] & u2_hi ? 16.f*4096.f: 0.f));
}
float dall = *d;
float dmin = *dm;
sumf[row] += dall * ((acc1.s0 + 1.f/256.f * acc1.s1) * sc8[0] +
(acc1.s2 + 1.f/256.f * acc1.s3) * sc8[1] * 1.f/16.f +
(acc2.s0 + 1.f/256.f * acc2.s1) * sc8[4] +
(acc2.s2 + 1.f/256.f * acc2.s3) * sc8[5] * 1.f/16.f) -
dmin * (sumy.s0 * sc8[2] + sumy.s1 * sc8[3] + sumy.s2 * sc8[6] + sumy.s3 * sc8[7]);
q1 += blk*64;
qh += blk*32;
sc += blk*6;
d += blk;
dm += blk;
}
y4 += BLOCK_STRIDE * QK_K;
}
global float * dst_f32 = (global float *) dst + im*ne0*ne1 + r1*ne0;
for (int row = 0; row < N_DST; ++row) {
all_sum = sub_group_reduce_add(sumf[row]);
if (first_row + row < ne01) {
if (get_sub_group_local_id() == 0) {
dst_f32[first_row + row] = all_sum;
}
}
}
}

View File

@@ -412,8 +412,6 @@ static const ggml_backend_buffer_i ggml_backend_openvino_buffer_interface = {
/* .memset_tensor = */ ggml_backend_openvino_buffer_memset_tensor,
/* .set_tensor = */ ggml_backend_openvino_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_openvino_buffer_get_tensor,
/* .set_tensor_2d = */ NULL,
/* .get_tensor_2d = */ NULL,
/* .cpy_tensor = */ ggml_backend_openvino_buffer_cpy_tensor,
/* .clear = */ ggml_backend_openvino_buffer_clear,
/* .reset = */ NULL,
@@ -619,8 +617,6 @@ static const ggml_backend_i ggml_backend_openvino_interface = {
/* .free = */ ggml_backend_openvino_free,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .get_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ NULL,

View File

@@ -589,7 +589,6 @@ void ggml_opt_free(ggml_opt_context_t opt_ctx) {
ggml_backend_buffer_free(opt_ctx->buf_cpu);
ggml_free(opt_ctx->ctx_static);
ggml_free(opt_ctx->ctx_cpu);
ggml_free(opt_ctx->ctx_copy);
delete opt_ctx;
}

View File

@@ -706,8 +706,6 @@ static ggml_backend_buffer_i ggml_backend_rpc_buffer_interface = {
/* .memset_tensor = */ NULL,
/* .set_tensor = */ ggml_backend_rpc_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_rpc_buffer_get_tensor,
/* .set_tensor_2d = */ NULL,
/* .get_tensor_2d = */ NULL,
/* .cpy_tensor = */ ggml_backend_rpc_buffer_cpy_tensor,
/* .clear = */ ggml_backend_rpc_buffer_clear,
/* .reset = */ NULL,
@@ -896,8 +894,6 @@ static ggml_backend_i ggml_backend_rpc_interface = {
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .synchronize = */ ggml_backend_rpc_synchronize,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,

View File

@@ -488,7 +488,7 @@ static void dequantize_row_nvfp4_sycl(const void * vx, dst_t * y, const int64_t
const int nb = k / QK_NVFP4;
stream->parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
[=](sycl::nd_item<3> /*item_ct1*/) {
[=](sycl::nd_item<3> item_ct1) {
dequantize_block_nvfp4(vx, y, k);
});
}

View File

@@ -14,7 +14,6 @@
#define GGML_SYCL_DEQUANTIZE_HPP
#include "common.hpp"
#include "convert.hpp"
typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);
typedef void (*dequantize_kernel_t_reorder)(const void *d, const int64_t ib, const void *qs,

View File

@@ -355,7 +355,7 @@ static void acc_f32_sycl(const float *x, const float *y, float *dst,
const int num_blocks = (n_elements + SYCL_ACC_BLOCK_SIZE - 1) / SYCL_ACC_BLOCK_SIZE;
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_ACC_BLOCK_SIZE),
sycl::range<3>(1, 1, SYCL_ACC_BLOCK_SIZE)),
[=](sycl::nd_item<3> /*item_ct1*/) {
[=](sycl::nd_item<3> item_ct1) {
acc_f32(x, y, dst, n_elements, ne10, ne11, ne12, ne13, s1, s2, s3, offset);
});
}

View File

@@ -44,10 +44,6 @@ void ggml_sycl_flash_attn_ext_tile(ggml_backend_sycl_context & ctx, ggml_tensor
GGML_ASSERT(V->ne[0] == K->ne[0]);
ggml_sycl_flash_attn_ext_tile_case<256, 256>(ctx, dst);
} break;
case 512: {
GGML_ASSERT(V->ne[0] == K->ne[0]);
ggml_sycl_flash_attn_ext_tile_case<512, 512>(ctx, dst);
} break;
case 576: {
GGML_ASSERT(V->ne[0] == 512);
ggml_sycl_flash_attn_ext_tile_case<576, 512>(ctx, dst);

View File

@@ -67,12 +67,6 @@ static constexpr uint32_t ggml_sycl_fattn_tile_get_config_fp16(const int DKQ, co
GGML_SYCL_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(512, 512, 2, 64, 2, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(512, 512, 4, 128, 2, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(512, 512, 8, 256, 2, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(512, 512, 16, 256, 2, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(512, 512, 32, 256, 2, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(576, 512, 4, 128, 2, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(576, 512, 8, 256, 2, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2, 64, 64)
@@ -130,12 +124,6 @@ static constexpr uint32_t ggml_sycl_fattn_tile_get_config_fp32(const int DKQ, co
GGML_SYCL_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2, 32, 128)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2, 32, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(512, 512, 2, 128, 2, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(512, 512, 4, 128, 2, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(512, 512, 8, 256, 2, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(512, 512, 16, 256, 2, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(512, 512, 32, 256, 2, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(576, 512, 4, 128, 2, 32, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(576, 512, 8, 256, 2, 32, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2, 32, 64)
@@ -143,6 +131,134 @@ static constexpr uint32_t ggml_sycl_fattn_tile_get_config_fp32(const int DKQ, co
return 0;
}
static constexpr uint32_t ggml_sycl_fattn_tile_get_config_amd(const int DKQ, const int DV, const int ncols) {
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 40, 40, 2, 64, 2, 32, 40)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 40, 40, 4, 128, 2, 32, 40)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 40, 40, 8, 256, 2, 32, 40)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 40, 40, 16, 256, 2, 32, 40)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 40, 40, 32, 256, 2, 32, 40)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 40, 40, 64, 256, 2, 32, 40)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 64, 64, 2, 64, 3, 32, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 64, 64, 4, 128, 3, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 64, 64, 8, 128, 2, 32, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 64, 64, 16, 256, 2, 128, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 64, 64, 32, 256, 2, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 64, 64, 64, 256, 2, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 72, 72, 2, 64, 2, 32, 72)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 72, 72, 4, 128, 2, 32, 72)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 72, 72, 8, 256, 2, 32, 72)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 72, 72, 16, 256, 2, 32, 72)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 72, 72, 32, 256, 2, 32, 72)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 72, 72, 64, 256, 2, 32, 72)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 80, 80, 2, 64, 2, 32, 40)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 80, 80, 4, 128, 2, 32, 40)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 80, 80, 8, 256, 2, 32, 40)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 80, 80, 16, 256, 2, 32, 40)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 80, 80, 32, 256, 2, 32, 40)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 80, 80, 64, 256, 2, 32, 40)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 96, 96, 2, 64, 2, 32, 48)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 96, 96, 4, 128, 2, 32, 48)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 96, 96, 8, 256, 2, 32, 48)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 96, 96, 16, 256, 2, 32, 48)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 96, 96, 32, 256, 2, 32, 48)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 96, 96, 64, 256, 2, 32, 48)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(112, 112, 2, 64, 2, 32, 56)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(112, 112, 4, 128, 2, 32, 56)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(112, 112, 8, 256, 2, 32, 56)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(112, 112, 16, 256, 2, 32, 56)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(112, 112, 32, 256, 2, 32, 56)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(112, 112, 64, 256, 2, 32, 56)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(128, 128, 2, 256, 2, 128, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(128, 128, 4, 128, 2, 64, 128)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(128, 128, 8, 256, 2, 64, 128)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(128, 128, 16, 256, 2, 64, 128)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(128, 128, 32, 256, 2, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(128, 128, 64, 256, 2, 64, 32)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(256, 256, 2, 256, 2, 128, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(256, 256, 4, 256, 2, 64, 128)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(256, 256, 8, 256, 2, 64, 128)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2, 32, 128)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2, 32, 128)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(576, 512, 4, 128, 2, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(576, 512, 8, 256, 2, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(576, 512, 32, 512, 1, 128, 64)
return 0;
}
static constexpr uint32_t ggml_sycl_fattn_tile_get_config_amd_rdna(const int DKQ, const int DV, const int ncols) {
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 40, 40, 2, 64, 2, 32, 40)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 40, 40, 4, 128, 2, 32, 40)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 40, 40, 8, 256, 2, 32, 40)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 40, 40, 16, 256, 2, 32, 40)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 40, 40, 32, 256, 2, 32, 40)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 40, 40, 64, 256, 2, 32, 40)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 64, 64, 2, 64, 8, 32, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 64, 64, 4, 64, 8, 32, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 64, 64, 8, 128, 5, 128, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 64, 64, 16, 128, 5, 128, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 64, 64, 32, 128, 4, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 64, 64, 64, 128, 5, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 72, 72, 2, 64, 2, 32, 72)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 72, 72, 4, 128, 2, 32, 72)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 72, 72, 8, 256, 2, 32, 72)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 72, 72, 16, 256, 2, 32, 72)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 72, 72, 32, 256, 2, 32, 72)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 72, 72, 64, 256, 2, 32, 72)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 80, 80, 2, 64, 2, 32, 40)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 80, 80, 4, 128, 2, 32, 40)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 80, 80, 8, 256, 2, 32, 40)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 80, 80, 16, 256, 2, 32, 40)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 80, 80, 32, 256, 2, 32, 40)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 80, 80, 64, 256, 2, 32, 40)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 96, 96, 2, 64, 2, 32, 48)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 96, 96, 4, 128, 2, 32, 48)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 96, 96, 8, 256, 2, 32, 48)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 96, 96, 16, 256, 2, 32, 48)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 96, 96, 32, 256, 2, 32, 48)
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 96, 96, 64, 256, 2, 32, 48)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(112, 112, 2, 64, 2, 32, 56)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(112, 112, 4, 128, 2, 32, 56)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(112, 112, 8, 256, 2, 32, 56)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(112, 112, 16, 256, 2, 32, 56)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(112, 112, 32, 256, 2, 32, 56)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(112, 112, 64, 256, 2, 32, 56)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(128, 128, 2, 64, 8, 32, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(128, 128, 4, 128, 8, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(128, 128, 8, 128, 8, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(128, 128, 16, 256, 3, 128, 128)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(128, 128, 32, 256, 3, 128, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(128, 128, 64, 256, 3, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(256, 256, 2, 64, 8, 32, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(256, 256, 4, 128, 6, 32, 256)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(256, 256, 8, 128, 6, 32, 256)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 5, 32, 256)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 3, 64, 128)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(576, 512, 4, 128, 2, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(576, 512, 8, 256, 2, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 4, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(576, 512, 32, 256, 2, 128, 64)
return 0;
}
static constexpr uint32_t ggml_sycl_fattn_tile_get_config(const int DKQ, const int DV, const int ncols, const int cc) {
if(fast_fp16_available(cc))
return ggml_sycl_fattn_tile_get_config_fp16(DKQ, DV, ncols);
@@ -1177,16 +1293,6 @@ static void launch_fattn_tile_switch_ncols2(ggml_backend_sycl_context & ctx, ggm
launch_fattn_tile_switch_ncols1<DKQ, DV, 4, use_logit_softcap>(ctx, dst);
return;
}
// ncols2=2 and ncols2=1 fallbacks only for cases where ncols=2 config exists (DKQ == DV).
// For DKQ == 576, DV == 512 only GQA-optimized variants are implemented.
if constexpr (DKQ == DV) {
if (use_gqa_opt && gqa_ratio % 2 == 0) {
launch_fattn_tile_switch_ncols1<DKQ, DV, 2, use_logit_softcap>(ctx, dst);
return;
}
launch_fattn_tile_switch_ncols1<DKQ, DV, 1, use_logit_softcap>(ctx, dst);
return;
}
}
if constexpr (DV <= 256) {
@@ -1241,6 +1347,5 @@ extern DECL_FATTN_TILE_CASE( 96, 96);
extern DECL_FATTN_TILE_CASE(112, 112);
extern DECL_FATTN_TILE_CASE(128, 128);
extern DECL_FATTN_TILE_CASE(256, 256);
extern DECL_FATTN_TILE_CASE(512, 512);
extern DECL_FATTN_TILE_CASE(576, 512);

View File

@@ -664,11 +664,4 @@ EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q5_0)
EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q5_1)
EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q8_0)
EXTERN_DECL_FATTN_VEC_CASES(512, GGML_TYPE_F16)
EXTERN_DECL_FATTN_VEC_CASES(512, GGML_TYPE_Q4_0)
EXTERN_DECL_FATTN_VEC_CASES(512, GGML_TYPE_Q4_1)
EXTERN_DECL_FATTN_VEC_CASES(512, GGML_TYPE_Q5_0)
EXTERN_DECL_FATTN_VEC_CASES(512, GGML_TYPE_Q5_1)
EXTERN_DECL_FATTN_VEC_CASES(512, GGML_TYPE_Q8_0)
#endif // GGML_SYCL_FATTN_VEC_HPP

View File

@@ -34,7 +34,6 @@
FATTN_VEC_CASE( 64, type_K, type_V) \
FATTN_VEC_CASE(128, type_K, type_V) \
FATTN_VEC_CASE(256, type_K, type_V) \
FATTN_VEC_CASE(512, type_K, type_V) \
static void ggml_sycl_flash_attn_ext_vec(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
ggml_tensor * Q = dst->src[0];
@@ -142,7 +141,6 @@ static best_fattn_kernel ggml_sycl_get_best_fattn_kernel(const int device, const
case 128:
case 112:
case 256:
case 512:
if (V->ne[0] != K->ne[0]) {
return BEST_FATTN_KERNEL_NONE;
}
@@ -187,7 +185,7 @@ static best_fattn_kernel ggml_sycl_get_best_fattn_kernel(const int device, const
}
// For small batch sizes the vector kernel may be preferable over the kernels optimized for large batch sizes:
const bool can_use_vector_kernel = Q->ne[0] <= 512 && Q->ne[0] % 64 == 0 && K->ne[1] % FATTN_KQ_STRIDE == 0;
const bool can_use_vector_kernel = Q->ne[0] <= 256 && Q->ne[0] % 64 == 0 && K->ne[1] % FATTN_KQ_STRIDE == 0;
// Todo: Use the XMX kernel if possible:

Some files were not shown because too many files have changed in this diff Show More