mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-04-23 16:37:33 +03:00
Compare commits
29 Commits
b7867
...
gg/compare
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5b01d8575d | ||
|
|
1488339138 | ||
|
|
4927795810 | ||
|
|
971facc38e | ||
|
|
d9a2a4bcaa | ||
|
|
dfd6106c84 | ||
|
|
bbada8bfb9 | ||
|
|
13f3ebfae1 | ||
|
|
dabaa2e77a | ||
|
|
2e916f996a | ||
|
|
f3bc98890c | ||
|
|
c3b87cebff | ||
|
|
0562503154 | ||
|
|
83bcdf7217 | ||
|
|
b316895ff9 | ||
|
|
ecbf01d441 | ||
|
|
1025fd2c09 | ||
|
|
c7358ddf64 | ||
|
|
d284baf1b5 | ||
|
|
bd90fc74c3 | ||
|
|
ce38a4db47 | ||
|
|
4fdbc1e4db | ||
|
|
7b7ae857f6 | ||
|
|
84b0a98319 | ||
|
|
b45ef2702c | ||
|
|
f3dd7b8e68 | ||
|
|
eed25bc6b0 | ||
|
|
b33df266d0 | ||
|
|
3bcc990997 |
8
.github/workflows/build.yml
vendored
8
.github/workflows/build.yml
vendored
@@ -21,7 +21,8 @@ on:
|
||||
'**/*.m',
|
||||
'**/*.metal',
|
||||
'**/*.comp',
|
||||
'**/*.glsl'
|
||||
'**/*.glsl',
|
||||
'**/*.wgsl'
|
||||
]
|
||||
|
||||
pull_request:
|
||||
@@ -42,7 +43,8 @@ on:
|
||||
'**/*.m',
|
||||
'**/*.metal',
|
||||
'**/*.comp',
|
||||
'**/*.glsl'
|
||||
'**/*.glsl',
|
||||
'**/*.wgsl'
|
||||
]
|
||||
|
||||
concurrency:
|
||||
@@ -1371,7 +1373,7 @@ jobs:
|
||||
id: update_presets
|
||||
if: ${{ matrix.build == 'arm64-snapdragon' }}
|
||||
run: |
|
||||
cp docs/backend/hexagon/CMakeUserPresets.json .
|
||||
cp docs/backend/snapdragon/CMakeUserPresets.json .
|
||||
|
||||
- name: Build
|
||||
id: ndk_build
|
||||
|
||||
@@ -213,6 +213,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
||||
- [llama.vim](https://github.com/ggml-org/llama.vim) (MIT)
|
||||
- [LARS](https://github.com/abgulati/LARS) (AGPL)
|
||||
- [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL)
|
||||
- [LlamaLib](https://github.com/undreamai/LlamaLib) (Apache-2.0)
|
||||
- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
|
||||
- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)
|
||||
- [LMStudio](https://lmstudio.ai/) (proprietary)
|
||||
|
||||
@@ -75,6 +75,8 @@ add_library(${TARGET} STATIC
|
||||
ngram-cache.h
|
||||
ngram-map.cpp
|
||||
ngram-map.h
|
||||
ngram-mod.cpp
|
||||
ngram-mod.h
|
||||
peg-parser.cpp
|
||||
peg-parser.h
|
||||
preset.cpp
|
||||
|
||||
@@ -1301,7 +1301,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params, bool value) {
|
||||
params.kv_unified = value;
|
||||
}
|
||||
).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_BATCHED}));
|
||||
).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_BATCHED, LLAMA_EXAMPLE_BENCH}));
|
||||
add_opt(common_arg(
|
||||
{"--context-shift"},
|
||||
{"--no-context-shift"},
|
||||
@@ -3396,7 +3396,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
|
||||
add_opt(common_arg(
|
||||
{"--spec-type"}, "[none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v]",
|
||||
{"--spec-type"}, "[none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]",
|
||||
string_format("type of speculative decoding to use when no draft model is provided (default: %s)\n",
|
||||
common_speculative_type_to_str(params.speculative.type).c_str()),
|
||||
[](common_params & params, const std::string & value) {
|
||||
@@ -3410,6 +3410,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K;
|
||||
} else if (value == "ngram-map-k4v") {
|
||||
params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V;
|
||||
} else if (value == "ngram-mod") {
|
||||
params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MOD;
|
||||
} else {
|
||||
throw std::invalid_argument("unknown speculative decoding type without draft model");
|
||||
}
|
||||
|
||||
172
common/chat.cpp
172
common/chat.cpp
@@ -771,10 +771,12 @@ static std::string apply(
|
||||
|
||||
nlohmann::ordered_json inp = nlohmann::ordered_json{
|
||||
{"messages", messages_override.has_value() ? *messages_override : inputs.messages},
|
||||
{"tools", tools_override.has_value() ? *tools_override : inputs.tools},
|
||||
{"bos_token", tmpl.bos_token()},
|
||||
{"eos_token", tmpl.eos_token()},
|
||||
};
|
||||
if (tools_override.has_value() || !inputs.tools.empty()) {
|
||||
inp["tools"] = tools_override.has_value() ? *tools_override : inputs.tools;
|
||||
}
|
||||
if (inputs.extra_context.is_object()) {
|
||||
// TODO: do we need to merge, or replacing is fine?
|
||||
for (const auto & [k, v] : inputs.extra_context.items()) {
|
||||
@@ -790,9 +792,6 @@ static std::string apply(
|
||||
if (inputs.add_generation_prompt) {
|
||||
inp["add_generation_prompt"] = true;
|
||||
}
|
||||
if (inp["tools"].is_null()) {
|
||||
inp["tools"] = json::array();
|
||||
}
|
||||
|
||||
jinja::global_from_json(ctx, inp, inputs.mark_input);
|
||||
|
||||
@@ -2219,12 +2218,11 @@ static common_chat_params common_chat_params_init_glm_4_5(const common_chat_temp
|
||||
static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
|
||||
LOG_DBG("%s\n", __func__);
|
||||
common_chat_params data;
|
||||
const std::optional<json> tools_override = json();
|
||||
const std::optional<json> additional_context = json {
|
||||
{"datetime", format_time(inputs.now, "%b %d %Y %H:%M:%S GMT")},
|
||||
{"functions", json(inputs.tools.empty() ? "" : inputs.tools.dump(2))},
|
||||
};
|
||||
data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, tools_override, additional_context);
|
||||
data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override =*/ std::nullopt, additional_context);
|
||||
if (inputs.tools.is_array() && !inputs.tools.empty()) {
|
||||
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
|
||||
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
||||
@@ -2573,20 +2571,165 @@ static common_chat_params common_chat_params_init_granite(const common_chat_temp
|
||||
static common_chat_params common_chat_params_init_solar_open(const common_chat_template & tmpl, const struct templates_params & inputs) {
|
||||
common_chat_params data;
|
||||
|
||||
// TODO: Reasoning effort
|
||||
json additional_context = {};
|
||||
// Copy `reasoning_content` to `reasoning`
|
||||
auto adjusted_messages = json::array();
|
||||
for (const auto & msg : inputs.messages) {
|
||||
if (msg.contains("reasoning_content") && msg.at("reasoning_content").is_string()) {
|
||||
auto adjusted_message = msg;
|
||||
adjusted_message["reasoning"] = msg.at("reasoning_content");
|
||||
adjusted_message.erase("reasoning_content");
|
||||
adjusted_messages.push_back(adjusted_message);
|
||||
} else {
|
||||
adjusted_messages.push_back(msg);
|
||||
}
|
||||
}
|
||||
|
||||
data.prompt = apply(tmpl, inputs, std::nullopt, std::nullopt, additional_context);
|
||||
data.format = COMMON_CHAT_FORMAT_SOLAR_OPEN;
|
||||
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
|
||||
auto include_grammar = true;
|
||||
|
||||
auto prompt = apply(tmpl, inputs, /* messages_override= */ adjusted_messages);
|
||||
|
||||
// Check if we need to replace the flush token with end token during inference and without generation prompt.
|
||||
if (inputs.is_inference && !inputs.add_generation_prompt) {
|
||||
static constexpr std::string_view return_token = "<|flush|>";
|
||||
static constexpr std::string_view end_token = "<|end|>";
|
||||
if (size_t pos = prompt.rfind(return_token); pos != std::string::npos) {
|
||||
prompt.replace(pos, return_token.length(), end_token);
|
||||
}
|
||||
}
|
||||
|
||||
data.prompt = prompt;
|
||||
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
|
||||
data.preserved_tokens = {
|
||||
"<|think|>",
|
||||
"<|content|>",
|
||||
"<|begin|>",
|
||||
"<|end|>",
|
||||
"<|tool_calls|>",
|
||||
"<|tool_call:begin|>",
|
||||
"<|tool_call:end|>",
|
||||
"<|tool_call:name|>",
|
||||
"<|tool_call:args|>",
|
||||
};
|
||||
|
||||
// TODO: Tool calling
|
||||
auto parser = build_chat_peg_native_parser([&](common_chat_peg_native_builder & p) {
|
||||
auto lit_think = p.atomic(p.literal("<|think|>"));
|
||||
auto lit_assistant_begin = p.atomic(p.literal("<|begin|>assistant"));
|
||||
auto lit_content = p.atomic(p.literal("<|content|>"));
|
||||
auto lit_end = p.atomic(p.literal("<|end|>"));
|
||||
auto parser_until_end = p.until("<|end|>");
|
||||
|
||||
// reasoning <- "<|think|>" (!"<|end|>" .)*
|
||||
auto parser_reasoning = p.rule("reasoning", lit_think + p.reasoning(parser_until_end));
|
||||
|
||||
// content <- "<|content|>" (!"<|end|>" .)*
|
||||
auto parser_content = p.rule("content", lit_content + p.content(parser_until_end));
|
||||
|
||||
// wrap_choice(items) <- item-choice wrapped*
|
||||
// item-choice <- items[0] / ... / items[n]
|
||||
// wrapped <- "<|end|><|begin|>assistant" item-choice
|
||||
auto wrap_choice = [&](const std::vector<common_peg_parser> & items) {
|
||||
auto choice = p.choice(items);
|
||||
return choice + p.zero_or_more(lit_end + lit_assistant_begin + choice);
|
||||
};
|
||||
|
||||
// wrap_seq(items) <- item[0] "<|end|><|begin|>assistant" item[1] ...
|
||||
auto wrap_seq = [&](const std::vector<common_peg_parser> & items) {
|
||||
auto seq = p.sequence();
|
||||
for (auto i = 0u; i < items.size(); i++) {
|
||||
if (i == 0) {
|
||||
seq += items[i];
|
||||
continue;
|
||||
}
|
||||
seq += lit_end + lit_assistant_begin + items[i];
|
||||
}
|
||||
return seq;
|
||||
};
|
||||
|
||||
// Response format parser
|
||||
if (inputs.json_schema.is_object() && !inputs.json_schema.empty()) {
|
||||
auto parser_response_format = lit_content + p.content(p.schema(p.json(), "response-format", inputs.json_schema));
|
||||
return p.choice({
|
||||
wrap_seq({parser_reasoning, parser_response_format}),
|
||||
wrap_seq({parser_response_format})
|
||||
});
|
||||
}
|
||||
|
||||
auto lit_tool_call_begin = p.literal("<|tool_call:begin|>");
|
||||
auto lit_tool_call_name = p.literal("<|tool_call:name|>");
|
||||
auto lit_tool_call_args = p.literal("<|tool_call:args|>");
|
||||
auto lit_tool_call_end = p.literal("<|tool_call:end|>");
|
||||
|
||||
// Tool call parser
|
||||
if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) {
|
||||
auto parser_tool_call = p.choice();
|
||||
foreach_function(inputs.tools, [&](const json & tool) {
|
||||
const auto & function = tool.at("function");
|
||||
std::string name = function.at("name");
|
||||
const auto & schema = function.at("parameters");
|
||||
|
||||
// tool(name, schema) <- name "<|tool_call:args|>" schema
|
||||
parser_tool_call |= p.rule("tool-" + name,
|
||||
p.atomic(p.tool_name(p.literal(name)) + lit_tool_call_args)
|
||||
+ p.tool_args(p.schema(p.json(), "tool-" + name + "-schema", schema)));
|
||||
});
|
||||
|
||||
auto min_calls = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED ? 1 : 0;
|
||||
auto max_calls = inputs.parallel_tool_calls ? -1 : 1;
|
||||
|
||||
// tool-calls <- "<|tool_calls|>" tool-call+
|
||||
// tool-call <- "<|tool_call:begin|> call-id "<|tool_call:name|>" &([^<]+ "<|tool_call:args|>") tool-choice "<|tool_call:end|>"
|
||||
// call-id <- [a-zA-Z0-9_-]+
|
||||
// tool-choice <- tool(t[0].name, t[0].schema) / ... / tool(t[n].name, t[n].schema)
|
||||
auto parser_tool_calls = p.trigger_rule("tool-calls",
|
||||
p.atomic(p.literal("<|tool_calls|>"))
|
||||
+ p.repeat(
|
||||
p.tool_open(
|
||||
lit_tool_call_begin
|
||||
+ p.tool_id(p.chars("[a-zA-Z0-9_-]", 1, -1))
|
||||
+ lit_tool_call_name
|
||||
+ p.peek(p.chars("[^<]", 1, -1) + lit_tool_call_args))
|
||||
+ parser_tool_call
|
||||
+ p.tool_close(lit_tool_call_end),
|
||||
/* min = */ 1,
|
||||
/* max = */ max_calls));
|
||||
|
||||
if (min_calls == 1) {
|
||||
// If required, then try any combination of the reasoning, content, and tool call
|
||||
return p.choice({
|
||||
wrap_seq({parser_reasoning, parser_content, parser_tool_calls}),
|
||||
wrap_seq({parser_reasoning, parser_tool_calls}),
|
||||
wrap_seq({parser_content, parser_tool_calls}),
|
||||
wrap_seq({parser_tool_calls})
|
||||
});
|
||||
}
|
||||
|
||||
return wrap_choice({parser_reasoning, parser_content, parser_tool_calls});
|
||||
}
|
||||
|
||||
// Content only parser
|
||||
include_grammar = false;
|
||||
return wrap_choice({parser_reasoning, parser_content});
|
||||
});
|
||||
|
||||
data.parser = parser.save();
|
||||
|
||||
if (include_grammar) {
|
||||
data.grammar_lazy = has_tools && inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
|
||||
|
||||
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
||||
foreach_function(inputs.tools, [&](const json & tool) {
|
||||
const auto & function = tool.at("function");
|
||||
auto schema = function.at("parameters");
|
||||
builder.resolve_refs(schema);
|
||||
});
|
||||
parser.build_grammar(builder, data.grammar_lazy);
|
||||
});
|
||||
|
||||
data.grammar_triggers = {
|
||||
{COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|tool_calls|>"}
|
||||
};
|
||||
}
|
||||
|
||||
return data;
|
||||
}
|
||||
@@ -3043,6 +3186,13 @@ static common_chat_params common_chat_templates_apply_jinja(
|
||||
return common_chat_params_init_apriel_1_5(tmpl, params);
|
||||
}
|
||||
|
||||
// Solar Open
|
||||
if (src.find("<|tool_response:begin|>") != std::string::npos &&
|
||||
src.find("<|tool_response:name|>") != std::string::npos &&
|
||||
src.find("<|tool_response:result|>") != std::string::npos) {
|
||||
return common_chat_params_init_solar_open(tmpl, params);
|
||||
}
|
||||
|
||||
// Use generic handler when mixing tools + JSON schema.
|
||||
// TODO: support that mix in handlers below.
|
||||
if ((params.tools.is_array() && params.json_schema.is_object())) {
|
||||
|
||||
@@ -171,6 +171,7 @@ enum common_speculative_type {
|
||||
COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE, // simple self-speculative decoding
|
||||
COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K, // self-speculative decoding with n-gram keys only
|
||||
COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, // self-speculative decoding with n-gram keys and 4 m-gram values
|
||||
COMMON_SPECULATIVE_TYPE_NGRAM_MOD,
|
||||
COMMON_SPECULATIVE_TYPE_NGRAM_CACHE, // self-speculative decoding with 3-level n-gram cache
|
||||
COMMON_SPECULATIVE_TYPE_COUNT // number of types, unknown type
|
||||
};
|
||||
@@ -252,6 +253,8 @@ struct common_params_model {
|
||||
std::string name = ""; // in format <user>/<model>[:<tag>] (tag is optional) // NOLINT
|
||||
};
|
||||
|
||||
struct common_ngram_mod;
|
||||
|
||||
struct common_params_speculative {
|
||||
common_speculative_type type = COMMON_SPECULATIVE_TYPE_NONE; // type of speculative decoding
|
||||
|
||||
@@ -269,6 +272,8 @@ struct common_params_speculative {
|
||||
uint16_t ngram_check_rate = 1; // check rate for ngram lookup
|
||||
uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed
|
||||
|
||||
std::shared_ptr<common_ngram_mod> ngram_mod;
|
||||
|
||||
std::string lookup_cache_static; // path of static ngram cache file for lookup decoding // NOLINT
|
||||
std::string lookup_cache_dynamic; // path of dynamic ngram cache file for lookup decoding // NOLINT
|
||||
|
||||
|
||||
@@ -1028,6 +1028,16 @@ const func_builtins & value_none_t::get_builtins() const {
|
||||
{"safe", [](const func_args &) -> value {
|
||||
return mk_val<value_string>("None");
|
||||
}},
|
||||
{"strip", [](const func_args &) -> value {
|
||||
return mk_val<value_string>("None");
|
||||
}},
|
||||
{"items", empty_value_fn<value_array>},
|
||||
{"map", empty_value_fn<value_array>},
|
||||
{"reject", empty_value_fn<value_array>},
|
||||
{"rejectattr", empty_value_fn<value_array>},
|
||||
{"select", empty_value_fn<value_array>},
|
||||
{"selectattr", empty_value_fn<value_array>},
|
||||
{"unique", empty_value_fn<value_array>},
|
||||
};
|
||||
return builtins;
|
||||
}
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
#include <set>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
namespace jinja {
|
||||
|
||||
@@ -7,6 +7,21 @@
|
||||
#include <cstdio>
|
||||
#include <sstream>
|
||||
|
||||
// Print the values of a sublist of `llama_tokens & inp` to a string in the form [v0, v1, v2, ...].
|
||||
static std::string common_tokens_to_str(const llama_tokens & inp, size_t start, size_t length) {
|
||||
std::ostringstream oss;
|
||||
oss << '[';
|
||||
for (size_t i = 0; i < length; ++i) {
|
||||
if (i > 0) {
|
||||
oss << ", ";
|
||||
}
|
||||
oss << inp[start + i];
|
||||
}
|
||||
oss << ']';
|
||||
return oss.str();
|
||||
}
|
||||
|
||||
|
||||
// n-gram simple
|
||||
//
|
||||
|
||||
@@ -100,8 +115,6 @@ llama_tokens common_ngram_simple_draft(
|
||||
// maximum number of counted values of a ngram map value.
|
||||
#define COMMON_NGRAM_MAX_VALUE_COUNT 16380
|
||||
|
||||
static std::string common_tokens_to_str(const llama_tokens & inp, size_t start, size_t length);
|
||||
|
||||
void common_ngram_map_draft(common_ngram_map & map,
|
||||
const llama_tokens & inp, llama_token sampled,
|
||||
llama_tokens & draft) {
|
||||
@@ -347,21 +360,3 @@ void common_ngram_map_accept(common_ngram_map & map, uint16_t n_accepted) {
|
||||
n_accepted, curr_value.n_accepted);
|
||||
curr_value.n_accepted = n_accepted;
|
||||
}
|
||||
|
||||
// Helper functions.
|
||||
//
|
||||
|
||||
// Print the values of a sublist of `llama_tokens & inp` to a string in the form [v0, v1, v2, ...].
|
||||
std::string common_tokens_to_str(const llama_tokens & inp, size_t start, size_t length) {
|
||||
std::ostringstream oss;
|
||||
oss << '[';
|
||||
for (size_t i = 0; i < length; ++i) {
|
||||
if (i > 0) {
|
||||
oss << ", ";
|
||||
}
|
||||
oss << inp[start + i];
|
||||
}
|
||||
oss << ']';
|
||||
return oss.str();
|
||||
}
|
||||
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
//
|
||||
|
||||
#include "llama.h"
|
||||
#include "common.h"
|
||||
|
||||
#include <vector>
|
||||
|
||||
|
||||
60
common/ngram-mod.cpp
Normal file
60
common/ngram-mod.cpp
Normal file
@@ -0,0 +1,60 @@
|
||||
#include "ngram-mod.h"
|
||||
|
||||
//
|
||||
// common_ngram_mod
|
||||
//
|
||||
|
||||
common_ngram_mod::common_ngram_mod(uint16_t n, size_t size) : n(n), used(0) {
|
||||
entries.resize(size);
|
||||
|
||||
reset();
|
||||
}
|
||||
|
||||
size_t common_ngram_mod::idx(const entry_t * tokens) const {
|
||||
size_t res = 0;
|
||||
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
res = res*6364136223846793005ULL + tokens[i];
|
||||
}
|
||||
|
||||
res = res % entries.size();
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
void common_ngram_mod::add(const entry_t * tokens) {
|
||||
const size_t i = idx(tokens);
|
||||
|
||||
if (entries[i] == EMPTY) {
|
||||
used++;
|
||||
}
|
||||
|
||||
entries[i] = tokens[n];
|
||||
}
|
||||
|
||||
common_ngram_mod::entry_t common_ngram_mod::get(const entry_t * tokens) const {
|
||||
const size_t i = idx(tokens);
|
||||
|
||||
return entries[i];
|
||||
}
|
||||
|
||||
void common_ngram_mod::reset() {
|
||||
std::fill(entries.begin(), entries.end(), EMPTY);
|
||||
used = 0;
|
||||
}
|
||||
|
||||
size_t common_ngram_mod::get_n() const {
|
||||
return n;
|
||||
}
|
||||
|
||||
size_t common_ngram_mod::get_used() const {
|
||||
return used;
|
||||
}
|
||||
|
||||
size_t common_ngram_mod::size() const {
|
||||
return entries.size();
|
||||
}
|
||||
|
||||
size_t common_ngram_mod::size_bytes() const {
|
||||
return entries.size() * sizeof(entries[0]);
|
||||
}
|
||||
38
common/ngram-mod.h
Normal file
38
common/ngram-mod.h
Normal file
@@ -0,0 +1,38 @@
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <vector>
|
||||
#include <cstddef>
|
||||
|
||||
//
|
||||
// common_ngram_mod
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/19164
|
||||
//
|
||||
|
||||
// basic n-gram hasher
|
||||
struct common_ngram_mod {
|
||||
using entry_t = int32_t;
|
||||
|
||||
static constexpr entry_t EMPTY = -1;
|
||||
|
||||
common_ngram_mod(uint16_t n, size_t size);
|
||||
|
||||
size_t idx(const entry_t * tokens) const;
|
||||
void add(const entry_t * tokens);
|
||||
entry_t get(const entry_t * tokens) const; // return -1 if not found
|
||||
|
||||
void reset();
|
||||
|
||||
size_t get_n() const;
|
||||
size_t get_used() const;
|
||||
|
||||
size_t size() const;
|
||||
size_t size_bytes() const;
|
||||
|
||||
private:
|
||||
size_t n; // ngram size to hash
|
||||
|
||||
size_t used;
|
||||
|
||||
std::vector<entry_t> entries;
|
||||
};
|
||||
@@ -6,6 +6,7 @@
|
||||
#include "log.h"
|
||||
#include "ngram-cache.h"
|
||||
#include "ngram-map.h"
|
||||
#include "ngram-mod.h"
|
||||
#include "sampling.h"
|
||||
|
||||
#include <algorithm>
|
||||
@@ -23,6 +24,7 @@ const std::vector<enum common_speculative_type> common_speculative_types = {
|
||||
COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE,
|
||||
COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K,
|
||||
COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V,
|
||||
COMMON_SPECULATIVE_TYPE_NGRAM_MOD,
|
||||
COMMON_SPECULATIVE_TYPE_NGRAM_CACHE
|
||||
};
|
||||
|
||||
@@ -33,6 +35,7 @@ const std::map<std::string, enum common_speculative_type> common_speculative_typ
|
||||
{"ngram_simple", COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE},
|
||||
{"ngram_map_k", COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K},
|
||||
{"ngram_map_k4v", COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V},
|
||||
{"ngram_mod", COMMON_SPECULATIVE_TYPE_NGRAM_MOD},
|
||||
{"ngram_cache", COMMON_SPECULATIVE_TYPE_NGRAM_CACHE}
|
||||
};
|
||||
|
||||
@@ -110,6 +113,8 @@ static bool common_speculative_are_compatible(
|
||||
struct common_speculative_state {
|
||||
const enum common_speculative_type type;
|
||||
|
||||
// TODO: rename to n_call_draft, n_gen_drafts, n_acc_drafts, n_gen_tokens, n_acc_tokens
|
||||
// TODO: add n_call_begin, n_call_accept
|
||||
size_t drafts_call_count = 0; // number of times this implementation was called.
|
||||
size_t drafts_generated_count = 0; // number of times a draft or part was generated by this implementation.
|
||||
size_t drafts_accepted_count = 0; // number of times a draft or part was accepted by the target model.
|
||||
@@ -119,6 +124,8 @@ struct common_speculative_state {
|
||||
// TODO: track performance of most recent calls
|
||||
const bool gen_perf = true; // whether to generate performance stats.
|
||||
|
||||
// TODO: rename to t_draft_us
|
||||
// TODO: add t_begin_us, t_accept_us
|
||||
int64_t gen_duration_us = 0; // total time spent in this implementation in microseconds.
|
||||
|
||||
common_speculative_state(enum common_speculative_type type) : type(type) {}
|
||||
@@ -509,6 +516,132 @@ struct common_speculative_state_ngram_map_k : public common_speculative_state {
|
||||
}
|
||||
};
|
||||
|
||||
struct common_speculative_state_ngram_mod : public common_speculative_state {
|
||||
common_ngram_mod & mod;
|
||||
|
||||
// the last position in the prompt that was added to the ngram container
|
||||
size_t i_last = 0;
|
||||
|
||||
// length of the last drafted n‑gram (number of tokens returned by draft)
|
||||
size_t n_draft_last = 0;
|
||||
|
||||
// consecutive accept rounds with low acceptance fraction (< 0.5)
|
||||
int n_low = 0;
|
||||
|
||||
// enable trace logging if LLAMA_TRACE is set
|
||||
const bool verbose;
|
||||
|
||||
common_speculative_state_ngram_mod(enum common_speculative_type type, common_ngram_mod & mod)
|
||||
: common_speculative_state(type), mod(mod), verbose(std::getenv("LLAMA_TRACE") != nullptr) {
|
||||
static_assert(sizeof(llama_token) == sizeof(common_ngram_mod::entry_t));
|
||||
}
|
||||
|
||||
void begin(const llama_tokens & prompt) override {
|
||||
i_last = 0;
|
||||
|
||||
n_draft_last = 0;
|
||||
|
||||
const size_t n = mod.get_n();
|
||||
|
||||
if (prompt.size() < n) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < prompt.size() - n; ++i) {
|
||||
mod.add(prompt.data() + i);
|
||||
}
|
||||
|
||||
i_last = prompt.size() - n;
|
||||
|
||||
const double f = (double)mod.get_used() / (double)mod.size();
|
||||
LOG_INF("%s: ngram_mod occupancy = %zu/%zu (%.2f)\n", __func__, mod.get_used(), mod.size(), f);
|
||||
|
||||
constexpr double f_thold = 0.25;
|
||||
if (f > f_thold) {
|
||||
LOG_WRN("%s: ngram_mod occupancy %.2f exceeds threshold (%.2f) - resetting\n", __func__, f, f_thold);
|
||||
|
||||
mod.reset();
|
||||
}
|
||||
}
|
||||
|
||||
void draft(
|
||||
const common_params_speculative & params,
|
||||
const llama_tokens & prompt_tgt,
|
||||
llama_token id_last,
|
||||
llama_tokens & result) override {
|
||||
GGML_UNUSED(params);
|
||||
|
||||
n_draft_last = 0;
|
||||
|
||||
const size_t cur_len = prompt_tgt.size();
|
||||
if (cur_len < mod.get_n()) {
|
||||
return;
|
||||
}
|
||||
|
||||
const size_t n = mod.get_n();
|
||||
|
||||
// add new ngrams in chunks
|
||||
if (i_last + 32 < cur_len) {
|
||||
for (size_t i = i_last; i < cur_len - n; ++i) {
|
||||
mod.add(prompt_tgt.data() + i);
|
||||
}
|
||||
|
||||
i_last = cur_len - n;
|
||||
}
|
||||
|
||||
result.resize(n + params.n_max);
|
||||
for (size_t i = 0; i < n - 1; ++i) {
|
||||
result[i] = prompt_tgt[cur_len - n + 1 + i];
|
||||
}
|
||||
result[n - 1] = id_last;
|
||||
|
||||
for (int i = 0; i < params.n_max; ++i) {
|
||||
const llama_token token = mod.get(result.data() + i);
|
||||
if (token == common_ngram_mod::EMPTY) {
|
||||
if (i < params.n_min) {
|
||||
result.clear();
|
||||
return;
|
||||
}
|
||||
|
||||
result.resize(n + i);
|
||||
break;
|
||||
}
|
||||
result[n + i] = token;
|
||||
}
|
||||
|
||||
// only return the m tokens that were drafted
|
||||
for (size_t i = 0; n + i < result.size(); ++i) {
|
||||
result[i] = result[n + i];
|
||||
}
|
||||
result.resize(result.size() - n);
|
||||
|
||||
// store length of drafted n‑gram for later acceptance analysis
|
||||
n_draft_last = result.size();
|
||||
}
|
||||
|
||||
void accept(uint16_t n_accepted) override {
|
||||
if (verbose) {
|
||||
LOG_INF("%s: accepted %d tokens from %zu drafted tokens\n", __func__, n_accepted, n_draft_last);
|
||||
}
|
||||
|
||||
// compute acceptance fraction if we have a recorded draft length
|
||||
if (n_draft_last > 0) {
|
||||
const double f_acc = (double)n_accepted / (double)n_draft_last;
|
||||
if (f_acc < 0.5) {
|
||||
n_low++;
|
||||
if (n_low >= 3) {
|
||||
LOG_WRN("%s: low acceptance streak (%d) – resetting ngram_mod\n", __func__, n_low);
|
||||
|
||||
mod.reset();
|
||||
n_low = 0;
|
||||
}
|
||||
} else {
|
||||
n_low = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
struct common_speculative_state_ngram_cache : public common_speculative_state {
|
||||
uint16_t n_draft;
|
||||
bool save_dynamic;
|
||||
@@ -650,6 +783,7 @@ std::string common_speculative_type_to_str(enum common_speculative_type type) {
|
||||
case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE: return "ngram_simple";
|
||||
case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K: return "ngram_map_k";
|
||||
case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V: return "ngram_map_k4v";
|
||||
case COMMON_SPECULATIVE_TYPE_NGRAM_MOD: return "ngram_mod";
|
||||
case COMMON_SPECULATIVE_TYPE_NGRAM_CACHE: return "ngram_cache";
|
||||
default: return "unknown";
|
||||
}
|
||||
@@ -666,8 +800,8 @@ enum common_speculative_type common_speculative_type_from_name(const std::string
|
||||
// initialization of the speculative decoding system
|
||||
//
|
||||
common_speculative * common_speculative_init(
|
||||
const common_params_speculative & params,
|
||||
llama_context * ctx_tgt) {
|
||||
common_params_speculative & params,
|
||||
llama_context * ctx_tgt) {
|
||||
llama_context * ctx_dft = nullptr;
|
||||
if (params.model_dft) {
|
||||
ctx_dft = llama_init_from_model(params.model_dft, params.cparams_dft);
|
||||
@@ -687,6 +821,7 @@ common_speculative * common_speculative_init(
|
||||
bool has_ngram_simple = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE);
|
||||
bool has_ngram_map_k = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K);
|
||||
bool has_ngram_map_k4v = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V);
|
||||
bool has_ngram_mod = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_MOD);
|
||||
|
||||
// In a more complex implementation we could use the same implementation but with different parameters.
|
||||
// This was initially used in PR-18471 but removed to simplify the code.
|
||||
@@ -701,6 +836,22 @@ common_speculative * common_speculative_init(
|
||||
// This implementation can guess tokens with high acceptance rate but is more expensive.
|
||||
configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, params));
|
||||
}
|
||||
if (has_ngram_mod) {
|
||||
// shared instance for all speculative decoding contexts
|
||||
if (!params.ngram_mod) {
|
||||
params.ngram_mod = std::make_shared<common_ngram_mod>(params.ngram_size_n, 4*1024*1024);
|
||||
|
||||
LOG_INF("%s: initialized ngram_mod with n=%d, size=%zu (%.3f MB)\n", __func__,
|
||||
params.ngram_size_n, params.ngram_mod->size(),
|
||||
(float)(params.ngram_mod->size_bytes())/1024/1024);
|
||||
|
||||
if (params.ngram_size_n < 16) {
|
||||
LOG_WRN("%s: ngram_mod n=%d is too small - poor quality is possible, see: https://github.com/ggml-org/llama.cpp/pull/19164\n", __func__, params.ngram_size_n);
|
||||
}
|
||||
}
|
||||
|
||||
configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_MOD, params));
|
||||
}
|
||||
if (has_ngram_cache) {
|
||||
configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_CACHE, params));
|
||||
}
|
||||
@@ -758,6 +909,11 @@ common_speculative * common_speculative_init(
|
||||
));
|
||||
break;
|
||||
}
|
||||
case COMMON_SPECULATIVE_TYPE_NGRAM_MOD: {
|
||||
GGML_ASSERT(config.params.ngram_mod);
|
||||
impls.push_back(std::make_unique<common_speculative_state_ngram_mod>(config.type, *config.params.ngram_mod));
|
||||
break;
|
||||
}
|
||||
case COMMON_SPECULATIVE_TYPE_NGRAM_CACHE: {
|
||||
auto state = create_state_ngram_cache(
|
||||
params.lookup_cache_static, params.lookup_cache_dynamic, config);
|
||||
@@ -822,8 +978,7 @@ llama_tokens common_speculative_draft(
|
||||
|
||||
if (!result.empty()) {
|
||||
LOG_DBG("%s: called impl %s, hist size = %zu, call_count = %zu, gen = %zu\n", __func__,
|
||||
common_speculative_type_to_str(impl.get()->type).c_str(),
|
||||
prompt_tgt.size(),
|
||||
common_speculative_type_to_str(impl.get()->type).c_str(), prompt_tgt.size(),
|
||||
impl.get()->drafts_call_count, result.size());
|
||||
|
||||
spec->curr_impl = impl.get(); // set current implementation for stats
|
||||
@@ -869,6 +1024,7 @@ void common_speculative_print_stats(const common_speculative * spec) {
|
||||
str_perf = "";
|
||||
}
|
||||
|
||||
// TODO: report time for begin() and accept()
|
||||
LOG_INF("statistics %s: #calls = %zu, #gen drafts = %zu, #acc drafts = %zu, #gen tokens = %zu, #acc tokens = %zu%s\n",
|
||||
common_speculative_type_to_str(impl->type).c_str(),
|
||||
impl->drafts_call_count,
|
||||
|
||||
@@ -15,8 +15,8 @@ enum common_speculative_type common_speculative_type_from_name(const std::string
|
||||
std::string common_speculative_type_to_str(enum common_speculative_type type);
|
||||
|
||||
common_speculative * common_speculative_init(
|
||||
const common_params_speculative & params,
|
||||
llama_context * ctx_tgt);
|
||||
common_params_speculative & params,
|
||||
llama_context * ctx_tgt);
|
||||
|
||||
void common_speculative_free(common_speculative * spec);
|
||||
|
||||
|
||||
@@ -8806,6 +8806,7 @@ class GraniteMoeModel(GraniteModel):
|
||||
gate, up = data_torch.split(ffn_dim, dim=-2)
|
||||
yield from ModelBase.modify_tensors(self, gate, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), bid)
|
||||
yield from ModelBase.modify_tensors(self, up, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), bid)
|
||||
return
|
||||
|
||||
has_experts = bool(self.hparams.get('num_local_experts'))
|
||||
|
||||
|
||||
@@ -35,9 +35,9 @@ The following releases are verified and recommended:
|
||||
|
||||
|Commit ID|Tag|Release|Verified Platform| Update date|
|
||||
|-|-|-|-|-|
|
||||
|24e86cae7219b0f3ede1d5abdf5bf3ad515cccb8|b5377 |[llama-b5377-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b5377/llama-b5377-bin-win-sycl-x64.zip) |ArcB580/Linux/oneAPI 2025.1<br>LNL Arc GPU/Windows 11/oneAPI 2025.1.1|2025-05-15|
|
||||
|3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19|
|
||||
|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1||
|
||||
|24e86cae7219b0f3ede1d5abdf5bf3ad515cccb8|b5377 |[llama-b5377-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b5377/llama-b5377-bin-win-sycl-x64.zip) |Arc B580/Linux/oneAPI 2025.1<br>LNL Arc GPU/Windows 11/oneAPI 2025.1.1|2025-05-15|
|
||||
|3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc A770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19|
|
||||
|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc A770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1||
|
||||
|
||||
|
||||
## News
|
||||
@@ -51,7 +51,7 @@ The following releases are verified and recommended:
|
||||
|-|-|-|-|
|
||||
|PVC 1550|39|73|+87%|
|
||||
|Flex 170|39|50|+28%|
|
||||
|Arc770|42|55|+30%|
|
||||
|Arc A770|42|55|+30%|
|
||||
|MTL|13|16|+23%|
|
||||
|ARL-H|14|17|+21%|
|
||||
|
||||
@@ -62,7 +62,7 @@ The following releases are verified and recommended:
|
||||
- Use oneDNN as the default GEMM library, improve the compatibility for new Intel GPUs.
|
||||
|
||||
- 2024.5
|
||||
- Performance is increased: 34 -> 37 tokens/s of llama-2-7b.Q4_0 on Arc770.
|
||||
- Performance is increased: 34 -> 37 tokens/s of llama-2-7b.Q4_0 on Arc A770.
|
||||
- Arch Linux is verified successfully.
|
||||
|
||||
- 2024.4
|
||||
@@ -111,7 +111,8 @@ On older Intel GPUs, you may try [OpenCL](/docs/backend/OPENCL.md) although the
|
||||
|-------------------------------|---------|---------------------------------------|
|
||||
| Intel Data Center Max Series | Support | Max 1550, 1100 |
|
||||
| Intel Data Center Flex Series | Support | Flex 170 |
|
||||
| Intel Arc Series | Support | Arc 770, 730M, Arc A750, B580 |
|
||||
| Intel Arc A-Series | Support | Arc A770, Arc A730M, Arc A750 |
|
||||
| Intel Arc B-Series | Support | Arc B580 |
|
||||
| Intel built-in Arc GPU | Support | built-in Arc GPU in Meteor Lake, Arrow Lake, Lunar Lake |
|
||||
| Intel iGPU | Support | iGPU in 13700k, 13400, i5-1250P, i7-1260P, i7-1165G7 |
|
||||
|
||||
|
||||
@@ -1,5 +1,10 @@
|
||||
{
|
||||
"version": 4,
|
||||
"version": 5,
|
||||
"cmakeMinimumRequired": {
|
||||
"major": 3,
|
||||
"minor": 28,
|
||||
"patch": 0
|
||||
},
|
||||
"configurePresets": [
|
||||
{
|
||||
"name": "arm64-android-snapdragon",
|
||||
@@ -16,7 +21,9 @@
|
||||
"CMAKE_CXX_FLAGS_RELEASE": "-O3 -DNDEBUG",
|
||||
"CMAKE_C_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g",
|
||||
"CMAKE_CXX_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g",
|
||||
"HEXAGON_SDK_ROOT": "$env{HEXAGON_SDK_ROOT}",
|
||||
"CMAKE_PREFIX_PATH": "$env{OPENCL_SDK_ROOT}",
|
||||
"HEXAGON_SDK_ROOT": "$env{HEXAGON_SDK_ROOT}",
|
||||
"HEXAGON_TOOLS_ROOT": "$env{HEXAGON_TOOLS_ROOT}",
|
||||
"PREBUILT_LIB_DIR": "android_aarch64",
|
||||
"GGML_OPENMP": "OFF",
|
||||
"GGML_LLAMAFILE": "OFF",
|
||||
@@ -31,7 +38,15 @@
|
||||
"name": "arm64-windows-snapdragon",
|
||||
"inherits": [ "base", "arm64-windows-llvm" ],
|
||||
"cacheVariables": {
|
||||
"HEXAGON_SDK_ROOT": "$env{HEXAGON_SDK_ROOT}",
|
||||
"CMAKE_C_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -flto -D_GNU_SOURCE",
|
||||
"CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -flto -D_GNU_SOURCE",
|
||||
"CMAKE_C_FLAGS_RELEASE": "-O3 -DNDEBUG",
|
||||
"CMAKE_CXX_FLAGS_RELEASE": "-O3 -DNDEBUG",
|
||||
"CMAKE_C_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g",
|
||||
"CMAKE_CXX_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g",
|
||||
"CMAKE_PREFIX_PATH": "$env{OPENCL_SDK_ROOT}",
|
||||
"HEXAGON_SDK_ROOT": "$env{HEXAGON_SDK_ROOT}",
|
||||
"HEXAGON_TOOLS_ROOT": "$env{HEXAGON_TOOLS_ROOT}",
|
||||
"PREBUILT_LIB_DIR": "windows_aarch64",
|
||||
"GGML_OPENMP": "OFF",
|
||||
"GGML_LLAMAFILE": "OFF",
|
||||
@@ -1,6 +1,8 @@
|
||||
# Snapdragon-based Android devices
|
||||
# Snapdragon-based devices
|
||||
|
||||
## How to Build
|
||||
## Setup
|
||||
|
||||
### Android
|
||||
|
||||
The easiest way to build llama.cpp for a Snapdragon-based Android device is using the toolchain Docker image (see github.com/snapdragon-toolchain).
|
||||
This image includes Android NDK, OpenCL SDK, Hexagon SDK, CMake, etc.
|
||||
@@ -12,7 +14,24 @@ This method works on Linux, macOS, and Windows. macOS and Windows users should i
|
||||
[d]/> cd /workspace
|
||||
```
|
||||
|
||||
The rest of the Android build process assumes that you're running inside the toolchain container.
|
||||
Note: The rest of the **Android** build process assumes that you're running inside the toolchain container.
|
||||
|
||||
### Windows On Snapdragon
|
||||
|
||||
Native Windows 11 arm64 builds has the following tools dependencies:
|
||||
- MS Visual Studio 2026 (Community Edition or Pro)
|
||||
- MSVC arm64 standard and runtime libraries
|
||||
- UCRT and Driver Kit
|
||||
- LLVM core libraries and Clang compiler (winget)
|
||||
- CMake, Git, Python (winget)
|
||||
- Hexagon SDK Community Edition 6.4 or later (see windows.md)
|
||||
- OpenCL SDK 2.3 or later (see windows.md)
|
||||
|
||||
Note: The rest of the **Windows** build process assumes that you're running natively in Powershell.
|
||||
Adapt below build commands accordingly.
|
||||
|
||||
## How to Build
|
||||
|
||||
Let's build llama.cpp with CPU, OpenCL, and Hexagon backends via CMake presets:
|
||||
|
||||
```
|
||||
@@ -49,24 +68,26 @@ Preset CMake variables:
|
||||
To generate an installable "package" simply use cmake --install:
|
||||
|
||||
```
|
||||
[d]/workspace> cmake --install build-snapdragon --prefix pkg-adb/llama.cpp
|
||||
[d]/workspace> cmake --install build-snapdragon --prefix pkg-snapdragon/llama.cpp
|
||||
-- Install configuration: "Release"
|
||||
-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-cpu.so
|
||||
-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-opencl.so
|
||||
-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-hexagon.so
|
||||
-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-htp-v73.so
|
||||
-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-htp-v75.so
|
||||
-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-htp-v79.so
|
||||
-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-htp-v81.so
|
||||
-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml.so
|
||||
-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml-cpu.so
|
||||
-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml-opencl.so
|
||||
-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml-hexagon.so
|
||||
-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml-htp-v73.so
|
||||
-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml-htp-v75.so
|
||||
-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml-htp-v79.so
|
||||
-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml-htp-v81.so
|
||||
-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml.so
|
||||
...
|
||||
-- Installing: /workspace/pkg-adb/llama.cpp/bin/llama-bench
|
||||
-- Installing: /workspace/pkg-adb/llama.cpp/bin/llama-cli
|
||||
-- Installing: /workspace/pkg-snapdragon/llama.cpp/bin/llama-bench
|
||||
-- Installing: /workspace/pkg-snapdragon/llama.cpp/bin/llama-cli
|
||||
...
|
||||
```
|
||||
|
||||
## How to Install
|
||||
|
||||
### Android
|
||||
|
||||
For this step, your device needs to be configured for on-device development.
|
||||
Please see https://developer.android.com/studio/debug/dev-options for details.
|
||||
|
||||
@@ -74,10 +95,10 @@ Once ADB is enabled, use `adb push` to install `pkg-snapdragon` on the device.
|
||||
**Note that the toolchain Docker image doesn't have ADB and doesn't set up the ADB bridge. Please use native ADB on the host.**
|
||||
|
||||
```
|
||||
~/src/llama.cpp$ adb push pkg-adb/llama.cpp /data/local/tmp/
|
||||
pkg-adb/llama.cpp/bin/: 67 files pushed, 0 skipped. 190.2 MB/s (919095042 bytes in 4.607s)
|
||||
pkg-adb/llama.cpp/include/: 19 files pushed, 0 skipped. 20.5 MB/s (255173 bytes in 0.012s)
|
||||
pkg-adb/llama.cpp/lib/: 16 files pushed, 0 skipped. 144.4 MB/s (43801382 bytes in 0.289s)
|
||||
~/src/llama.cpp$ adb push pkg-snapdragon/llama.cpp /data/local/tmp/
|
||||
pkg-snapdragon/llama.cpp/bin/: 67 files pushed, 0 skipped. 190.2 MB/s (919095042 bytes in 4.607s)
|
||||
pkg-snapdragon/llama.cpp/include/: 19 files pushed, 0 skipped. 20.5 MB/s (255173 bytes in 0.012s)
|
||||
pkg-snapdragon/llama.cpp/lib/: 16 files pushed, 0 skipped. 144.4 MB/s (43801382 bytes in 0.289s)
|
||||
102 files pushed, 0 skipped. 186.9 MB/s (963151597 bytes in 4.914s)
|
||||
```
|
||||
|
||||
@@ -92,6 +113,11 @@ At this point, you should also install some models:
|
||||
Llama-3.2-1B-Instruct-Q4_0.gguf: 1 file pushed, 0 skipped. 38.3 MB/s (773025920 bytes in 19.250s)
|
||||
```
|
||||
|
||||
### Windows
|
||||
|
||||
All artifacts are already installed in the `pkg-snapdragon` folder.
|
||||
To run, adapt below instructions to use Powershell scrits in `scripts/snapdragon/windows`.
|
||||
|
||||
## How to Run
|
||||
|
||||
The easiest way to run llama.cpp cli tools is using provided wrapper scripts that properly set up all required environment variables.
|
||||
161
docs/backend/snapdragon/windows.md
Normal file
161
docs/backend/snapdragon/windows.md
Normal file
@@ -0,0 +1,161 @@
|
||||
## Overview
|
||||
|
||||
The document covers procedures for installing the latest GPU and NPU drivers, and OpenCL and Hexagon SDKs.
|
||||
|
||||
|
||||
In order to use Hexagon NPU on Snapdragon Windows devices the underlying HTP Ops libraries (e.g libggml-htp-v73.so)
|
||||
must be included in the .cat file digitally signed with a trusted certificate.
|
||||
|
||||
This document covers details on how to generate personal certificate files (.pfx) and how to configure the system
|
||||
to allow for test signatures (aka test-signing).
|
||||
|
||||
## Install the latest Adreno OpenCL SDK
|
||||
|
||||
Either use the trimmed down version (optimized for CI) from
|
||||
|
||||
https://github.com/snapdragon-toolchain/opencl-sdk/releases/download/v2.3.2/adreno-opencl-sdk-v2.3.2-arm64-wos.tar.xz
|
||||
|
||||
Or download the complete official version from
|
||||
|
||||
https://softwarecenter.qualcomm.com/catalog/item/Adreno_OpenCL_SDK?version=2.3.2
|
||||
|
||||
Unzip/untar the archive into
|
||||
```
|
||||
c:\Qualcomm\OpenCL_SDK\2.3.2
|
||||
```
|
||||
|
||||
## Install the latest Hexagon SDK Community Edition
|
||||
|
||||
Either use the trimmed down version (optimized for CI) from
|
||||
|
||||
https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v6.4.0.2/hexagon-sdk-v6.4.0.2-arm64-wos.tar.xz
|
||||
|
||||
Or download the complete official version from
|
||||
|
||||
https://softwarecenter.qualcomm.com/catalog/item/Hexagon_SDK?version=6.4.0.2
|
||||
|
||||
Unzip/untar the archive into
|
||||
```
|
||||
c:\Qualcomm\Hexagon_SDK\6.4.0.2
|
||||
```
|
||||
|
||||
## Install the latest Adreno GPU driver
|
||||
|
||||
Download the driver from
|
||||
|
||||
https://softwarecenter.qualcomm.com/catalog/item/Windows_Graphics_Driver
|
||||
|
||||
After the automated installation and reboot please make sure that the GPU device shows up in the `Device Manager` (under 'Display Adapters`)
|
||||
|
||||
## Install the latest Qualcomm NPU driver
|
||||
|
||||
Download the driver from
|
||||
|
||||
https://softwarecenter.qualcomm.com/catalog/item/Qualcomm_HND
|
||||
|
||||
After the automated installation and reboot please make sure that the Hexagon NPU device shows up in the `Device Manager` (under `Neural Processors`).
|
||||
|
||||
If the device is not available you can try installing all components (`qcnspmcdm8380`, `qcnspmcdm8380_ext`) manually.
|
||||
The components are extracted into
|
||||
```
|
||||
c:\QCDrivers\qcnspmcdm...
|
||||
```
|
||||
|
||||
## Enable NPU driver test signatures
|
||||
|
||||
Please note that the following steps are required only for the Hexagon NPU.
|
||||
Adreno GPU backend does not require test signatures.
|
||||
|
||||
### Enable testsigning
|
||||
|
||||
Use `bcdedit` to enable test-signing
|
||||
```
|
||||
> bcdedit /set TESTSIGNING ON
|
||||
```
|
||||
(Secure Boot may need to be disabled for this to work)
|
||||
|
||||
Make sure test-signing is enabled after reboot
|
||||
```
|
||||
> bcdedit /enum
|
||||
...
|
||||
testsigning Yes
|
||||
...
|
||||
```
|
||||
For additional details see Microsoft guide at
|
||||
|
||||
https://learn.microsoft.com/en-us/windows-hardware/drivers/install/the-testsigning-boot-configuration-option
|
||||
|
||||
### Create personal certificate
|
||||
|
||||
The tools required for this procedure are available as part of Windows SDK and Windows Driver Kit which should be
|
||||
installed as part of the MS Visual Studio.
|
||||
They are typically located at
|
||||
```
|
||||
c:\Program Files (x86)\Windows Kits\10\bin\10.0.26100.0
|
||||
```
|
||||
(replace 10.0.26100.0 with correct version).
|
||||
|
||||
To create personal self-signed certificate run the following commands (either from cmd or power-shell):
|
||||
```
|
||||
> cd c:\Users\MyUser
|
||||
> mkdir Certs
|
||||
> cd Certs
|
||||
> makecert -r -pe -ss PrivateCertStore -n CN=GGML.HTP.v1 -eku 1.3.6.1.5.5.7.3.3 -sv ggml-htp-v1.pvk ggml-htp-v1.cer
|
||||
> pvk2pfx.exe -pvk ggml-htp-v1.pvk -spc ggml-htp-v1.cer -pfx ggml-htp-v1.pfx
|
||||
```
|
||||
(replace `MyUser` with your username).
|
||||
|
||||
Add this certificate to `Trusted Root Certification Authorities` and `Trusted Publishers` stores.
|
||||
This can be done using `certlm` Certificate Manager tool.
|
||||
Right click on the certificate store, select `All Tasks -> Import` and follow the prompts to import the certificate from the
|
||||
PFX file you created above.
|
||||
|
||||
For additional details see Microsoft guide at
|
||||
|
||||
https://learn.microsoft.com/en-us/windows-hardware/drivers/install/introduction-to-test-signing
|
||||
|
||||
Make sure to save the PFX file, you will need it for the build procedures.
|
||||
Please note that the same certificate can be used for signing any number of builds.
|
||||
|
||||
## Build Hexagon backend with signed HTP ops libraries
|
||||
|
||||
The overall Hexagon backend build procedure for Windows on Snapdragon is the same as for other platforms.
|
||||
However, additional settings are required for generating and signing HTP Ops libraries.
|
||||
```
|
||||
> $env:OPENCL_SDK_ROOT="C:\Qualcomm\OpenCL_SDK\2.3.2"
|
||||
> $env:HEXAGON_SDK_ROOT="C:\Qualcomm\Hexagon_SDK\6.4.0.2"
|
||||
> $env:HEXAGON_TOOLS_ROOT="C:\Qualcomm\Hexagon_SDK\6.4.0.2\tools\HEXAGON_Tools\19.0.04"
|
||||
> $env:HEXAGON_HTP_CERT="c:\Users\MyUsers\Certs\ggml-htp-v1.pfx"
|
||||
> $env:WINDOWS_SDK_BIN="C:\Program Files (x86)\Windows Kits\10\bin\10.0.26100.0\arm64"
|
||||
|
||||
> cmake --preset arm64-windows-snapdragon -B build-wos
|
||||
...
|
||||
> cmake --install build-wos --prefix pkg-snapdragon
|
||||
```
|
||||
|
||||
Once the build is complete HTP ops libraries will be installed like this
|
||||
```
|
||||
> dir pkg-snapdragon/lib
|
||||
...
|
||||
-a---- 1/22/2026 6:01 PM 187656 libggml-htp-v73.so
|
||||
-a---- 1/22/2026 6:01 PM 191752 libggml-htp-v75.so
|
||||
-a---- 1/22/2026 6:01 PM 187656 libggml-htp-v79.so
|
||||
-a---- 1/22/2026 6:01 PM 187656 libggml-htp-v81.so
|
||||
-a---- 1/22/2026 6:01 PM 4139 libggml-htp.cat
|
||||
```
|
||||
|
||||
The .cat file, the signature and proper certicate installation can be verified with
|
||||
|
||||
```
|
||||
> signtool.exe verify /v /pa .\pkg-snapdragon\lib\libggml-htp.cat
|
||||
Verifying: .\pkg-snapdragon\lib\libggml-htp.cat
|
||||
|
||||
Signature Index: 0 (Primary Signature)
|
||||
Hash of file (sha256): 9820C664DA59D5EAE31DBB664127FCDAEF59CDC31502496BC567544EC2F401CF
|
||||
|
||||
Signing Certificate Chain:
|
||||
Issued to: GGML.HTP.v1
|
||||
...
|
||||
Successfully verified: .\pkg-snapdragon\lib\libggml-htp.cat
|
||||
...
|
||||
```
|
||||
@@ -97,7 +97,7 @@ Legend:
|
||||
| SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||
| SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ |
|
||||
| SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
|
||||
| SOLVE_TRI | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
|
||||
@@ -114,7 +114,7 @@ Legend:
|
||||
| TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| TOP_K | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| TRI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||
| TRI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| TRUNC | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||
| UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ | ❌ |
|
||||
| XIELU | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
|
||||
@@ -29,8 +29,8 @@
|
||||
"SYCL0","EXP","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","EXPM1","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","SYCL"
|
||||
"SYCL0","EXPM1","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","SYCL"
|
||||
"SYCL0","SOFTPLUS","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","SYCL"
|
||||
"SYCL0","SOFTPLUS","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","SYCL"
|
||||
"SYCL0","SOFTPLUS","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","SOFTPLUS","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","GELU_ERF","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","GELU_ERF","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","FLOOR","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
|
||||
@@ -71,8 +71,8 @@
|
||||
"SYCL0","EXP","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","EXPM1","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
|
||||
"SYCL0","EXPM1","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
|
||||
"SYCL0","SOFTPLUS","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
|
||||
"SYCL0","SOFTPLUS","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
|
||||
"SYCL0","SOFTPLUS","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","SOFTPLUS","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","GELU_ERF","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","GELU_ERF","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","FLOOR","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
|
||||
@@ -113,8 +113,8 @@
|
||||
"SYCL0","EXP","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","EXPM1","type=f32,ne_a=[128,2,2,2],v=0","support","0","no","SYCL"
|
||||
"SYCL0","EXPM1","type=f32,ne_a=[5,7,11,13],v=0","support","0","no","SYCL"
|
||||
"SYCL0","SOFTPLUS","type=f32,ne_a=[128,2,2,2],v=0","support","0","no","SYCL"
|
||||
"SYCL0","SOFTPLUS","type=f32,ne_a=[5,7,11,13],v=0","support","0","no","SYCL"
|
||||
"SYCL0","SOFTPLUS","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","SOFTPLUS","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","GELU_ERF","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","GELU_ERF","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","FLOOR","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
|
||||
@@ -155,8 +155,8 @@
|
||||
"SYCL0","EXP","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","EXPM1","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
|
||||
"SYCL0","EXPM1","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
|
||||
"SYCL0","SOFTPLUS","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
|
||||
"SYCL0","SOFTPLUS","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
|
||||
"SYCL0","SOFTPLUS","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","SOFTPLUS","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","GELU_ERF","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","GELU_ERF","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","FLOOR","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
|
||||
@@ -10052,10 +10052,10 @@
|
||||
"SYCL0","CUMSUM","type=f32,ne=[375960,1,1,1]","support","0","no","SYCL"
|
||||
"SYCL0","CUMSUM","type=f32,ne=[20481,4,1,1]","support","0","no","SYCL"
|
||||
"SYCL0","XIELU","type=f32,ne=[10,5,4,3]","support","0","no","SYCL"
|
||||
"SYCL0","TRI","type=f32,ne=[10,10,4,3],tri_type=3","support","0","no","SYCL"
|
||||
"SYCL0","TRI","type=f32,ne=[10,10,4,3],tri_type=2","support","0","no","SYCL"
|
||||
"SYCL0","TRI","type=f32,ne=[10,10,4,3],tri_type=1","support","0","no","SYCL"
|
||||
"SYCL0","TRI","type=f32,ne=[10,10,4,3],tri_type=0","support","0","no","SYCL"
|
||||
"SYCL0","TRI","type=f32,ne=[10,10,4,3],tri_type=3","support","1","yes","SYCL"
|
||||
"SYCL0","TRI","type=f32,ne=[10,10,4,3],tri_type=2","support","1","yes","SYCL"
|
||||
"SYCL0","TRI","type=f32,ne=[10,10,4,3],tri_type=1","support","1","yes","SYCL"
|
||||
"SYCL0","TRI","type=f32,ne=[10,10,4,3],tri_type=0","support","1","yes","SYCL"
|
||||
"SYCL0","FILL","type=f32,ne=[10,10,4,3],c=0.000000","support","0","no","SYCL"
|
||||
"SYCL0","FILL","type=f32,ne=[303,207,11,3],c=2.000000","support","0","no","SYCL"
|
||||
"SYCL0","FILL","type=f32,ne=[800,600,4,4],c=-152.000000","support","0","no","SYCL"
|
||||
|
||||
|
Can't render this file because it is too large.
|
2
examples/compare-mlx/.gitignore
vendored
Normal file
2
examples/compare-mlx/.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
*.txt
|
||||
*/
|
||||
3
examples/compare-mlx/cmd.sh
Executable file
3
examples/compare-mlx/cmd.sh
Executable file
@@ -0,0 +1,3 @@
|
||||
#!/bin/bash
|
||||
|
||||
./compare-mlx.sh --raw-path ../../build/wikitext-2-raw/wiki.test.raw --no-keep
|
||||
707
examples/compare-mlx/compare-mlx.sh
Executable file
707
examples/compare-mlx/compare-mlx.sh
Executable file
@@ -0,0 +1,707 @@
|
||||
#!/bin/bash
|
||||
|
||||
# a script to compare MLX and GGUF models
|
||||
#
|
||||
# usage:
|
||||
# ./compare-mlx.sh --raw-path wiki.test.raw --no-keep
|
||||
#
|
||||
# TODOs
|
||||
# - add QAT evals
|
||||
|
||||
# check if LLAMA_HOME_DIR is set
|
||||
if [[ -z "$LLAMA_HOME_DIR" ]]; then
|
||||
lcpp_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")"/../../ && pwd)
|
||||
else
|
||||
lcpp_dir="${LLAMA_HOME_DIR}"
|
||||
fi
|
||||
|
||||
echo "Using llama.cpp directory: ${lcpp_dir}"
|
||||
|
||||
# check for convert_hf_to_gguf.py
|
||||
if [[ ! -f "${lcpp_dir}/convert_hf_to_gguf.py" ]]; then
|
||||
echo "convert_hf_to_gguf.py not found in ${lcpp_dir}"
|
||||
echo "Set a LLAMA_HOME_DIR environment variable to point to your llama.cpp directory"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
set -x
|
||||
|
||||
# sanity checks that all Python dependencies are installed
|
||||
if ! python -c "import mlx.core"; then
|
||||
echo "MLX not found. Please install MLX"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if ! python ${lcpp_dir}/convert_hf_to_gguf.py --help; then
|
||||
echo "convert_hf_to_gguf.py not working. Please install llama.cpp python requirements"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# by default use the system binaries (for example from brew)
|
||||
llama_perplexity="llama-perplexity"
|
||||
|
||||
if [[ ! -z "$LLAMA_PERPLEXITY" ]]; then
|
||||
llama_perplexity="$LLAMA_PERPLEXITY"
|
||||
fi
|
||||
|
||||
echo "Using llama-perplexity: ${llama_perplexity}"
|
||||
|
||||
if ! command -v "$llama_perplexity" &> /dev/null; then
|
||||
echo "llama-perplexity not found. Please install it."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
llama_quantize="llama-quantize"
|
||||
|
||||
if [[ ! -z "$LLAMA_QUANTIZE" ]]; then
|
||||
llama_quantize="$LLAMA_QUANTIZE"
|
||||
fi
|
||||
|
||||
echo "Using llama-quantize: ${llama_quantize}"
|
||||
|
||||
if ! command -v "$llama_quantize" &> /dev/null; then
|
||||
echo "llama-quantize not found. Please install it."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
llama_batched_bench="llama-batched-bench"
|
||||
|
||||
if [[ ! -z "$LLAMA_BATCHED_BENCH" ]]; then
|
||||
llama_batched_bench="$LLAMA_BATCHED_BENCH"
|
||||
fi
|
||||
|
||||
echo "Using llama-batched-bench: ${llama_batched_bench}"
|
||||
|
||||
if ! command -v "$llama_batched_bench" &> /dev/null; then
|
||||
echo "llama-batched-bench not found. Please install it."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# get the size in GiB
|
||||
get_size() {
|
||||
local path="$1"
|
||||
local bytes=$(du -s "$path" | awk '{print $1}')
|
||||
local res=$(echo "scale=3; ($bytes*512)/1024/1024/1024" | bc)
|
||||
echo "$res"
|
||||
}
|
||||
|
||||
# parameters:
|
||||
# --no-compute : do not compute anything, just summarize the existing results
|
||||
# --no-ppl : do not compute ppl
|
||||
# --no-perf : do not compute performance (speed) metrics
|
||||
# --no-keep : delete intermediate model files
|
||||
# --num-samples : number of text samples to evaluate (default: 512)
|
||||
# --sequence-length : sequence length of the samples in tokens (default: 512)
|
||||
# --raw-path : file with raw text (such as wikitext)
|
||||
|
||||
# extra agruments
|
||||
args_lcpp="-t 1"
|
||||
|
||||
num_samples=512
|
||||
sequence_length=512
|
||||
raw_path=""
|
||||
no_compute=false
|
||||
no_ppl=false
|
||||
no_perf=false
|
||||
no_keep=false
|
||||
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case $1 in
|
||||
--no-compute)
|
||||
no_compute=true
|
||||
shift
|
||||
;;
|
||||
--no-ppl)
|
||||
no_ppl=true
|
||||
shift
|
||||
;;
|
||||
--no-perf)
|
||||
no_perf=true
|
||||
shift
|
||||
;;
|
||||
--no-keep)
|
||||
no_keep=true
|
||||
shift
|
||||
;;
|
||||
--num-samples)
|
||||
num_samples="$2"
|
||||
shift 2
|
||||
;;
|
||||
--sequence-length)
|
||||
sequence_length="$2"
|
||||
shift 2
|
||||
;;
|
||||
--raw-path)
|
||||
raw_path="$2"
|
||||
shift 2
|
||||
;;
|
||||
*)
|
||||
echo "Unknown parameter: $1"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
if [[ -z "$raw_path" ]]; then
|
||||
echo "No raw path provided"
|
||||
echo "Recommended to use the test set of WikiText from here: https://github.com/ggml-org/llama.cpp/blob/master/scripts/get-wikitext-2.sh"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
eval_model() {
|
||||
org="$1"
|
||||
mid="$2"
|
||||
|
||||
echo "Evaluating ${org}/${mid}"
|
||||
|
||||
huggingface-cli download ${org}/${mid} --local-dir ${org}/${mid}
|
||||
|
||||
# generate and process MLX models
|
||||
|
||||
if [[ "$no_compute" == true ]]; then
|
||||
echo "Skipping computation"
|
||||
else
|
||||
rm -rfv ./${mid}-f32-mlx
|
||||
mlx_lm.convert --hf ./${org}/${mid} --mlx-path ./${mid}-f32-mlx --dtype float32
|
||||
get_size ./${mid}-f32-mlx > ./${mid}-f32-mlx-size.txt
|
||||
|
||||
if [[ "$no_ppl" == false ]]; then
|
||||
python ./mlx-ppl.py --model ./${mid}-f32-mlx --raw-path "$raw_path" --num-samples "$num_samples" --sequence-length "$sequence_length" 2>&1 | tee ./${mid}-f32-mlx-ppl.txt
|
||||
fi
|
||||
|
||||
# no need for F32 perf benchmarks
|
||||
#if [[ "$no_perf" == false ]]; then
|
||||
# mlx_lm.benchmark --model ./${mid}-f32-mlx -p 2048 -g 128 --num-trials 1 2>&1 | tee ./${mid}-f32-mlx-perf-2048.txt
|
||||
# mlx_lm.benchmark --model ./${mid}-f32-mlx -p 4096 -g 128 --num-trials 1 2>&1 | tee ./${mid}-f32-mlx-perf-4096.txt
|
||||
# mlx_lm.benchmark --model ./${mid}-f32-mlx -p 8192 -g 128 --num-trials 1 2>&1 | tee ./${mid}-f32-mlx-perf-8192.txt
|
||||
# mlx_lm.benchmark --model ./${mid}-f32-mlx -p 16384 -g 128 --num-trials 1 2>&1 | tee ./${mid}-f32-mlx-perf-16384.txt
|
||||
# mlx_lm.benchmark --model ./${mid}-f32-mlx -p 32768 -g 128 --num-trials 1 2>&1 | tee ./${mid}-f32-mlx-perf-32768.txt
|
||||
#fi
|
||||
|
||||
if [[ "$no_keep" == true ]]; then
|
||||
echo "Deleting intermediate model files"
|
||||
rm -rfv ./${mid}-f32-mlx
|
||||
fi
|
||||
|
||||
rm -rfv ./${mid}-bf16-mlx
|
||||
mlx_lm.convert --hf ./${org}/${mid} --mlx-path ./${mid}-bf16-mlx --dtype bfloat16
|
||||
get_size ./${mid}-bf16-mlx > ./${mid}-bf16-mlx-size.txt
|
||||
|
||||
if [[ "$no_ppl" == false ]]; then
|
||||
python ./mlx-ppl.py --model ./${mid}-bf16-mlx --raw-path "$raw_path" --num-samples "$num_samples" --sequence-length "$sequence_length" 2>&1 | tee ./${mid}-bf16-mlx-ppl.txt
|
||||
fi
|
||||
|
||||
if [[ "$no_perf" == false ]]; then
|
||||
mlx_lm.benchmark --model ./${mid}-bf16-mlx -p 2048 -g 128 --num-trials 1 2>&1 | tee ./${mid}-bf16-mlx-perf-2048.txt
|
||||
mlx_lm.benchmark --model ./${mid}-bf16-mlx -p 4096 -g 128 --num-trials 1 2>&1 | tee ./${mid}-bf16-mlx-perf-4096.txt
|
||||
mlx_lm.benchmark --model ./${mid}-bf16-mlx -p 8192 -g 128 --num-trials 1 2>&1 | tee ./${mid}-bf16-mlx-perf-8192.txt
|
||||
mlx_lm.benchmark --model ./${mid}-bf16-mlx -p 16384 -g 128 --num-trials 1 2>&1 | tee ./${mid}-bf16-mlx-perf-16384.txt
|
||||
mlx_lm.benchmark --model ./${mid}-bf16-mlx -p 32768 -g 128 --num-trials 1 2>&1 | tee ./${mid}-bf16-mlx-perf-32768.txt
|
||||
fi
|
||||
|
||||
if [[ "$no_keep" == true ]]; then
|
||||
echo "Deleting intermediate model files"
|
||||
rm -rfv ./${mid}-bf16-mlx
|
||||
fi
|
||||
|
||||
rm -rfv ./${mid}-f16-mlx
|
||||
mlx_lm.convert --hf ./${org}/${mid} --mlx-path ./${mid}-f16-mlx --dtype float16
|
||||
get_size ./${mid}-f16-mlx > ./${mid}-f16-mlx-size.txt
|
||||
|
||||
if [[ "$no_ppl" == false ]]; then
|
||||
python ./mlx-ppl.py --model ./${mid}-f16-mlx --raw-path "$raw_path" --num-samples "$num_samples" --sequence-length "$sequence_length" 2>&1 | tee ./${mid}-f16-mlx-ppl.txt
|
||||
fi
|
||||
|
||||
if [[ "$no_perf" == false ]]; then
|
||||
mlx_lm.benchmark --model ./${mid}-f16-mlx -p 2048 -g 128 --num-trials 1 2>&1 | tee ./${mid}-f16-mlx-perf-2048.txt
|
||||
mlx_lm.benchmark --model ./${mid}-f16-mlx -p 4096 -g 128 --num-trials 1 2>&1 | tee ./${mid}-f16-mlx-perf-4096.txt
|
||||
mlx_lm.benchmark --model ./${mid}-f16-mlx -p 8192 -g 128 --num-trials 1 2>&1 | tee ./${mid}-f16-mlx-perf-8192.txt
|
||||
mlx_lm.benchmark --model ./${mid}-f16-mlx -p 16384 -g 128 --num-trials 1 2>&1 | tee ./${mid}-f16-mlx-perf-16384.txt
|
||||
mlx_lm.benchmark --model ./${mid}-f16-mlx -p 32768 -g 128 --num-trials 1 2>&1 | tee ./${mid}-f16-mlx-perf-32768.txt
|
||||
fi
|
||||
|
||||
if [[ "$no_keep" == true ]]; then
|
||||
echo "Deleting intermediate model files"
|
||||
rm -rfv ./${mid}-f16-mlx
|
||||
fi
|
||||
|
||||
rm -rfv ./${mid}-q8-mlx
|
||||
mlx_lm.convert --hf ./${org}/${mid} --mlx-path ./${mid}-q8-mlx --quantize --q-bits 8 --dtype float16
|
||||
get_size ./${mid}-q8-mlx > ./${mid}-q8-mlx-size.txt
|
||||
|
||||
if [[ "$no_ppl" == false ]]; then
|
||||
python ./mlx-ppl.py --model ./${mid}-q8-mlx --raw-path "$raw_path" --num-samples "$num_samples" --sequence-length "$sequence_length" 2>&1 | tee ./${mid}-q8-mlx-ppl.txt
|
||||
fi
|
||||
|
||||
if [[ "$no_perf" == false ]]; then
|
||||
mlx_lm.benchmark --model ./${mid}-q8-mlx -p 2048 -g 128 --num-trials 1 2>&1 | tee ./${mid}-q8-mlx-perf-2048.txt
|
||||
mlx_lm.benchmark --model ./${mid}-q8-mlx -p 4096 -g 128 --num-trials 1 2>&1 | tee ./${mid}-q8-mlx-perf-4096.txt
|
||||
mlx_lm.benchmark --model ./${mid}-q8-mlx -p 8192 -g 128 --num-trials 1 2>&1 | tee ./${mid}-q8-mlx-perf-8192.txt
|
||||
mlx_lm.benchmark --model ./${mid}-q8-mlx -p 16384 -g 128 --num-trials 1 2>&1 | tee ./${mid}-q8-mlx-perf-16384.txt
|
||||
mlx_lm.benchmark --model ./${mid}-q8-mlx -p 32768 -g 128 --num-trials 1 2>&1 | tee ./${mid}-q8-mlx-perf-32768.txt
|
||||
fi
|
||||
|
||||
if [[ "$no_keep" == true ]]; then
|
||||
echo "Deleting intermediate model files"
|
||||
rm -rfv ./${mid}-q8-mlx
|
||||
fi
|
||||
|
||||
#rm -rfv ./${mid}-q6-mlx
|
||||
#mlx_lm.convert --hf ./${org}/${mid} --mlx-path ./${mid}-q6-mlx --quantize --q-bits 6 --dtype float16
|
||||
#get_size ./${mid}-q6-mlx > ./${mid}-q6-mlx-size.txt
|
||||
|
||||
#if [[ "$no_ppl" == false ]]; then
|
||||
# python ./mlx-ppl.py --model ./${mid}-q6-mlx --raw-path "$raw_path" --num-samples "$num_samples" --sequence-length "$sequence_length" 2>&1 | tee ./${mid}-q6-mlx-ppl.txt
|
||||
#fi
|
||||
|
||||
#if [[ "$no_perf" == false ]]; then
|
||||
# mlx_lm.benchmark --model ./${mid}-q6-mlx -p 2048 -g 128 --num-trials 1 2>&1 | tee ./${mid}-q6-mlx-perf-2048.txt
|
||||
# mlx_lm.benchmark --model ./${mid}-q6-mlx -p 4096 -g 128 --num-trials 1 2>&1 | tee ./${mid}-q6-mlx-perf-4096.txt
|
||||
# mlx_lm.benchmark --model ./${mid}-q6-mlx -p 8192 -g 128 --num-trials 1 2>&1 | tee ./${mid}-q6-mlx-perf-8192.txt
|
||||
# mlx_lm.benchmark --model ./${mid}-q6-mlx -p 16384 -g 128 --num-trials 1 2>&1 | tee ./${mid}-q6-mlx-perf-16384.txt
|
||||
# mlx_lm.benchmark --model ./${mid}-q6-mlx -p 32768 -g 128 --num-trials 1 2>&1 | tee ./${mid}-q6-mlx-perf-32768.txt
|
||||
#fi
|
||||
|
||||
#if [[ "$no_keep" == true ]]; then
|
||||
# echo "Deleting intermediate model files"
|
||||
# rm -rfv ./${mid}-q6-mlx
|
||||
#fi
|
||||
|
||||
#rm -rfv ./${mid}-q5-mlx
|
||||
#mlx_lm.convert --hf ./${org}/${mid} --mlx-path ./${mid}-q5-mlx --quantize --q-bits 5 --dtype float16
|
||||
#get_size ./${mid}-q5-mlx > ./${mid}-q5-mlx-size.txt
|
||||
|
||||
#if [[ "$no_ppl" == false ]]; then
|
||||
# python ./mlx-ppl.py --model ./${mid}-q5-mlx --raw-path "$raw_path" --num-samples "$num_samples" --sequence-length "$sequence_length" 2>&1 | tee ./${mid}-q5-mlx-ppl.txt
|
||||
#fi
|
||||
|
||||
#if [[ "$no_perf" == false ]]; then
|
||||
# mlx_lm.benchmark --model ./${mid}-q5-mlx -p 2048 -g 128 --num-trials 1 2>&1 | tee ./${mid}-q5-mlx-perf-2048.txt
|
||||
# mlx_lm.benchmark --model ./${mid}-q5-mlx -p 4096 -g 128 --num-trials 1 2>&1 | tee ./${mid}-q5-mlx-perf-4096.txt
|
||||
# mlx_lm.benchmark --model ./${mid}-q5-mlx -p 8192 -g 128 --num-trials 1 2>&1 | tee ./${mid}-q5-mlx-perf-8192.txt
|
||||
# mlx_lm.benchmark --model ./${mid}-q5-mlx -p 16384 -g 128 --num-trials 1 2>&1 | tee ./${mid}-q5-mlx-perf-16384.txt
|
||||
# mlx_lm.benchmark --model ./${mid}-q5-mlx -p 32768 -g 128 --num-trials 1 2>&1 | tee ./${mid}-q5-mlx-perf-32768.txt
|
||||
#fi
|
||||
|
||||
#if [[ "$no_keep" == true ]]; then
|
||||
# echo "Deleting intermediate model files"
|
||||
# rm -rfv ./${mid}-q5-mlx
|
||||
#fi
|
||||
|
||||
# I think this is something similar to q4_k
|
||||
rm -rfv ./${mid}-q4p-mlx
|
||||
mlx_lm.convert --hf ./${org}/${mid} --mlx-path ./${mid}-q4p-mlx --quantize --quant-predicate mixed_4_6 --dtype float16
|
||||
get_size ./${mid}-q4p-mlx > ./${mid}-q4p-mlx-size.txt
|
||||
|
||||
if [[ "$no_ppl" == false ]]; then
|
||||
python ./mlx-ppl.py --model ./${mid}-q4p-mlx --raw-path "$raw_path" --num-samples "$num_samples" --sequence-length "$sequence_length" 2>&1 | tee ./${mid}-q4p-mlx-ppl.txt
|
||||
fi
|
||||
|
||||
if [[ "$no_perf" == false ]]; then
|
||||
mlx_lm.benchmark --model ./${mid}-q4p-mlx -p 2048 -g 128 --num-trials 1 2>&1 | tee ./${mid}-q4p-mlx-perf-2048.txt
|
||||
mlx_lm.benchmark --model ./${mid}-q4p-mlx -p 4096 -g 128 --num-trials 1 2>&1 | tee ./${mid}-q4p-mlx-perf-4096.txt
|
||||
mlx_lm.benchmark --model ./${mid}-q4p-mlx -p 8192 -g 128 --num-trials 1 2>&1 | tee ./${mid}-q4p-mlx-perf-8192.txt
|
||||
mlx_lm.benchmark --model ./${mid}-q4p-mlx -p 16384 -g 128 --num-trials 1 2>&1 | tee ./${mid}-q4p-mlx-perf-16384.txt
|
||||
mlx_lm.benchmark --model ./${mid}-q4p-mlx -p 32768 -g 128 --num-trials 1 2>&1 | tee ./${mid}-q4p-mlx-perf-32768.txt
|
||||
fi
|
||||
|
||||
if [[ "$no_keep" == true ]]; then
|
||||
echo "Deleting intermediate model files"
|
||||
rm -rfv ./${mid}-q4p-mlx
|
||||
fi
|
||||
|
||||
rm -rfv ./${mid}-q4-mlx
|
||||
mlx_lm.convert --hf ./${org}/${mid} --mlx-path ./${mid}-q4-mlx --quantize --q-bits 4 --dtype float16
|
||||
get_size ./${mid}-q4-mlx > ./${mid}-q4-mlx-size.txt
|
||||
|
||||
if [[ "$no_ppl" == false ]]; then
|
||||
python ./mlx-ppl.py --model ./${mid}-q4-mlx --raw-path "$raw_path" --num-samples "$num_samples" --sequence-length "$sequence_length" 2>&1 | tee ./${mid}-q4-mlx-ppl.txt
|
||||
fi
|
||||
|
||||
if [[ "$no_perf" == false ]]; then
|
||||
mlx_lm.benchmark --model ./${mid}-q4-mlx -p 2048 -g 128 --num-trials 1 2>&1 | tee ./${mid}-q4-mlx-perf-2048.txt
|
||||
mlx_lm.benchmark --model ./${mid}-q4-mlx -p 4096 -g 128 --num-trials 1 2>&1 | tee ./${mid}-q4-mlx-perf-4096.txt
|
||||
mlx_lm.benchmark --model ./${mid}-q4-mlx -p 8192 -g 128 --num-trials 1 2>&1 | tee ./${mid}-q4-mlx-perf-8192.txt
|
||||
mlx_lm.benchmark --model ./${mid}-q4-mlx -p 16384 -g 128 --num-trials 1 2>&1 | tee ./${mid}-q4-mlx-perf-16384.txt
|
||||
mlx_lm.benchmark --model ./${mid}-q4-mlx -p 32768 -g 128 --num-trials 1 2>&1 | tee ./${mid}-q4-mlx-perf-32768.txt
|
||||
fi
|
||||
|
||||
if [[ "$no_keep" == true ]]; then
|
||||
echo "Deleting intermediate model files"
|
||||
rm -rfv ./${mid}-q4-mlx
|
||||
fi
|
||||
|
||||
rm -rfv ./${mid}-q3-mlx
|
||||
mlx_lm.convert --hf ./${org}/${mid} --mlx-path ./${mid}-q3-mlx --quantize --q-bits 3 --dtype float16
|
||||
get_size ./${mid}-q3-mlx > ./${mid}-q3-mlx-size.txt
|
||||
|
||||
if [[ "$no_ppl" == false ]]; then
|
||||
python ./mlx-ppl.py --model ./${mid}-q3-mlx --raw-path "$raw_path" --num-samples "$num_samples" --sequence-length "$sequence_length" 2>&1 | tee ./${mid}-q3-mlx-ppl.txt
|
||||
fi
|
||||
|
||||
if [[ "$no_perf" == false ]]; then
|
||||
mlx_lm.benchmark --model ./${mid}-q3-mlx -p 2048 -g 128 --num-trials 1 2>&1 | tee ./${mid}-q3-mlx-perf-2048.txt
|
||||
mlx_lm.benchmark --model ./${mid}-q3-mlx -p 4096 -g 128 --num-trials 1 2>&1 | tee ./${mid}-q3-mlx-perf-4096.txt
|
||||
mlx_lm.benchmark --model ./${mid}-q3-mlx -p 8192 -g 128 --num-trials 1 2>&1 | tee ./${mid}-q3-mlx-perf-8192.txt
|
||||
mlx_lm.benchmark --model ./${mid}-q3-mlx -p 16384 -g 128 --num-trials 1 2>&1 | tee ./${mid}-q3-mlx-perf-16384.txt
|
||||
mlx_lm.benchmark --model ./${mid}-q3-mlx -p 32768 -g 128 --num-trials 1 2>&1 | tee ./${mid}-q3-mlx-perf-32768.txt
|
||||
fi
|
||||
|
||||
if [[ "$no_keep" == true ]]; then
|
||||
echo "Deleting intermediate model files"
|
||||
rm -rfv ./${mid}-q3-mlx
|
||||
fi
|
||||
fi
|
||||
|
||||
# generate and process llama.cpp GGUF models
|
||||
|
||||
if [[ "$no_compute" == true ]]; then
|
||||
echo "Skipping computation"
|
||||
else
|
||||
# the F32 model is the reference - we generate all other models from it
|
||||
mkdir -p ./${mid}-f32-gguf
|
||||
python ${lcpp_dir}/convert_hf_to_gguf.py ./${org}/${mid} --outtype f32 --outfile ./${mid}-f32-gguf/model.gguf
|
||||
get_size ./${mid}-f32-gguf > ./${mid}-f32-gguf-size.txt
|
||||
|
||||
if [[ "$no_ppl" == false ]]; then
|
||||
${llama_perplexity} $args_lcpp -m ./${mid}-f32-gguf/model.gguf -f "$raw_path" --chunks "${num_samples}" -c "${sequence_length}" 2>&1 | tee ./${mid}-f32-gguf-ppl.txt
|
||||
fi
|
||||
|
||||
# no need for F32 perf benchmarks
|
||||
#if [[ "$no_perf" == false ]]; then
|
||||
# ${llama_batched_bench} $args_lcpp -m ./${mid}-f32-gguf/model.gguf -c 33768 -b 2048 -ub 2048 -npp 2048,4096,8192,16384,32768 -ntg 128 -npl 1 2>&1 | tee ./${mid}-f32-gguf-perf.txt
|
||||
#fi
|
||||
|
||||
# this requires to explicitly build llama.cpp with BF16 support
|
||||
rm -rfv ./${mid}-bf16-gguf && mkdir -p ./${mid}-bf16-gguf
|
||||
${llama_quantize} ./${mid}-f32-gguf/model.gguf ./${mid}-bf16-gguf/model.gguf bf16
|
||||
get_size ./${mid}-bf16-gguf > ./${mid}-bf16-gguf-size.txt
|
||||
|
||||
if [[ "$no_ppl" == false ]]; then
|
||||
${llama_perplexity} $args_lcpp -m ./${mid}-bf16-gguf/model.gguf -f "$raw_path" --chunks "${num_samples}" -c "${sequence_length}" 2>&1 | tee ./${mid}-bf16-gguf-ppl.txt
|
||||
fi
|
||||
|
||||
if [[ "$no_perf" == false ]]; then
|
||||
${llama_batched_bench} $args_lcpp -m ./${mid}-bf16-gguf/model.gguf -c 33768 -b 2048 -ub 2048 -npp 2048,4096,8192,16384,32768 -ntg 128 -npl 1 2>&1 | tee ./${mid}-bf16-gguf-perf.txt
|
||||
fi
|
||||
|
||||
if [[ "$no_keep" == true ]]; then
|
||||
echo "Deleting intermediate model files"
|
||||
rm -rfv ./${mid}-bf16-gguf
|
||||
fi
|
||||
|
||||
rm -rfv ./${mid}-f16-gguf && mkdir -p ./${mid}-f16-gguf
|
||||
${llama_quantize} ./${mid}-f32-gguf/model.gguf ./${mid}-f16-gguf/model.gguf f16
|
||||
get_size ./${mid}-f16-gguf > ./${mid}-f16-gguf-size.txt
|
||||
|
||||
if [[ "$no_ppl" == false ]]; then
|
||||
${llama_perplexity} $args_lcpp -m ./${mid}-f16-gguf/model.gguf -f "$raw_path" --chunks "${num_samples}" -c "${sequence_length}" 2>&1 | tee ./${mid}-f16-gguf-ppl.txt
|
||||
fi
|
||||
|
||||
if [[ "$no_perf" == false ]]; then
|
||||
${llama_batched_bench} $args_lcpp -m ./${mid}-f16-gguf/model.gguf -c 33768 -b 2048 -ub 2048 -npp 2048,4096,8192,16384,32768 -ntg 128 -npl 1 2>&1 | tee ./${mid}-f16-gguf-perf.txt
|
||||
fi
|
||||
|
||||
if [[ "$no_keep" == true ]]; then
|
||||
echo "Deleting intermediate model files"
|
||||
rm -rfv ./${mid}-f16-gguf
|
||||
fi
|
||||
|
||||
rm -rfv ./${mid}-q8-gguf && mkdir -p ./${mid}-q8-gguf
|
||||
${llama_quantize} ./${mid}-f32-gguf/model.gguf ./${mid}-q8-gguf/model.gguf q8_0
|
||||
get_size ./${mid}-q8-gguf > ./${mid}-q8-gguf-size.txt
|
||||
|
||||
if [[ "$no_ppl" == false ]]; then
|
||||
${llama_perplexity} $args_lcpp -m ./${mid}-q8-gguf/model.gguf -f "$raw_path" --chunks "${num_samples}" -c "${sequence_length}" 2>&1 | tee ./${mid}-q8-gguf-ppl.txt
|
||||
fi
|
||||
|
||||
if [[ "$no_perf" == false ]]; then
|
||||
${llama_batched_bench} $args_lcpp -m ./${mid}-q8-gguf/model.gguf -c 33768 -b 2048 -ub 2048 -npp 2048,4096,8192,16384,32768 -ntg 128 -npl 1 2>&1 | tee ./${mid}-q8-gguf-perf.txt
|
||||
fi
|
||||
|
||||
if [[ "$no_keep" == true ]]; then
|
||||
echo "Deleting intermediate model files"
|
||||
rm -rfv ./${mid}-q8-gguf
|
||||
fi
|
||||
|
||||
#rm -rfv ./${mid}-q6-gguf && mkdir -p ./${mid}-q6-gguf
|
||||
#${llama_quantize} ./${mid}-f32-gguf/model.gguf ./${mid}-q6-gguf/model.gguf q6_k
|
||||
#get_size ./${mid}-q6-gguf > ./${mid}-q6-gguf-size.txt
|
||||
|
||||
#if [[ "$no_ppl" == false ]]; then
|
||||
# ${llama_perplexity} $args_lcpp -m ./${mid}-q6-gguf/model.gguf -f "$raw_path" --chunks "${num_samples}" -c "${sequence_length}" 2>&1 | tee ./${mid}-q6-gguf-ppl.txt
|
||||
#fi
|
||||
|
||||
#if [[ "$no_perf" == false ]]; then
|
||||
# ${llama_batched_bench} $args_lcpp -m ./${mid}-q6-gguf/model.gguf -c 33768 -b 2048 -ub 2048 -npp 2048,4096,8192,16384,32768 -ntg 128 -npl 1 2>&1 | tee ./${mid}-q6-gguf-perf.txt
|
||||
#fi
|
||||
|
||||
#if [[ "$no_keep" == true ]]; then
|
||||
# echo "Deleting intermediate model files"
|
||||
# rm -rfv ./${mid}-q6-gguf
|
||||
#fi
|
||||
|
||||
#rm -rfv ./${mid}-q5-gguf && mkdir -p ./${mid}-q5-gguf
|
||||
#${llama_quantize} ./${mid}-f32-gguf/model.gguf ./${mid}-q5-gguf/model.gguf q5_k_s
|
||||
#get_size ./${mid}-q5-gguf > ./${mid}-q5-gguf-size.txt
|
||||
|
||||
#if [[ "$no_ppl" == false ]]; then
|
||||
# ${llama_perplexity} $args_lcpp -m ./${mid}-q5-gguf/model.gguf -f "$raw_path" --chunks "${num_samples}" -c "${sequence_length}" 2>&1 | tee ./${mid}-q5-gguf-ppl.txt
|
||||
#fi
|
||||
|
||||
#if [[ "$no_perf" == false ]]; then
|
||||
# ${llama_batched_bench} $args_lcpp -m ./${mid}-q5-gguf/model.gguf -c 33768 -b 2048 -ub 2048 -npp 2048,4096,8192,16384,32768 -ntg 128 -npl 1 2>&1 | tee ./${mid}-q5-gguf-perf.txt
|
||||
#fi
|
||||
|
||||
#if [[ "$no_keep" == true ]]; then
|
||||
# echo "Deleting intermediate model files"
|
||||
# rm -rfv ./${mid}-q5-gguf
|
||||
#fi
|
||||
|
||||
rm -rfv ./${mid}-q4p-gguf && mkdir -p ./${mid}-q4p-gguf
|
||||
${llama_quantize} ./${mid}-f32-gguf/model.gguf ./${mid}-q4p-gguf/model.gguf q4_k
|
||||
get_size ./${mid}-q4p-gguf > ./${mid}-q4p-gguf-size.txt
|
||||
|
||||
if [[ "$no_ppl" == false ]]; then
|
||||
${llama_perplexity} $args_lcpp -m ./${mid}-q4p-gguf/model.gguf -f "$raw_path" --chunks "${num_samples}" -c "${sequence_length}" 2>&1 | tee ./${mid}-q4p-gguf-ppl.txt
|
||||
fi
|
||||
|
||||
if [[ "$no_perf" == false ]]; then
|
||||
${llama_batched_bench} $args_lcpp -m ./${mid}-q4p-gguf/model.gguf -c 33768 -b 2048 -ub 2048 -npp 2048,4096,8192,16384,32768 -ntg 128 -npl 1 2>&1 | tee ./${mid}-q4p-gguf-perf.txt
|
||||
fi
|
||||
|
||||
if [[ "$no_keep" == true ]]; then
|
||||
echo "Deleting intermediate model files"
|
||||
rm -rfv ./${mid}-q4p-gguf
|
||||
fi
|
||||
|
||||
# note: we use --pure here to match the MLX quantization of the embeddings
|
||||
rm -rfv ./${mid}-q4-gguf && mkdir -p ./${mid}-q4-gguf
|
||||
${llama_quantize} --pure ./${mid}-f32-gguf/model.gguf ./${mid}-q4-gguf/model.gguf q4_0
|
||||
get_size ./${mid}-q4-gguf > ./${mid}-q4-gguf-size.txt
|
||||
|
||||
if [[ "$no_ppl" == false ]]; then
|
||||
${llama_perplexity} $args_lcpp -m ./${mid}-q4-gguf/model.gguf -f "$raw_path" --chunks "${num_samples}" -c "${sequence_length}" 2>&1 | tee ./${mid}-q4-gguf-ppl.txt
|
||||
fi
|
||||
|
||||
if [[ "$no_perf" == false ]]; then
|
||||
${llama_batched_bench} $args_lcpp -m ./${mid}-q4-gguf/model.gguf -c 33768 -b 2048 -ub 2048 -npp 2048,4096,8192,16384,32768 -ntg 128 -npl 1 2>&1 | tee ./${mid}-q4-gguf-perf.txt
|
||||
fi
|
||||
|
||||
if [[ "$no_keep" == true ]]; then
|
||||
echo "Deleting intermediate model files"
|
||||
rm -rfv ./${mid}-q4-gguf
|
||||
fi
|
||||
|
||||
rm -rfv ./${mid}-q3-gguf && mkdir -p ./${mid}-q3-gguf
|
||||
${llama_quantize} ./${mid}-f32-gguf/model.gguf ./${mid}-q3-gguf/model.gguf q3_k_s
|
||||
get_size ./${mid}-q3-gguf > ./${mid}-q3-gguf-size.txt
|
||||
|
||||
if [[ "$no_ppl" == false ]]; then
|
||||
${llama_perplexity} $args_lcpp -m ./${mid}-q3-gguf/model.gguf -f "$raw_path" --chunks "${num_samples}" -c "${sequence_length}" 2>&1 | tee ./${mid}-q3-gguf-ppl.txt
|
||||
fi
|
||||
|
||||
if [[ "$no_perf" == false ]]; then
|
||||
${llama_batched_bench} $args_lcpp -m ./${mid}-q3-gguf/model.gguf -c 33768 -b 2048 -ub 2048 -npp 2048,4096,8192,16384,32768 -ntg 128 -npl 1 2>&1 | tee ./${mid}-q3-gguf-perf.txt
|
||||
fi
|
||||
|
||||
if [[ "$no_keep" == true ]]; then
|
||||
echo "Deleting intermediate model files"
|
||||
rm -rfv ./${mid}-q3-gguf
|
||||
fi
|
||||
|
||||
# remove the f32 model at the end
|
||||
if [[ "$no_keep" == true ]]; then
|
||||
rm -rfv ./${mid}-f32-gguf
|
||||
fi
|
||||
fi
|
||||
|
||||
set +x
|
||||
|
||||
# analyze results
|
||||
|
||||
#types=("f32" "bf16" "f16" "q8" "q6" "q5" "q4p" "q4" "q3")
|
||||
types=("f32" "bf16" "f16" "q8" "q4p" "q4" "q3")
|
||||
|
||||
mlx_ppls=()
|
||||
mlx_ppl_deltas=()
|
||||
mlx_sizes=()
|
||||
mlx_pps2k=()
|
||||
mlx_tgs2k=()
|
||||
mlx_pps4k=()
|
||||
mlx_tgs4k=()
|
||||
mlx_pps8k=()
|
||||
mlx_tgs8k=()
|
||||
mlx_pps16k=()
|
||||
mlx_tgs16k=()
|
||||
mlx_pps32k=()
|
||||
mlx_tgs32k=()
|
||||
|
||||
# mlx:
|
||||
for t in ${types[*]}; do
|
||||
cur_ppl="N/A"
|
||||
cur_ppl_delta="N/A"
|
||||
cur_size="N/A"
|
||||
cur_pp2k="N/A"
|
||||
cur_tg2k="N/A"
|
||||
cur_pp4k="N/A"
|
||||
cur_tg4k="N/A"
|
||||
cur_pp8k="N/A"
|
||||
cur_tg8k="N/A"
|
||||
cur_pp16k="N/A"
|
||||
cur_tg16k="N/A"
|
||||
cur_pp32k="N/A"
|
||||
cur_tg32k="N/A"
|
||||
|
||||
if [[ -f ./${mid}-${t}-mlx-ppl.txt ]]; then
|
||||
cur_ppl=$(grep -o 'Perplexity: [0-9.]*' ./${mid}-${t}-mlx-ppl.txt | cut -d' ' -f2)
|
||||
cur_ppl_delta=$(grep -o 'Perplexity: [0-9.]* ± [0-9.]*' ./${mid}-${t}-mlx-ppl.txt | cut -d' ' -f4)
|
||||
cur_size=$(cat ./${mid}-${t}-mlx-size.txt)
|
||||
cur_pp2k=$(grep -o 'Averages.*prompt_tps=[0-9.]*' ./${mid}-${t}-mlx-perf-2048.txt | cut -d'=' -f2)
|
||||
cur_tg2k=$(grep -o 'Averages.*generation_tps=[0-9.]*' ./${mid}-${t}-mlx-perf-2048.txt | cut -d'=' -f3)
|
||||
cur_pp4k=$(grep -o 'Averages.*prompt_tps=[0-9.]*' ./${mid}-${t}-mlx-perf-4096.txt | cut -d'=' -f2)
|
||||
cur_tg4k=$(grep -o 'Averages.*generation_tps=[0-9.]*' ./${mid}-${t}-mlx-perf-4096.txt | cut -d'=' -f3)
|
||||
cur_pp8k=$(grep -o 'Averages.*prompt_tps=[0-9.]*' ./${mid}-${t}-mlx-perf-8192.txt | cut -d'=' -f2)
|
||||
cur_tg8k=$(grep -o 'Averages.*generation_tps=[0-9.]*' ./${mid}-${t}-mlx-perf-8192.txt | cut -d'=' -f3)
|
||||
cur_pp16k=$(grep -o 'Averages.*prompt_tps=[0-9.]*' ./${mid}-${t}-mlx-perf-16384.txt | cut -d'=' -f2)
|
||||
cur_tg16k=$(grep -o 'Averages.*generation_tps=[0-9.]*' ./${mid}-${t}-mlx-perf-16384.txt | cut -d'=' -f3)
|
||||
cur_pp32k=$(grep -o 'Averages.*prompt_tps=[0-9.]*' ./${mid}-${t}-mlx-perf-32768.txt | cut -d'=' -f2)
|
||||
cur_tg32k=$(grep -o 'Averages.*generation_tps=[0-9.]*' ./${mid}-${t}-mlx-perf-32768.txt | cut -d'=' -f3)
|
||||
fi
|
||||
|
||||
mlx_ppls+=("${cur_ppl}")
|
||||
mlx_ppl_deltas+=("${cur_ppl_delta}")
|
||||
mlx_sizes+=("${cur_size}")
|
||||
mlx_pps2k+=("${cur_pp2k}")
|
||||
mlx_tgs2k+=("${cur_tg2k}")
|
||||
mlx_pps4k+=("${cur_pp4k}")
|
||||
mlx_tgs4k+=("${cur_tg4k}")
|
||||
mlx_pps8k+=("${cur_pp8k}")
|
||||
mlx_tgs8k+=("${cur_tg8k}")
|
||||
mlx_pps16k+=("${cur_pp16k}")
|
||||
mlx_tgs16k+=("${cur_tg16k}")
|
||||
mlx_pps32k+=("${cur_pp32k}")
|
||||
mlx_tgs32k+=("${cur_tg32k}")
|
||||
done
|
||||
|
||||
gguf_ppls=()
|
||||
gguf_ppl_deltas=()
|
||||
gguf_sizes=()
|
||||
gguf_pps2k=()
|
||||
gguf_tgs2k=()
|
||||
gguf_pps4k=()
|
||||
gguf_tgs4k=()
|
||||
gguf_pps8k=()
|
||||
gguf_tgs8k=()
|
||||
gguf_pps16k=()
|
||||
gguf_tgs16k=()
|
||||
gguf_pps32k=()
|
||||
gguf_tgs32k=()
|
||||
|
||||
# gguf:
|
||||
for t in ${types[*]}; do
|
||||
cur_ppl="N/A"
|
||||
cur_ppl_delta="N/A"
|
||||
cur_size="N/A"
|
||||
cur_pp2k="N/A"
|
||||
cur_tg2k="N/A"
|
||||
cur_pp4k="N/A"
|
||||
cur_tg4k="N/A"
|
||||
cur_pp8k="N/A"
|
||||
cur_tg8k="N/A"
|
||||
cur_pp16k="N/A"
|
||||
cur_tg16k="N/A"
|
||||
cur_pp32k="N/A"
|
||||
cur_tg32k="N/A"
|
||||
|
||||
if [[ -f ./${mid}-${t}-gguf-ppl.txt ]]; then
|
||||
cur_ppl=$(grep -o 'Final estimate: PPL = [0-9.]*' ./${mid}-${t}-gguf-ppl.txt | sed -e "s/.*Final//" | cut -d' ' -f5)
|
||||
cur_ppl_delta=$(grep -o 'Final estimate: PPL = [0-9.]* +/- [0-9.]*' ./${mid}-${t}-gguf-ppl.txt | sed -e "s/.*Final//" | cut -d' ' -f7)
|
||||
cur_size=$(cat ./${mid}-${t}-gguf-size.txt)
|
||||
cur_pp2k=$(grep -o '| 2048 |.*' ./${mid}-${t}-gguf-perf.txt | awk '{print $12}')
|
||||
cur_tg2k=$(grep -o '| 2048 |.*' ./${mid}-${t}-gguf-perf.txt | awk '{print $16}')
|
||||
cur_pp4k=$(grep -o '| 4096 |.*' ./${mid}-${t}-gguf-perf.txt | awk '{print $12}')
|
||||
cur_tg4k=$(grep -o '| 4096 |.*' ./${mid}-${t}-gguf-perf.txt | awk '{print $16}')
|
||||
cur_pp8k=$(grep -o '| 8192 |.*' ./${mid}-${t}-gguf-perf.txt | awk '{print $12}')
|
||||
cur_tg8k=$(grep -o '| 8192 |.*' ./${mid}-${t}-gguf-perf.txt | awk '{print $16}')
|
||||
cur_pp16k=$(grep -o '| 16384 |.*' ./${mid}-${t}-gguf-perf.txt | awk '{print $12}')
|
||||
cur_tg16k=$(grep -o '| 16384 |.*' ./${mid}-${t}-gguf-perf.txt | awk '{print $16}')
|
||||
cur_pp32k=$(grep -o '| 32768 |.*' ./${mid}-${t}-gguf-perf.txt | awk '{print $12}')
|
||||
cur_tg32k=$(grep -o '| 32768 |.*' ./${mid}-${t}-gguf-perf.txt | awk '{print $16}')
|
||||
fi
|
||||
|
||||
gguf_ppls+=("${cur_ppl}")
|
||||
gguf_ppl_deltas+=("${cur_ppl_delta}")
|
||||
gguf_sizes+=("${cur_size}")
|
||||
gguf_pps2k+=("${cur_pp2k}")
|
||||
gguf_tgs2k+=("${cur_tg2k}")
|
||||
gguf_pps4k+=("${cur_pp4k}")
|
||||
gguf_tgs4k+=("${cur_tg4k}")
|
||||
gguf_pps8k+=("${cur_pp8k}")
|
||||
gguf_tgs8k+=("${cur_tg8k}")
|
||||
gguf_pps16k+=("${cur_pp16k}")
|
||||
gguf_tgs16k+=("${cur_tg16k}")
|
||||
gguf_pps32k+=("${cur_pp32k}")
|
||||
gguf_tgs32k+=("${cur_tg32k}")
|
||||
done
|
||||
|
||||
res="${mid}-results.txt"
|
||||
echo "Results for ${org}/${mid} saved to ${res}"
|
||||
|
||||
printf "\n" | tee ${res}
|
||||
printf "Model ID: ${org}/${mid}\n" | tee -a ${res}
|
||||
#printf "Samples: ${num_samples}\n" | tee -a ${res}
|
||||
#printf "Sequence Length: ${sequence_length}\n" | tee -a ${res}
|
||||
printf "\n" | tee -a ${res}
|
||||
printf "| Type | MLX PPL | GGUF PPL | MLX Size | GGUF Size | MLX PP 2K | GGUF PP 2K | MLX TG 2K | GGUF TG 2K | MLX PP 4K | GGUF PP 4K | MLX TG 4K | GGUF TG 4K | MLX PP 8K | GGUF PP 8K | MLX TG 8K | GGUF TG 8K | MLX PP 16K | GGUF PP 16K | MLX TG 16K | GGUF TG 16K | MLX PP 32K | GGUF PP 32K | MLX TG 32K | GGUF TG 32K |\n" | tee -a ${res}
|
||||
printf "|-------|---------------------|------------------------|----------|-----------| ---------- | ----------- | ---------- | ----------- | ---------- | ----------- | ---------- | ----------- | ---------- | ----------- | ---------- | ----------- | ---------- | ----------- | ---------- | ----------- | ---------- | ----------- | ---------- | ----------- |\n" | tee -a ${res}
|
||||
|
||||
for i in "${!types[@]}"; do
|
||||
printf "| %-5s | %10s ± %6s | %10s ± %9s | %8s | %9s | %10s | %11s | %10s | %11s | %10s | %11s | %10s | %11s | %10s | %11s | %10s | %11s | %10s | %11s | %10s | %11s | %10s | %11s | %10s | %11s |\n" \
|
||||
"${types[i]}" \
|
||||
"${mlx_ppls[i]}" \
|
||||
"${mlx_ppl_deltas[i]}" \
|
||||
"${gguf_ppls[i]}" \
|
||||
"${gguf_ppl_deltas[i]}" \
|
||||
"${mlx_sizes[i]}" \
|
||||
"${gguf_sizes[i]}" \
|
||||
"${mlx_pps2k[i]}" \
|
||||
"${gguf_pps2k[i]}" \
|
||||
"${mlx_tgs2k[i]}" \
|
||||
"${gguf_tgs2k[i]}" \
|
||||
"${mlx_pps4k[i]}" \
|
||||
"${gguf_pps4k[i]}" \
|
||||
"${mlx_tgs4k[i]}" \
|
||||
"${gguf_tgs4k[i]}" \
|
||||
"${mlx_pps8k[i]}" \
|
||||
"${gguf_pps8k[i]}" \
|
||||
"${mlx_tgs8k[i]}" \
|
||||
"${gguf_tgs8k[i]}" \
|
||||
"${mlx_pps16k[i]}" \
|
||||
"${gguf_pps16k[i]}" \
|
||||
"${mlx_tgs16k[i]}" \
|
||||
"${gguf_tgs16k[i]}" \
|
||||
"${mlx_pps32k[i]}" \
|
||||
"${gguf_pps32k[i]}" \
|
||||
"${mlx_tgs32k[i]}" \
|
||||
"${gguf_tgs32k[i]}" | tee -a ${res}
|
||||
done
|
||||
}
|
||||
|
||||
eval_model "meta-llama" "Llama-3.2-1B"
|
||||
eval_model "meta-llama" "Llama-3.2-3B"
|
||||
eval_model "meta-llama" "Llama-3.1-8B"
|
||||
|
||||
eval_model "google" "gemma-3-270m"
|
||||
eval_model "google" "gemma-3-1b-pt"
|
||||
#eval_model "google" "gemma-3-4b-pt"
|
||||
|
||||
# the mlx-ppl.y script does not work with these models - not sure why
|
||||
#eval_model "google" "gemma-3n-E2B"
|
||||
#eval_model "google" "gemma-3n-E4B"
|
||||
|
||||
eval_model "Qwen" "Qwen3-0.6B-Base"
|
||||
eval_model "Qwen" "Qwen3-1.7B-Base"
|
||||
eval_model "Qwen" "Qwen3-4B-Base"
|
||||
eval_model "Qwen" "Qwen3-8B-Base"
|
||||
eval_model "Qwen" "Qwen3-30B-A3B-Base"
|
||||
120
examples/compare-mlx/inspect_model.py
Normal file
120
examples/compare-mlx/inspect_model.py
Normal file
@@ -0,0 +1,120 @@
|
||||
#!/usr/bin/env python3
|
||||
# generated by Claude
|
||||
"""
|
||||
Script to inspect SafeTensors model files and print tensor information.
|
||||
"""
|
||||
|
||||
import json
|
||||
from safetensors import safe_open
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
def inspect_safetensors_model(model_dir="."):
|
||||
"""Inspect all SafeTensors files in the model directory."""
|
||||
|
||||
# First, let's read the index file to see the file structure
|
||||
index_file = Path(model_dir) / "model.safetensors.index.json"
|
||||
|
||||
if index_file.exists():
|
||||
with open(index_file, 'r') as f:
|
||||
index_data = json.load(f)
|
||||
|
||||
print("=== Model Structure ===")
|
||||
print(f"Total parameters: {index_data.get('metadata', {}).get('total_size', 'Unknown')}")
|
||||
print()
|
||||
|
||||
# Get all safetensor files
|
||||
safetensor_files = set(index_data.get('weight_map', {}).values())
|
||||
else:
|
||||
# If no index file, look for safetensor files directly
|
||||
safetensor_files = [f for f in os.listdir(model_dir) if f.endswith('.safetensors')]
|
||||
|
||||
# Sort files for consistent output
|
||||
safetensor_files = sorted(safetensor_files)
|
||||
|
||||
print("=== Tensor Information ===")
|
||||
print(f"{'Tensor Name':<50} {'Shape':<25} {'Data Type':<15} {'File'}")
|
||||
print("-" * 110)
|
||||
|
||||
total_tensors = 0
|
||||
|
||||
for filename in safetensor_files:
|
||||
filepath = Path(model_dir) / filename
|
||||
if not filepath.exists():
|
||||
continue
|
||||
|
||||
print(f"\n--- {filename} ---")
|
||||
|
||||
# Open and inspect the safetensor file
|
||||
with safe_open(filepath, framework="pt") as f: # Use PyTorch framework for better dtype support
|
||||
tensor_names = f.keys()
|
||||
|
||||
for tensor_name in sorted(tensor_names):
|
||||
# Get tensor metadata without loading the full tensor
|
||||
tensor_slice = f.get_slice(tensor_name)
|
||||
shape = tensor_slice.get_shape()
|
||||
dtype = tensor_slice.get_dtype()
|
||||
|
||||
shape_str = str(tuple(shape))
|
||||
dtype_str = str(dtype)
|
||||
|
||||
print(f"{tensor_name:<50} {shape_str:<25} {dtype_str:<15} {filename}")
|
||||
total_tensors += 1
|
||||
|
||||
print(f"\nTotal tensors found: {total_tensors}")
|
||||
|
||||
def main():
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(description="Inspect SafeTensors model files")
|
||||
parser.add_argument("--model-dir", "-d", default=".",
|
||||
help="Directory containing the model files (default: current directory)")
|
||||
parser.add_argument("--summary", "-s", action="store_true",
|
||||
help="Show only summary statistics")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.summary:
|
||||
print_summary_only(args.model_dir)
|
||||
else:
|
||||
inspect_safetensors_model(args.model_dir)
|
||||
|
||||
def print_summary_only(model_dir="."):
|
||||
"""Print only summary statistics."""
|
||||
safetensor_files = [f for f in os.listdir(model_dir) if f.endswith('.safetensors')]
|
||||
|
||||
total_tensors = 0
|
||||
dtype_counts = {}
|
||||
total_params = 0
|
||||
|
||||
for filename in sorted(safetensor_files):
|
||||
filepath = Path(model_dir) / filename
|
||||
if not filepath.exists():
|
||||
continue
|
||||
|
||||
with safe_open(filepath, framework="pt") as f: # Use PyTorch framework
|
||||
for tensor_name in f.keys():
|
||||
tensor_slice = f.get_slice(tensor_name)
|
||||
shape = tensor_slice.get_shape()
|
||||
dtype = tensor_slice.get_dtype()
|
||||
|
||||
total_tensors += 1
|
||||
|
||||
dtype_str = str(dtype)
|
||||
dtype_counts[dtype_str] = dtype_counts.get(dtype_str, 0) + 1
|
||||
|
||||
# Calculate parameter count
|
||||
param_count = 1
|
||||
for dim in shape:
|
||||
param_count *= dim
|
||||
total_params += param_count
|
||||
|
||||
print("=== Model Summary ===")
|
||||
print(f"Total tensors: {total_tensors}")
|
||||
print(f"Total parameters: {total_params:,}")
|
||||
print(f"Data type distribution:")
|
||||
for dtype, count in sorted(dtype_counts.items()):
|
||||
print(f" {dtype}: {count} tensors")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
305
examples/compare-mlx/mlx-ppl.py
Normal file
305
examples/compare-mlx/mlx-ppl.py
Normal file
@@ -0,0 +1,305 @@
|
||||
# Copyright © 2025 Apple Inc.
|
||||
# modified: https://github.com/ml-explore/mlx-lm/blob/60320dc2347d45dc3ca08be90e5255fb9424bb09/mlx_lm/perplexity.py
|
||||
"""
|
||||
Evaluate perplexity (PPL) of pre-trained MLX models in the same way as llama.cpp's llama-perplexity.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import math
|
||||
import os
|
||||
import time
|
||||
import types
|
||||
|
||||
import mlx.core as mx
|
||||
import mlx.nn as nn
|
||||
import numpy as np
|
||||
|
||||
from mlx_lm.tuner.datasets import load_dataset
|
||||
from mlx_lm.tuner.utils import get_total_parameters
|
||||
from mlx_lm.utils import load
|
||||
|
||||
|
||||
def load_data(
|
||||
tokenizer,
|
||||
data_path: str,
|
||||
num_samples: int,
|
||||
sequence_length: int,
|
||||
):
|
||||
"""
|
||||
Load a Hugging‑Face dataset (via mlx‑lm’s dataset utilities) and convert it
|
||||
into a token tensor of shape (N, sequence_length).
|
||||
"""
|
||||
args = types.SimpleNamespace(
|
||||
hf_dataset={
|
||||
"path": data_path,
|
||||
"train_split": "train",
|
||||
"valid_split": "train[:1]",
|
||||
},
|
||||
train=True,
|
||||
test=False,
|
||||
)
|
||||
dataset = load_dataset(args, tokenizer)[0]
|
||||
|
||||
perm = np.random.permutation(len(dataset)).tolist()
|
||||
|
||||
num_tokens = sequence_length * num_samples if num_samples > 0 else float("inf")
|
||||
data = []
|
||||
i = 0
|
||||
while len(data) < num_tokens:
|
||||
tokens, _ = dataset.process(dataset[perm[i]])
|
||||
i += 1
|
||||
data.extend(tokens)
|
||||
|
||||
# Convert to MX array, truncate to a multiple of `sequence_length`
|
||||
data = mx.array(data[: (len(data) // sequence_length) * sequence_length])
|
||||
data = data.reshape(-1, sequence_length)
|
||||
if num_samples > 0:
|
||||
data = data[:num_samples]
|
||||
return data
|
||||
|
||||
|
||||
def _tokenize_text(tokenizer, text: str):
|
||||
"""
|
||||
Helper that tokenises a string using the MLX‑LM tokenizer.
|
||||
Supports the common `encode` method or a callable tokenizer.
|
||||
"""
|
||||
# Most mlx‑lm tokenizers expose an `encode` method.
|
||||
if hasattr(tokenizer, "encode"):
|
||||
tokens = tokenizer.encode(text)
|
||||
elif callable(tokenizer):
|
||||
tokens = tokenizer(text)
|
||||
else:
|
||||
raise AttributeError(
|
||||
"Tokenizer does not have an `encode` method nor is it callable."
|
||||
)
|
||||
# Normalise the output to a Python list of ints.
|
||||
if isinstance(tokens, mx.array):
|
||||
tokens = tokens.tolist()
|
||||
return tokens
|
||||
|
||||
|
||||
# load a raw text file and tokenize it
|
||||
# generated with gpt-oss-120b
|
||||
def load_raw_data(
|
||||
tokenizer,
|
||||
raw_path: str,
|
||||
num_samples: int,
|
||||
sequence_length: int,
|
||||
):
|
||||
"""
|
||||
Load a raw text file, tokenize it, and reshape into a (N, sequence_length)
|
||||
tensor suitable for perplexity evaluation.
|
||||
"""
|
||||
if not os.path.isfile(raw_path):
|
||||
raise FileNotFoundError(f"Raw text file not found: {raw_path}")
|
||||
|
||||
# Read the whole file (UTF‑8). Users can supply any plain‑text corpus.
|
||||
with open(raw_path, "r", encoding="utf-8") as fp:
|
||||
raw_text = fp.read()
|
||||
|
||||
# Tokenise the complete text.
|
||||
token_list = _tokenize_text(tokenizer, raw_text)
|
||||
|
||||
if len(token_list) == 0:
|
||||
raise ValueError("Tokenisation of the raw file produced no tokens.")
|
||||
|
||||
# Convert to MX array (int32 is sufficient for token IDs).
|
||||
token_array = mx.array(token_list, dtype=mx.int32)
|
||||
|
||||
# Trim to a length that is an exact multiple of `sequence_length`.
|
||||
total_len = (token_array.shape[0] // sequence_length) * sequence_length
|
||||
token_array = token_array[:total_len]
|
||||
|
||||
# Reshape into (num_sequences, sequence_length)
|
||||
data = token_array.reshape(-1, sequence_length)
|
||||
|
||||
if num_samples > 0:
|
||||
data = data[:num_samples]
|
||||
|
||||
#print(f"First 4 samples of the data:")
|
||||
#for j in range(min(4, len(data))):
|
||||
# print(f" Sample {j}: {tokenizer.decode(data[j].tolist())}\n\n-------------------\n\n")
|
||||
|
||||
return data
|
||||
|
||||
|
||||
def eval_ppl(model, tokenizer, data, batch_size=8):
|
||||
"""
|
||||
Evaluate perplexity on a dataset with standard error calculation.
|
||||
|
||||
Args:
|
||||
model: The model to evaluate.
|
||||
data: Tokenized data tensor (shape: N x L).
|
||||
batch_size: Batch size for evaluation.
|
||||
|
||||
Returns:
|
||||
tuple: (perplexity, standard_error_of_perplexity)
|
||||
"""
|
||||
all_losses = []
|
||||
|
||||
num_batches = (len(data) + batch_size - 1) // batch_size
|
||||
for i, s in enumerate(range(0, len(data), batch_size)):
|
||||
batch = data[s : s + batch_size]
|
||||
|
||||
# Set the first token of all samples to the BOS token
|
||||
if tokenizer.bos_token_id:
|
||||
batch[:, 0] = tokenizer.bos_token_id
|
||||
|
||||
# compute cross entropy only with the second half of the sequence to match llama.cpp behavior
|
||||
# ref: https://github.com/ggml-org/llama.cpp/blob/696fccf354e9dbdfbce135bc40b44c9dcc64dda9/tools/perplexity/perplexity.cpp#L527-L541
|
||||
#
|
||||
#start = 0
|
||||
start = batch.shape[1] // 2
|
||||
|
||||
# Forward pass: get logits for all tokens except last
|
||||
logits = model(batch[:, :-1]).astype(mx.float32)
|
||||
|
||||
# Calculate cross‑entropy loss with next tokens
|
||||
#losses = nn.losses.cross_entropy(logits, batch[:, 1:], reduction="none")
|
||||
losses = nn.losses.cross_entropy(logits[:, start:, :], batch[:, start+1:], reduction="none")
|
||||
|
||||
mx.eval(losses)
|
||||
# Store individual token losses
|
||||
all_losses.append(losses.flatten())
|
||||
|
||||
# Progress indicator
|
||||
if (i + 1) % 1 == 0 or (i + 1) == num_batches:
|
||||
print(f" Processed {i + 1}/{num_batches} batches...", end="\r")
|
||||
|
||||
print() # New line after progress
|
||||
|
||||
# Concatenate all losses into a single array
|
||||
all_losses = mx.concatenate(all_losses)
|
||||
|
||||
# Calculate mean loss and perplexity
|
||||
mean_loss = all_losses.mean().item()
|
||||
ppl = math.exp(mean_loss)
|
||||
|
||||
# Calculate standard error
|
||||
std_dev = mx.sqrt(mx.var(all_losses, ddof=1)).item()
|
||||
num_tokens = all_losses.size
|
||||
standard_error = std_dev / math.sqrt(num_tokens)
|
||||
|
||||
# Delta approximation for standard error of perplexity
|
||||
standard_error_ppl = ppl * standard_error
|
||||
|
||||
return ppl, standard_error_ppl
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Evaluate perplexity of MLX models")
|
||||
parser.add_argument(
|
||||
"--model",
|
||||
type=str,
|
||||
required=True,
|
||||
help="Path to model or Hugging Face model ID",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--batch-size", type=int, default=8, help="Batch size for evaluation"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--sequence-length",
|
||||
type=int,
|
||||
default=512,
|
||||
help="Sequence length for evaluation",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num-samples",
|
||||
type=int,
|
||||
default=256,
|
||||
help="Number of samples to use (-1 for all available)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--data-path",
|
||||
type=str,
|
||||
default="allenai/tulu-3-sft-mixture",
|
||||
help=(
|
||||
"A Hugging Face dataset compatible with mlx‑lm. "
|
||||
"Ignored if --raw-path is provided."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--raw-path",
|
||||
type=str,
|
||||
default=None,
|
||||
help=(
|
||||
"Path to a local raw‑text file to use for evaluation. "
|
||||
"If specified, the script skips loading a HF dataset."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--seed", type=int, default=123, help="Random seed for data sampling"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Set random seed (used for HF dataset shuffling)
|
||||
mx.random.seed(args.seed)
|
||||
|
||||
# Load model
|
||||
print(f"Loading model from {args.model}...")
|
||||
model, tokenizer = load(args.model)
|
||||
|
||||
# Count parameters
|
||||
total_params = get_total_parameters(model)
|
||||
print(f"Model loaded: {total_params/1e6:.1f}M parameters")
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Load evaluation data (raw file vs. HF dataset)
|
||||
# ----------------------------------------------------------------------
|
||||
print("\nLoading dataset...")
|
||||
print(f" Sequence length: {args.sequence_length}")
|
||||
|
||||
if args.raw_path:
|
||||
print(f" Using raw text file: {args.raw_path}")
|
||||
data = load_raw_data(
|
||||
tokenizer,
|
||||
raw_path=args.raw_path,
|
||||
num_samples=args.num_samples,
|
||||
sequence_length=args.sequence_length,
|
||||
)
|
||||
else:
|
||||
print(f" Using HF dataset: {args.data_path}")
|
||||
data = load_data(
|
||||
tokenizer,
|
||||
data_path=args.data_path,
|
||||
num_samples=args.num_samples,
|
||||
sequence_length=args.sequence_length,
|
||||
)
|
||||
|
||||
print(f" Loaded {len(data)} samples")
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Evaluate perplexity
|
||||
# ----------------------------------------------------------------------
|
||||
print(f"\nEvaluating perplexity with batch size {args.batch_size}...")
|
||||
start_time = time.time()
|
||||
|
||||
ppl, se = eval_ppl(model, tokenizer, data, batch_size=args.batch_size)
|
||||
|
||||
eval_time = time.time() - start_time
|
||||
tokens_evaluated = data.shape[0] * (data.shape[1] - 1) # B * (L - 1)
|
||||
|
||||
# Print results
|
||||
print("\n" + "=" * 60)
|
||||
print("EVALUATION RESULTS")
|
||||
print("=" * 60)
|
||||
print(f"Model: {args.model}")
|
||||
print(f"Perplexity: {ppl:.3f} ± {se:.3f}")
|
||||
print(f"Evaluation time: {eval_time:.2f} seconds")
|
||||
print(f"Peak memory: {mx.get_peak_memory() / 1e9:.2f} GB")
|
||||
print(f"Tokens per second: {tokens_evaluated / eval_time:.0f}")
|
||||
|
||||
# Additional statistics
|
||||
print(f"\nDataset statistics:")
|
||||
print(f" Total samples: {len(data)}")
|
||||
print(f" Total tokens: {data.size}")
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Done
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -50,6 +50,12 @@ int main(int argc, char ** argv) {
|
||||
const int N = 5; // n-gram size
|
||||
const int G = 15; // max verification n-grams
|
||||
|
||||
// lookahead requires W + G + 1 sequences for parallel Jacobi decoding
|
||||
params.n_parallel = W + G + 1;
|
||||
|
||||
// unified KV cache is required for coupled sequences in batch splitting
|
||||
params.kv_unified = true;
|
||||
|
||||
// init llama.cpp
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
@@ -115,7 +121,7 @@ int main(int argc, char ** argv) {
|
||||
// seq_id == 0 : the current input token
|
||||
// seq_id [1, W] : tokens from the past N - 1 Jacobi iterations
|
||||
// seq_id [W + 1, W + G] : verification n-grams
|
||||
llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1);
|
||||
llama_batch batch = llama_batch_init(llama_n_ctx(ctx), 0, W + G + 1);
|
||||
|
||||
// target model sampling context
|
||||
struct common_sampler * smpl = common_sampler_init(model, params.sampling);
|
||||
|
||||
@@ -106,7 +106,7 @@ int main(int argc, char ** argv){
|
||||
|
||||
std::vector<llama_token> draft;
|
||||
|
||||
llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, 1);
|
||||
llama_batch batch_tgt = llama_batch_init(llama_n_ctx(ctx), 0, 1);
|
||||
|
||||
const auto t_dec_start = ggml_time_us();
|
||||
|
||||
|
||||
@@ -222,6 +222,7 @@ if (GGML_SCHED_NO_REALLOC)
|
||||
endif()
|
||||
|
||||
add_library(ggml
|
||||
ggml-backend-dl.cpp
|
||||
ggml-backend-reg.cpp)
|
||||
add_library(ggml::ggml ALIAS ggml)
|
||||
|
||||
|
||||
48
ggml/src/ggml-backend-dl.cpp
Normal file
48
ggml/src/ggml-backend-dl.cpp
Normal file
@@ -0,0 +1,48 @@
|
||||
#include "ggml-backend-dl.h"
|
||||
|
||||
#ifdef _WIN32
|
||||
|
||||
dl_handle * dl_load_library(const fs::path & path) {
|
||||
// suppress error dialogs for missing DLLs
|
||||
DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
|
||||
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
|
||||
|
||||
HMODULE handle = LoadLibraryW(path.wstring().c_str());
|
||||
|
||||
SetErrorMode(old_mode);
|
||||
|
||||
return handle;
|
||||
}
|
||||
|
||||
void * dl_get_sym(dl_handle * handle, const char * name) {
|
||||
DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
|
||||
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
|
||||
|
||||
void * p = (void *) GetProcAddress(handle, name);
|
||||
|
||||
SetErrorMode(old_mode);
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
const char * dl_error() {
|
||||
return "";
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
dl_handle * dl_load_library(const fs::path & path) {
|
||||
dl_handle * handle = dlopen(path.string().c_str(), RTLD_NOW | RTLD_LOCAL);
|
||||
return handle;
|
||||
}
|
||||
|
||||
void * dl_get_sym(dl_handle * handle, const char * name) {
|
||||
return dlsym(handle, name);
|
||||
}
|
||||
|
||||
const char * dl_error() {
|
||||
const char *rslt = dlerror();
|
||||
return rslt != nullptr ? rslt : "";
|
||||
}
|
||||
|
||||
#endif
|
||||
45
ggml/src/ggml-backend-dl.h
Normal file
45
ggml/src/ggml-backend-dl.h
Normal file
@@ -0,0 +1,45 @@
|
||||
#pragma once
|
||||
|
||||
#ifdef _WIN32
|
||||
# define WIN32_LEAN_AND_MEAN
|
||||
# ifndef NOMINMAX
|
||||
# define NOMINMAX
|
||||
# endif
|
||||
# include <windows.h>
|
||||
# include <winevt.h>
|
||||
#else
|
||||
# include <dlfcn.h>
|
||||
# include <unistd.h>
|
||||
#endif
|
||||
#include <filesystem>
|
||||
|
||||
namespace fs = std::filesystem;
|
||||
|
||||
#ifdef _WIN32
|
||||
|
||||
using dl_handle = std::remove_pointer_t<HMODULE>;
|
||||
|
||||
struct dl_handle_deleter {
|
||||
void operator()(HMODULE handle) {
|
||||
FreeLibrary(handle);
|
||||
}
|
||||
};
|
||||
|
||||
#else
|
||||
|
||||
using dl_handle = void;
|
||||
|
||||
struct dl_handle_deleter {
|
||||
void operator()(void * handle) {
|
||||
dlclose(handle);
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
using dl_handle_ptr = std::unique_ptr<dl_handle, dl_handle_deleter>;
|
||||
|
||||
dl_handle * dl_load_library(const fs::path & path);
|
||||
void * dl_get_sym(dl_handle * handle, const char * name);
|
||||
const char * dl_error();
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml-backend-dl.h"
|
||||
#include "ggml-impl.h"
|
||||
#include <algorithm>
|
||||
#include <cstring>
|
||||
@@ -98,72 +99,6 @@ static std::string path_str(const fs::path & path) {
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef _WIN32
|
||||
|
||||
using dl_handle = std::remove_pointer_t<HMODULE>;
|
||||
|
||||
struct dl_handle_deleter {
|
||||
void operator()(HMODULE handle) {
|
||||
FreeLibrary(handle);
|
||||
}
|
||||
};
|
||||
|
||||
static dl_handle * dl_load_library(const fs::path & path) {
|
||||
// suppress error dialogs for missing DLLs
|
||||
DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
|
||||
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
|
||||
|
||||
HMODULE handle = LoadLibraryW(path.wstring().c_str());
|
||||
|
||||
SetErrorMode(old_mode);
|
||||
|
||||
return handle;
|
||||
}
|
||||
|
||||
static void * dl_get_sym(dl_handle * handle, const char * name) {
|
||||
DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
|
||||
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
|
||||
|
||||
void * p = (void *) GetProcAddress(handle, name);
|
||||
|
||||
SetErrorMode(old_mode);
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
static const char * dl_error() {
|
||||
return "";
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
using dl_handle = void;
|
||||
|
||||
struct dl_handle_deleter {
|
||||
void operator()(void * handle) {
|
||||
dlclose(handle);
|
||||
}
|
||||
};
|
||||
|
||||
static void * dl_load_library(const fs::path & path) {
|
||||
dl_handle * handle = dlopen(path.string().c_str(), RTLD_NOW | RTLD_LOCAL);
|
||||
|
||||
return handle;
|
||||
}
|
||||
|
||||
static void * dl_get_sym(dl_handle * handle, const char * name) {
|
||||
return dlsym(handle, name);
|
||||
}
|
||||
|
||||
static const char * dl_error() {
|
||||
const char *rslt = dlerror();
|
||||
return rslt != nullptr ? rslt : "";
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
using dl_handle_ptr = std::unique_ptr<dl_handle, dl_handle_deleter>;
|
||||
|
||||
struct ggml_backend_reg_entry {
|
||||
ggml_backend_reg_t reg;
|
||||
dl_handle_ptr handle;
|
||||
|
||||
@@ -1122,15 +1122,18 @@ struct ggml_tensor_extra_gpu {
|
||||
#endif
|
||||
|
||||
struct ggml_cuda_graph_node_properties {
|
||||
void * node_address;
|
||||
void * node_data;
|
||||
ggml_op node_op;
|
||||
enum ggml_type node_type;
|
||||
int32_t flags;
|
||||
int64_t ne[GGML_MAX_DIMS];
|
||||
size_t nb[GGML_MAX_DIMS];
|
||||
void * src_address[GGML_MAX_SRC];
|
||||
void * src_data[GGML_MAX_SRC];
|
||||
int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
|
||||
};
|
||||
|
||||
static_assert(std::is_trivial<ggml_cuda_graph_node_properties>::value, "ggml_cuda_graph_node_properties must be trivial");
|
||||
|
||||
struct ggml_cuda_graph {
|
||||
#ifdef USE_CUDA_GRAPH
|
||||
~ggml_cuda_graph() {
|
||||
@@ -1150,6 +1153,12 @@ struct ggml_cuda_graph {
|
||||
int number_consecutive_updates = 0;
|
||||
std::vector<ggml_cuda_graph_node_properties> props;
|
||||
|
||||
// these are extra tensors (inputs) that participate in the ggml graph but are not nodes
|
||||
// they properties also have to match in order to be able to safely reuse a CUDA graph
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/18583
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/19165
|
||||
std::vector<ggml_cuda_graph_node_properties> extra;
|
||||
|
||||
void record_update(bool use_graph, bool update_required) {
|
||||
if (use_graph && update_required) {
|
||||
number_consecutive_updates++;
|
||||
|
||||
@@ -310,8 +310,6 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
|
||||
}
|
||||
}
|
||||
|
||||
const bool V_is_K_view = V->view_src && (V->view_src == K || (V->view_src == K->view_src && V->view_offs == K->view_offs));
|
||||
|
||||
const int cc = ggml_cuda_info().devices[device].cc;
|
||||
|
||||
switch (K->ne[0]) {
|
||||
@@ -334,9 +332,6 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
|
||||
if (!gqa_opt_applies) {
|
||||
return BEST_FATTN_KERNEL_NONE;
|
||||
}
|
||||
if (!V_is_K_view) {
|
||||
return BEST_FATTN_KERNEL_NONE;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
return BEST_FATTN_KERNEL_NONE;
|
||||
|
||||
@@ -70,17 +70,18 @@
|
||||
#include <condition_variable>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <float.h>
|
||||
#include <cfloat>
|
||||
#include <initializer_list>
|
||||
#include <limits>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <stdarg.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <cstdarg>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <unordered_set>
|
||||
|
||||
static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
|
||||
|
||||
@@ -2916,22 +2917,27 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) {
|
||||
}
|
||||
|
||||
static void ggml_cuda_graph_node_set_properties(ggml_cuda_graph_node_properties * props, ggml_tensor * node) {
|
||||
props->node_address = node->data;
|
||||
memset(props, 0, sizeof(ggml_cuda_graph_node_properties));
|
||||
props->node_data = node->data;
|
||||
props->node_op = node->op;
|
||||
props->node_type = node->type;
|
||||
props->flags = node->flags;
|
||||
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
||||
props->ne[i] = node->ne[i];
|
||||
props->nb[i] = node->nb[i];
|
||||
}
|
||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||
props->src_address[i] = node->src[i] ? node->src[i]->data : nullptr;
|
||||
if (!node->src[i]) {
|
||||
continue;
|
||||
}
|
||||
|
||||
props->src_data[i] = node->src[i]->data;
|
||||
}
|
||||
memcpy(props->op_params, node->op_params, GGML_MAX_OP_PARAMS);
|
||||
}
|
||||
|
||||
static bool ggml_cuda_graph_node_properties_match(ggml_tensor * node, ggml_cuda_graph_node_properties * props) {
|
||||
if (node->data != props->node_address &&
|
||||
node->op != GGML_OP_VIEW) {
|
||||
if (node->data != props->node_data && node->op != GGML_OP_VIEW) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -2939,6 +2945,10 @@ static bool ggml_cuda_graph_node_properties_match(ggml_tensor * node, ggml_cuda_
|
||||
return false;
|
||||
}
|
||||
|
||||
if (node->type != props->node_type) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
||||
if (node->ne[i] != props->ne[i]) {
|
||||
return false;
|
||||
@@ -2948,12 +2958,18 @@ static bool ggml_cuda_graph_node_properties_match(ggml_tensor * node, ggml_cuda_
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||
if (node->src[i] &&
|
||||
node->src[i]->data != props->src_address[i] &&
|
||||
node->op != GGML_OP_VIEW
|
||||
) {
|
||||
return false;
|
||||
if (node->op != GGML_OP_VIEW) {
|
||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||
if (!node->src[i]) {
|
||||
if (props->src_data[i] != nullptr) {
|
||||
return false;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if (node->src[i]->data != props->src_data[i]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2974,7 +2990,6 @@ static const void * ggml_cuda_graph_get_key(ggml_cgraph * cgraph) {
|
||||
}
|
||||
|
||||
static bool ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph) {
|
||||
|
||||
bool res = false;
|
||||
|
||||
const void * graph_key = ggml_cuda_graph_get_key(cgraph);
|
||||
@@ -2985,15 +3000,20 @@ static bool ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx
|
||||
}
|
||||
|
||||
// Check if the graph size has changed
|
||||
if (graph->props.size() != (size_t)cgraph->n_nodes + cgraph->n_leafs) {
|
||||
if (graph->props.size() != (size_t)cgraph->n_nodes) {
|
||||
res = true;
|
||||
graph->props.resize(cgraph->n_nodes + cgraph->n_leafs);
|
||||
graph->props.resize(cgraph->n_nodes);
|
||||
}
|
||||
|
||||
// Loop over nodes in GGML graph to determine if CUDA graph update is required
|
||||
// and store properties to allow this comparison for the next token
|
||||
std::unordered_set<ggml_tensor *> seen_node;
|
||||
std::vector<ggml_tensor *> srcs_extra;
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
bool props_match = true;
|
||||
|
||||
seen_node.insert(cgraph->nodes[i]);
|
||||
|
||||
if (!res) {
|
||||
props_match = ggml_cuda_graph_node_properties_match(cgraph->nodes[i], &graph->props[i]);
|
||||
}
|
||||
@@ -3001,17 +3021,31 @@ static bool ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx
|
||||
res = true;
|
||||
}
|
||||
ggml_cuda_graph_node_set_properties(&graph->props[i], cgraph->nodes[i]);
|
||||
|
||||
for (int src_idx = 0; src_idx < GGML_MAX_SRC; ++src_idx) {
|
||||
ggml_tensor * src = cgraph->nodes[i]->src[src_idx];
|
||||
if (src && seen_node.find(src) == seen_node.end()) {
|
||||
srcs_extra.push_back(src);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < cgraph->n_leafs; i++) {
|
||||
if (graph->extra.size() != (size_t) srcs_extra.size()) {
|
||||
res = true;
|
||||
graph->extra.resize(srcs_extra.size());
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < srcs_extra.size(); ++i) {
|
||||
bool props_match = true;
|
||||
|
||||
if (!res) {
|
||||
props_match = ggml_cuda_graph_node_properties_match(cgraph->leafs[i], &graph->props[cgraph->n_nodes + i]);
|
||||
props_match = ggml_cuda_graph_node_properties_match(srcs_extra[i], &graph->extra[i]);
|
||||
}
|
||||
|
||||
if (!props_match) {
|
||||
res = true;
|
||||
}
|
||||
ggml_cuda_graph_node_set_properties(&graph->props[cgraph->n_nodes + i], cgraph->leafs[i]);
|
||||
ggml_cuda_graph_node_set_properties(&graph->extra[i], srcs_extra[i]);
|
||||
}
|
||||
|
||||
return res;
|
||||
@@ -3080,63 +3114,166 @@ static bool ggml_cuda_should_fuse_rope_set_rows(const ggml_tensor * rope,
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, std::initializer_list<enum ggml_op> ops, std::initializer_list<enum ggml_unary_op> unary_ops) {
|
||||
static bool ggml_cuda_topk_moe_fusion(const struct ggml_cgraph * cgraph, int node_idx, ggml_cuda_topk_moe_args & args) {
|
||||
args.sigmoid = false;
|
||||
args.softmax = false;
|
||||
args.delayed_softmax = false;
|
||||
args.prob_bias = false;
|
||||
args.norm = false;
|
||||
|
||||
const int n_nodes = cgraph->n_nodes;
|
||||
ggml_tensor ** nodes = cgraph->nodes;
|
||||
|
||||
if (nodes[node_idx]->op == GGML_OP_SOFT_MAX) {
|
||||
args.softmax = true;
|
||||
}
|
||||
|
||||
if (nodes[node_idx]->op == GGML_OP_UNARY) {
|
||||
if (ggml_get_unary_op(nodes[node_idx]) != GGML_UNARY_OP_SIGMOID) {
|
||||
return false;
|
||||
}
|
||||
args.sigmoid = true;
|
||||
}
|
||||
|
||||
if (nodes[node_idx]->op == GGML_OP_ARGSORT) {
|
||||
args.delayed_softmax = true;
|
||||
}
|
||||
|
||||
node_idx++;
|
||||
|
||||
if (args.sigmoid || args.softmax) {
|
||||
// SOFTMAX -> RESHAPE
|
||||
if (node_idx >= n_nodes || nodes[node_idx]->op != GGML_OP_RESHAPE ||
|
||||
nodes[node_idx]->src[0] != nodes[node_idx - 1]) {
|
||||
return false;
|
||||
}
|
||||
ggml_tensor * probs_reshaped = nodes[node_idx];
|
||||
node_idx++;
|
||||
|
||||
if (node_idx >= n_nodes) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// src of bias add is the unreshaped probs (-2 instead of -1)
|
||||
if (nodes[node_idx]->op == GGML_OP_ADD && nodes[node_idx]->src[0] == nodes[node_idx - 2]) {
|
||||
args.prob_bias = true;
|
||||
node_idx++;
|
||||
}
|
||||
// RESHAPE/ADD -> ARGSORT
|
||||
if (node_idx >= n_nodes || nodes[node_idx]->op != GGML_OP_ARGSORT) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (args.prob_bias && nodes[node_idx]->src[0] != nodes[node_idx - 1]) {
|
||||
return false;
|
||||
} else if (!args.prob_bias && nodes[node_idx]->src[0] != nodes[node_idx - 2]) {
|
||||
return false;
|
||||
}
|
||||
|
||||
node_idx++;
|
||||
|
||||
// ARGSORT-> VIEW
|
||||
if (node_idx >= n_nodes || nodes[node_idx]->op != GGML_OP_VIEW ||
|
||||
nodes[node_idx]->src[0] != nodes[node_idx - 1]) {
|
||||
return false;
|
||||
}
|
||||
node_idx++;
|
||||
|
||||
if (node_idx >= n_nodes || nodes[node_idx]->op != GGML_OP_GET_ROWS) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// GET_ROWS
|
||||
if (nodes[node_idx]->src[0] != probs_reshaped || nodes[node_idx]->src[1] != nodes[node_idx - 1]) {
|
||||
return false;
|
||||
}
|
||||
node_idx++;
|
||||
} else if (args.delayed_softmax) {
|
||||
if (node_idx - 2 < 0) {
|
||||
return false;
|
||||
}
|
||||
ggml_tensor * probs_reshaped = nodes[node_idx - 2];
|
||||
|
||||
// VIEW->ARGSORT
|
||||
if (node_idx >= n_nodes || nodes[node_idx]->op != GGML_OP_VIEW ||
|
||||
nodes[node_idx]->src[0] != nodes[node_idx - 1]) {
|
||||
return false;
|
||||
}
|
||||
node_idx++;
|
||||
|
||||
// GET_ROWS
|
||||
if (node_idx >= n_nodes || nodes[node_idx]->src[1] != nodes[node_idx - 1] ||
|
||||
nodes[node_idx]->src[0] != probs_reshaped) {
|
||||
return false;
|
||||
}
|
||||
node_idx++;
|
||||
|
||||
static const std::vector<ggml_op> remaining_ops = { GGML_OP_RESHAPE, GGML_OP_SOFT_MAX, GGML_OP_RESHAPE };
|
||||
|
||||
for (const ggml_op op : remaining_ops) {
|
||||
if (node_idx >= n_nodes || nodes[node_idx]->op != op || nodes[node_idx]->src[0] != nodes[node_idx - 1]) {
|
||||
return false;
|
||||
}
|
||||
node_idx++;
|
||||
}
|
||||
}
|
||||
|
||||
// At this point we can check for norm + scale. Everything is now at least valid till the norm
|
||||
if (node_idx >= n_nodes) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (nodes[node_idx]->op == GGML_OP_RESHAPE) {
|
||||
//check RESHAPE->SUM_ROWS->CLAMP->DIV->RESHAPE
|
||||
static const std::vector<ggml_op> norm_ops = { GGML_OP_RESHAPE, GGML_OP_SUM_ROWS, GGML_OP_CLAMP };
|
||||
|
||||
args.norm = true;
|
||||
for (const ggml_op op : norm_ops) {
|
||||
if (nodes[node_idx]->op == op && nodes[node_idx]->src[0] == nodes[node_idx - 1]) {
|
||||
node_idx++;
|
||||
} else {
|
||||
args.norm = false;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// DIV <- CLAMP, RESHAPE
|
||||
if (nodes[node_idx]->op != GGML_OP_DIV || nodes[node_idx]->src[1] != nodes[node_idx - 1] ||
|
||||
nodes[node_idx]->src[0] != nodes[node_idx - 3]) {
|
||||
args.norm = false;
|
||||
return true;
|
||||
}
|
||||
node_idx++;
|
||||
|
||||
if (nodes[node_idx]->op != GGML_OP_RESHAPE || nodes[node_idx]->src[0] != nodes[node_idx - 1]) {
|
||||
args.norm = false;
|
||||
return true;
|
||||
}
|
||||
|
||||
node_idx++;
|
||||
}
|
||||
|
||||
if (nodes[node_idx]->op == GGML_OP_SCALE && nodes[node_idx]->src[0] == nodes[node_idx - 1]) {
|
||||
args.scale = true;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph,
|
||||
int node_idx,
|
||||
std::initializer_list<enum ggml_op> ops,
|
||||
std::initializer_list<enum ggml_unary_op> unary_ops) {
|
||||
#ifndef NDEBUG
|
||||
const size_t num_unary = std::count(ops.begin(), ops.end(), GGML_OP_UNARY);
|
||||
GGML_ASSERT(unary_ops.size() == num_unary);
|
||||
#endif
|
||||
|
||||
//TODO: remove special case once ggml_can_fuse can handle empty nodes
|
||||
std::initializer_list<enum ggml_op> topk_moe_ops =
|
||||
ggml_cuda_topk_moe_ops(/*with_norm*/ false, /*delayed_softmax=*/false);
|
||||
std::initializer_list<enum ggml_op> topk_moe_ops_with_norm =
|
||||
ggml_cuda_topk_moe_ops(/*with_norm=*/true, /*delayed_softmax=*/false);
|
||||
std::initializer_list<enum ggml_op> topk_moe_ops_delayed_softmax =
|
||||
ggml_cuda_topk_moe_ops(/*with_norm=*/false, /*delayed_softmax=*/true);
|
||||
|
||||
const auto is_equal = [](const std::initializer_list<enum ggml_op> & list1,
|
||||
const std::initializer_list<enum ggml_op> & list2) {
|
||||
return std::equal(list1.begin(), list1.end(), list2.begin(), list2.end());
|
||||
};
|
||||
|
||||
if (is_equal(topk_moe_ops_with_norm, ops) &&
|
||||
ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 9 })) {
|
||||
ggml_tensor * softmax = cgraph->nodes[node_idx];
|
||||
ggml_tensor * weights = cgraph->nodes[node_idx + 9];
|
||||
ggml_tensor * get_rows = cgraph->nodes[node_idx + 4];
|
||||
ggml_tensor * argsort = cgraph->nodes[node_idx + 2];
|
||||
int n_expert = cgraph->nodes[node_idx]->src[0]->ne[0];
|
||||
|
||||
if (ggml_cuda_should_use_topk_moe(softmax, weights, get_rows, argsort, nullptr, n_expert)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
if (is_equal(topk_moe_ops, ops) && ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 4 })) {
|
||||
ggml_tensor * softmax = cgraph->nodes[node_idx];
|
||||
ggml_tensor * weights = cgraph->nodes[node_idx + 4];
|
||||
ggml_tensor * get_rows = cgraph->nodes[node_idx + 4];
|
||||
ggml_tensor * argsort = cgraph->nodes[node_idx + 2];
|
||||
int n_expert = cgraph->nodes[node_idx]->src[0]->ne[0];
|
||||
|
||||
if (ggml_cuda_should_use_topk_moe(softmax, weights, get_rows, argsort, nullptr, n_expert)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
if (is_equal(topk_moe_ops_delayed_softmax, ops) &&
|
||||
ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 1, node_idx + 5 })) {
|
||||
ggml_tensor * softmax = cgraph->nodes[node_idx + 4];
|
||||
ggml_tensor * weights = cgraph->nodes[node_idx + 5];
|
||||
ggml_tensor * get_rows = cgraph->nodes[node_idx + 2];
|
||||
ggml_tensor * argsort = cgraph->nodes[node_idx + 0];
|
||||
int n_expert = cgraph->nodes[node_idx]->src[0]->ne[0];
|
||||
|
||||
if (ggml_cuda_should_use_topk_moe(softmax, weights, get_rows, argsort, nullptr, n_expert)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
std::initializer_list<enum ggml_op> mul_mat_bias_glu_ops = { GGML_OP_MUL_MAT, GGML_OP_ADD, GGML_OP_MUL_MAT, GGML_OP_ADD, GGML_OP_GLU };
|
||||
std::initializer_list<enum ggml_op> mul_mat_id_bias_glu_ops = { GGML_OP_MUL_MAT_ID, GGML_OP_ADD_ID, GGML_OP_MUL_MAT_ID, GGML_OP_ADD_ID, GGML_OP_GLU };
|
||||
|
||||
@@ -3398,35 +3535,75 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud
|
||||
// start of fusion operations
|
||||
static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
|
||||
if (!disable_fusion) {
|
||||
ggml_cuda_topk_moe_args args;
|
||||
|
||||
if (ggml_cuda_can_fuse(cgraph, i, ggml_cuda_topk_moe_ops(/*with norm*/ true), {})) {
|
||||
ggml_tensor * weights = cgraph->nodes[i + 9];
|
||||
ggml_tensor * selected_experts = cgraph->nodes[i + 3];
|
||||
ggml_tensor * clamp = cgraph->nodes[i + 7];
|
||||
ggml_cuda_op_topk_moe(*cuda_ctx, node->src[0], weights, selected_experts, /*with norm*/ true,
|
||||
/*delayed softmax*/ false, clamp);
|
||||
i += 9;
|
||||
continue;
|
||||
}
|
||||
if (cgraph->nodes[i]->op == GGML_OP_UNARY || cgraph->nodes[i]->op == GGML_OP_SOFT_MAX ||
|
||||
cgraph->nodes[i]->op == GGML_OP_ARGSORT) {
|
||||
const bool can_fuse = ggml_cuda_topk_moe_fusion(cgraph, i, args);
|
||||
|
||||
if (ggml_cuda_can_fuse(cgraph, i, ggml_cuda_topk_moe_ops(/*with norm*/ false), {})) {
|
||||
ggml_tensor * weights = cgraph->nodes[i + 4];
|
||||
ggml_tensor * selected_experts = cgraph->nodes[i + 3];
|
||||
ggml_cuda_op_topk_moe(*cuda_ctx, node->src[0], weights, selected_experts, /*with norm*/ false,
|
||||
/*delayed softmax*/ false);
|
||||
i += 4;
|
||||
continue;
|
||||
}
|
||||
std::vector<ggml_op> ops;
|
||||
|
||||
if (ggml_cuda_can_fuse(cgraph, i,
|
||||
ggml_cuda_topk_moe_ops(/*with norm*/ false, /*delayed softmax*/ true), {})) {
|
||||
ggml_tensor * weights = cgraph->nodes[i + 5];
|
||||
ggml_tensor * ids = cgraph->nodes[i + 1];
|
||||
if (can_fuse) {
|
||||
const ggml_tensor * logits = node->src[0];
|
||||
ggml_tensor * weights = nullptr;
|
||||
ggml_tensor * ids = nullptr;
|
||||
const ggml_tensor * bias = nullptr;
|
||||
const ggml_tensor * clamp = nullptr;
|
||||
const ggml_tensor * scale = nullptr;
|
||||
|
||||
ggml_cuda_op_topk_moe(*cuda_ctx, node->src[0], weights, ids, /*with norm*/ false,
|
||||
/*delayed_softmax*/ true);
|
||||
i += 5;
|
||||
continue;
|
||||
if (!args.delayed_softmax) {
|
||||
ggml_op gating_op = args.sigmoid ? GGML_OP_UNARY : GGML_OP_SOFT_MAX;
|
||||
int out_nodes[2]; // nodes which can't be elided
|
||||
|
||||
if (args.prob_bias) {
|
||||
bias = cgraph->nodes[i + 2]->src[1];
|
||||
ops.insert(ops.end(), { gating_op, GGML_OP_RESHAPE, GGML_OP_ADD, GGML_OP_ARGSORT,
|
||||
GGML_OP_VIEW, GGML_OP_GET_ROWS });
|
||||
out_nodes[0] = i + 4;
|
||||
ids = cgraph->nodes[i + 4];
|
||||
} else {
|
||||
ops.insert(ops.end(), { gating_op, GGML_OP_RESHAPE, GGML_OP_ARGSORT, GGML_OP_VIEW,
|
||||
GGML_OP_GET_ROWS });
|
||||
out_nodes[0] = i + 3;
|
||||
ids = cgraph->nodes[i + 3];
|
||||
}
|
||||
|
||||
if (args.norm) {
|
||||
ops.insert(ops.end(), { GGML_OP_RESHAPE, GGML_OP_SUM_ROWS, GGML_OP_CLAMP,
|
||||
GGML_OP_DIV, GGML_OP_RESHAPE });
|
||||
clamp = cgraph->nodes[i + ops.size() - 3];
|
||||
}
|
||||
if (args.scale) {
|
||||
ops.insert(ops.end(), { GGML_OP_SCALE });
|
||||
scale = cgraph->nodes[i + ops.size() - 1];
|
||||
}
|
||||
|
||||
weights = cgraph->nodes[i + ops.size() - 1];
|
||||
out_nodes[1] = i + ops.size() - 1;
|
||||
|
||||
if (ggml_can_fuse_subgraph(cgraph, i, ops.size(), ops.data(), out_nodes, 2) &&
|
||||
ggml_cuda_should_use_topk_moe(node, logits, weights, ids)) {
|
||||
ggml_cuda_op_topk_moe(*cuda_ctx, logits, weights, ids, clamp, scale, bias, args);
|
||||
i += ops.size() - 1;
|
||||
continue;
|
||||
}
|
||||
} else if (!args.norm && !args.prob_bias) {
|
||||
//special case gpt-oss, no norm, no bias.
|
||||
ops.insert(ops.end(), { GGML_OP_ARGSORT, GGML_OP_VIEW, GGML_OP_GET_ROWS,
|
||||
GGML_OP_RESHAPE, GGML_OP_SOFT_MAX, GGML_OP_RESHAPE });
|
||||
weights = cgraph->nodes[i + 5];
|
||||
ids = cgraph->nodes[i + 1];
|
||||
const ggml_tensor * softmax = cgraph->nodes[i + 4];
|
||||
|
||||
int out_nodes[2] = { i + 1, i + 5 };
|
||||
if (ggml_can_fuse_subgraph(cgraph, i, ops.size(), ops.data(), out_nodes, 2) &&
|
||||
ggml_cuda_should_use_topk_moe(softmax, logits, weights, ids)) {
|
||||
ggml_cuda_op_topk_moe(*cuda_ctx, logits, weights, ids, clamp, scale, bias, args);
|
||||
i += ops.size() - 1;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_ROPE, GGML_OP_VIEW, GGML_OP_SET_ROWS }, {})) {
|
||||
@@ -3733,14 +3910,14 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud
|
||||
// Launch graph
|
||||
CUDA_CHECK(cudaGraphLaunch(graph->instance, cuda_ctx->stream()));
|
||||
#else
|
||||
GGML_UNUSED(graph_key);
|
||||
graph_evaluated_or_captured = true;
|
||||
#endif // USE_CUDA_GRAPH
|
||||
}
|
||||
}
|
||||
|
||||
static bool ggml_cuda_graph_set_enabled(ggml_backend_cuda_context * cuda_ctx, const void * graph_key) {
|
||||
|
||||
#ifdef USE_CUDA_GRAPH
|
||||
static bool ggml_cuda_graph_set_enabled(ggml_backend_cuda_context * cuda_ctx, const void * graph_key) {
|
||||
ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key);
|
||||
|
||||
if (graph->graph == nullptr) {
|
||||
@@ -3753,12 +3930,8 @@ static bool ggml_cuda_graph_set_enabled(ggml_backend_cuda_context * cuda_ctx, co
|
||||
}
|
||||
|
||||
return graph->is_enabled();
|
||||
#else
|
||||
GGML_UNUSED(cuda_ctx);
|
||||
GGML_UNUSED(graph_key);
|
||||
return false;
|
||||
#endif // USE_CUDA_GRAPH
|
||||
}
|
||||
#endif // USE_CUDA_GRAPH
|
||||
|
||||
static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
|
||||
|
||||
@@ -333,7 +333,33 @@ namespace ggml_cuda_mma {
|
||||
|
||||
static __device__ __forceinline__ int get_j(const int l) {
|
||||
if constexpr (I == 16 && J == 8) {
|
||||
return 4 * (threadIdx.x / 16) + l;
|
||||
return ne * (threadIdx.x / 16) + l;
|
||||
} else {
|
||||
NO_DEVICE_CODE;
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
#elif defined(AMD_MFMA_AVAILABLE)
|
||||
static constexpr int ne = I * J / 64;
|
||||
half2 x[ne] = {{0.0f, 0.0f}};
|
||||
|
||||
static constexpr __device__ bool supported() {
|
||||
if (I == 16 && J == 8) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ int get_i(const int l) {
|
||||
if constexpr (I == 16 && J == 8) {
|
||||
return threadIdx.x % 16;
|
||||
} else {
|
||||
NO_DEVICE_CODE;
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ int get_j(const int l) {
|
||||
if constexpr (I == 16 && J == 8) {
|
||||
return ne * (threadIdx.x / 16) + l;
|
||||
} else {
|
||||
NO_DEVICE_CODE;
|
||||
return -1;
|
||||
@@ -391,7 +417,22 @@ namespace ggml_cuda_mma {
|
||||
static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR;
|
||||
|
||||
#if defined(AMD_WMMA_AVAILABLE)
|
||||
static constexpr int ne = I * J / 32;
|
||||
static constexpr int ne = tile<I_, J_, half2, DATA_LAYOUT_I_MAJOR>::ne;
|
||||
nv_bfloat162 x[ne] = {{0.0f, 0.0f}};
|
||||
|
||||
static constexpr __device__ bool supported() {
|
||||
return tile<I_, J_, half2, DATA_LAYOUT_I_MAJOR>::supported();
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ int get_i(const int l) {
|
||||
return tile<I_, J_, half2, DATA_LAYOUT_I_MAJOR>::get_i(l);
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ int get_j(const int l) {
|
||||
return tile<I_, J_, half2, DATA_LAYOUT_I_MAJOR>::get_j(l);
|
||||
}
|
||||
#elif defined(AMD_MFMA_AVAILABLE)
|
||||
static constexpr int ne = tile<I_, J_, half2, DATA_LAYOUT_I_MAJOR>::ne;
|
||||
nv_bfloat162 x[ne] = {{0.0f, 0.0f}};
|
||||
|
||||
static constexpr __device__ bool supported() {
|
||||
@@ -945,6 +986,32 @@ namespace ggml_cuda_mma {
|
||||
#endif // AMPERE_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
template <data_layout dl_ab, data_layout dl_d>
|
||||
static __device__ __forceinline__ void mma(
|
||||
tile<16, 16, float, dl_d> & D, const tile<16, 8, float, dl_ab> & A, const tile<16, 8, float, dl_ab> & B) {
|
||||
#ifdef AMD_MFMA_AVAILABLE
|
||||
using floatx4_t = __attribute__((ext_vector_type(4))) float;
|
||||
floatx4_t& acc_frag = reinterpret_cast<floatx4_t&>(D.x[0]);
|
||||
#if defined(CDNA3)
|
||||
using floatx2_t = __attribute__((ext_vector_type(2))) float;
|
||||
const floatx2_t& a_frag = reinterpret_cast<const floatx2_t&>(A.x[0]);
|
||||
const floatx2_t& b_frag = reinterpret_cast<const floatx2_t&>(B.x[0]);
|
||||
acc_frag = __builtin_amdgcn_mfma_f32_16x16x8_xf32(a_frag, b_frag, acc_frag, 0, 0, 0);
|
||||
#elif defined(CDNA2) || defined(CDNA1)
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
acc_frag = __builtin_amdgcn_mfma_f32_16x16x4f32(A.x[i], B.x[i], acc_frag, 0, 0, 0);
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED_VARS(D, A, B);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // defined(CDNA3)
|
||||
#else
|
||||
GGML_UNUSED_VARS(D, A, B);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // AMD_MFMA_AVAILABLE
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ void mma_block_scaled(tile<16, 8, float> & D,
|
||||
const tile<16, 8, int> & A,
|
||||
const tile<8, 8, int> & B,
|
||||
@@ -1054,6 +1121,13 @@ namespace ggml_cuda_mma {
|
||||
GGML_UNUSED_VARS(D, A, B);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // RDNA4
|
||||
#elif defined(AMD_MFMA_AVAILABLE)
|
||||
using halfx4_t = __attribute__((ext_vector_type(4))) _Float16;
|
||||
using floatx4_t = __attribute__((ext_vector_type(4))) float;
|
||||
floatx4_t& acc_frag = reinterpret_cast<floatx4_t&>(D.x[0]);
|
||||
const halfx4_t& a_frag = reinterpret_cast<const halfx4_t&>(A.x[0]);
|
||||
const halfx4_t& b_frag = reinterpret_cast<const halfx4_t&>(B.x[0]);
|
||||
acc_frag = __builtin_amdgcn_mfma_f32_16x16x16f16(a_frag, b_frag, acc_frag, 0, 0, 0);
|
||||
#else
|
||||
GGML_UNUSED_VARS(D, A, B);
|
||||
NO_DEVICE_CODE;
|
||||
@@ -1081,11 +1155,31 @@ namespace ggml_cuda_mma {
|
||||
#else
|
||||
GGML_UNUSED_VARS(D, A, B);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // RDNA4
|
||||
#endif // defined(RDNA4)
|
||||
#elif defined(AMD_MFMA_AVAILABLE)
|
||||
using floatx4_t = __attribute__((ext_vector_type(4))) float;
|
||||
floatx4_t& acc_frag = reinterpret_cast<floatx4_t&>(D.x[0]);
|
||||
#if defined(CDNA3) || defined(CDNA2)
|
||||
using bf16x4_t = __attribute__((ext_vector_type(4))) __bf16;
|
||||
const bf16x4_t& a_frag = reinterpret_cast<const bf16x4_t&>(A.x[0]);
|
||||
const bf16x4_t& b_frag = reinterpret_cast<const bf16x4_t&>(B.x[0]);
|
||||
acc_frag = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(a_frag, b_frag, acc_frag, 0, 0, 0);
|
||||
#elif defined(CDNA1)
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
using bf16x2_t = __attribute__((ext_vector_type(2))) __bf16;
|
||||
const bf16x2_t& a_frag = reinterpret_cast<const bf16x2_t&>(A.x[i]);
|
||||
const bf16x2_t& b_frag = reinterpret_cast<const bf16x2_t&>(B.x[i]);
|
||||
acc_frag = __builtin_amdgcn_mfma_f32_16x16x8bf16(a_frag, b_frag, acc_frag, 0, 0, 0);
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED_VARS(D, A, B);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // AMPERE_MMA_AVAILABLE
|
||||
#endif // defined(CDNA3) || defined(CDNA2)
|
||||
#else
|
||||
GGML_UNUSED_VARS(D, A, B);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // defined(AMD_WMMA_AVAILABLE)
|
||||
}
|
||||
|
||||
template <data_layout dl_d, data_layout dl_ab>
|
||||
|
||||
@@ -2,6 +2,13 @@
|
||||
#include "mmf.cuh"
|
||||
#include "mmid.cuh"
|
||||
|
||||
static __forceinline__ int mmf_get_rows_per_block(const int cc) {
|
||||
if (GGML_CUDA_CC_IS_CDNA(cc)) {
|
||||
return MMF_ROWS_PER_BLOCK_CDNA;
|
||||
} else {
|
||||
return MMF_ROWS_PER_BLOCK;
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cuda_mul_mat_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
|
||||
GGML_ASSERT( src1->type == GGML_TYPE_F32);
|
||||
@@ -89,28 +96,32 @@ void ggml_cuda_mul_mat_f(ggml_backend_cuda_context & ctx, const ggml_tensor * sr
|
||||
ids_info_ptr = &ids_info;
|
||||
}
|
||||
|
||||
const int device = ggml_cuda_get_device();
|
||||
const int cc = ggml_cuda_info().devices[device].cc;
|
||||
const int rows_per_block = mmf_get_rows_per_block(cc);
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32: {
|
||||
const float * src0_d = (const float *) src0->data;
|
||||
constexpr int vals_per_T = 1;
|
||||
mul_mat_f_switch_cols_per_block(
|
||||
src0_d, src1_d, ids_d, dst_d, ne00/vals_per_T, ne01, ncols_dst, s01/vals_per_T, stride_col_y/vals_per_T, stride_col_dst,
|
||||
mul_mat_f_switch_rows_per_block<float>(
|
||||
rows_per_block, src0_d, src1_d, ids_d, dst_d, ne00/vals_per_T, ne01, ncols_dst, s01/vals_per_T, stride_col_y/vals_per_T, stride_col_dst,
|
||||
ids_s0, ids_s1, ne02, nchannels_y, nchannels_dst, s02/vals_per_T, stride_channel_y, stride_channel_dst,
|
||||
ne03, ne3, s03/vals_per_T, s13, s3, ctx.stream(), ids_info_ptr);
|
||||
} break;
|
||||
case GGML_TYPE_F16: {
|
||||
const half2 * src0_d = (const half2 *) src0->data;
|
||||
constexpr int vals_per_T = 2;
|
||||
mul_mat_f_switch_cols_per_block(
|
||||
src0_d, src1_d, ids_d, dst_d, ne00/vals_per_T, ne01, ncols_dst, s01/vals_per_T, stride_col_y/vals_per_T, stride_col_dst,
|
||||
mul_mat_f_switch_rows_per_block<half2>(
|
||||
rows_per_block, src0_d, src1_d, ids_d, dst_d, ne00/vals_per_T, ne01, ncols_dst, s01/vals_per_T, stride_col_y/vals_per_T, stride_col_dst,
|
||||
ids_s0, ids_s1, ne02, nchannels_y, nchannels_dst, s02/vals_per_T, stride_channel_y, stride_channel_dst,
|
||||
ne03, ne3, s03/vals_per_T, s13, s3, ctx.stream(), ids_info_ptr);
|
||||
} break;
|
||||
case GGML_TYPE_BF16: {
|
||||
const nv_bfloat162 * src0_d = (const nv_bfloat162 *) src0->data;
|
||||
constexpr int vals_per_T = 2;
|
||||
mul_mat_f_switch_cols_per_block(
|
||||
src0_d, src1_d, ids_d, dst_d, ne00/vals_per_T, ne01, ncols_dst, s01/vals_per_T, stride_col_y/vals_per_T, stride_col_dst,
|
||||
mul_mat_f_switch_rows_per_block<nv_bfloat162>(
|
||||
rows_per_block, src0_d, src1_d, ids_d, dst_d, ne00/vals_per_T, ne01, ncols_dst, s01/vals_per_T, stride_col_y/vals_per_T, stride_col_dst,
|
||||
ids_s0, ids_s1, ne02, nchannels_y, nchannels_dst, s02/vals_per_T, stride_channel_y, stride_channel_dst,
|
||||
ne03, ne3, s03/vals_per_T, s13, s3, ctx.stream(), ids_info_ptr);
|
||||
} break;
|
||||
@@ -140,7 +151,11 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (src0_ne[1] % MMF_ROWS_PER_BLOCK != 0) {
|
||||
if (src0_ne[1] % mmf_get_rows_per_block(cc) != 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (GGML_CUDA_CC_IS_CDNA3(cc) && type == GGML_TYPE_BF16) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -153,6 +168,11 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const
|
||||
} else {
|
||||
if (GGML_CUDA_CC_IS_RDNA3_0(cc) && src1_ncols > 8) {
|
||||
return false;
|
||||
} else if (GGML_CUDA_CC_IS_CDNA2(cc) && (type == GGML_TYPE_F16 || type == GGML_TYPE_BF16)) {
|
||||
//TODO: truse CDNA2 as CDNA1, tune the perf when CDNA2 is available.
|
||||
return false;
|
||||
} else if (GGML_CUDA_CC_IS_CDNA1(cc) && (type == GGML_TYPE_F16 || type == GGML_TYPE_BF16)) {
|
||||
return false;
|
||||
} else if (src1_ncols > 16) {
|
||||
return false;
|
||||
}
|
||||
@@ -160,11 +180,11 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const
|
||||
|
||||
switch (type) {
|
||||
case GGML_TYPE_F32:
|
||||
return ampere_mma_available(cc);
|
||||
return ampere_mma_available(cc) || amd_mfma_available(cc);
|
||||
case GGML_TYPE_F16:
|
||||
return volta_mma_available(cc) || turing_mma_available(cc) || amd_wmma_available(cc);
|
||||
return volta_mma_available(cc) || turing_mma_available(cc) || amd_wmma_available(cc) || amd_mfma_available(cc);
|
||||
case GGML_TYPE_BF16:
|
||||
return ampere_mma_available(cc) || amd_wmma_available(cc);
|
||||
return ampere_mma_available(cc) || amd_wmma_available(cc) || amd_mfma_available(cc);
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -7,6 +7,31 @@
|
||||
using namespace ggml_cuda_mma;
|
||||
|
||||
#define MMF_ROWS_PER_BLOCK 32
|
||||
#define MMF_ROWS_PER_BLOCK_CDNA 64
|
||||
|
||||
static __forceinline__ int64_t mmf_get_max_block_size(int cc) {
|
||||
if (GGML_CUDA_CC_IS_CDNA(cc)) {
|
||||
return 512;
|
||||
} else {
|
||||
return 256;
|
||||
}
|
||||
}
|
||||
|
||||
static __forceinline__ int mmf_get_padding(int cc) {
|
||||
if (GGML_CUDA_CC_IS_CDNA(cc)) {
|
||||
return 2;
|
||||
} else {
|
||||
return 4;
|
||||
}
|
||||
}
|
||||
|
||||
static constexpr __device__ int mmf_get_padding() {
|
||||
#if defined(AMD_MFMA_AVAILABLE)
|
||||
return 2;
|
||||
#else
|
||||
return 4;
|
||||
#endif // defined(AMD_MFMA_AVAILABLE)
|
||||
}
|
||||
|
||||
struct mmf_ids_data {
|
||||
const int32_t * ids_src_compact = nullptr;
|
||||
@@ -29,23 +54,25 @@ static __global__ void mul_mat_f(
|
||||
const int channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
|
||||
const int sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) {
|
||||
// TODO: handle this in a consistent and simpler way after AMD MFMA support has been added
|
||||
#if (!defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)) || defined(AMD_WMMA_AVAILABLE)
|
||||
#if defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE)
|
||||
#if defined(AMD_WMMA_AVAILABLE)
|
||||
// Special case for tf32, just dummy mma layout as wmma doesn't support it.
|
||||
constexpr bool is_tf32 = std::is_same_v<T, float>;
|
||||
constexpr int tile_B_I = is_tf32 ? 8 : 16;
|
||||
constexpr int tile_C_J = is_tf32 ? 8 : 16;
|
||||
constexpr data_layout ab_layout = is_tf32 ? DATA_LAYOUT_I_MAJOR : get_input_data_layout();
|
||||
typedef tile<16, 8, T, ab_layout> tile_A;
|
||||
typedef tile<tile_B_I, 8, T, ab_layout> tile_B;
|
||||
typedef tile<16, tile_C_J, float, DATA_LAYOUT_J_MAJOR> tile_C;
|
||||
if constexpr (!(std::is_same_v<T, half2> || std::is_same_v<T, nv_bfloat162>) || rows_per_block != MMF_ROWS_PER_BLOCK) {NO_DEVICE_CODE;} else {
|
||||
typedef tile<16, 8, T, get_input_data_layout()> tile_A;
|
||||
typedef tile<16, 8, T, get_input_data_layout()> tile_B;
|
||||
typedef tile<16, 16, float, DATA_LAYOUT_J_MAJOR> tile_C;
|
||||
#elif defined(AMD_MFMA_AVAILABLE)
|
||||
if constexpr (rows_per_block != MMF_ROWS_PER_BLOCK_CDNA) {NO_DEVICE_CODE;} else {
|
||||
typedef tile<16, 8, T, DATA_LAYOUT_I_MAJOR> tile_A;
|
||||
typedef tile<16, 8, T, DATA_LAYOUT_I_MAJOR> tile_B;
|
||||
typedef tile<16, 16, float, DATA_LAYOUT_J_MAJOR> tile_C;
|
||||
#else
|
||||
#ifdef VOLTA_MMA_AVAILABLE
|
||||
if constexpr (!std::is_same_v<T, half2>) {NO_DEVICE_CODE;} else {
|
||||
if constexpr (!std::is_same_v<T, half2> || rows_per_block != MMF_ROWS_PER_BLOCK) {NO_DEVICE_CODE;} else {
|
||||
typedef tile<32, 4, T, DATA_LAYOUT_I_MAJOR> tile_A;
|
||||
typedef tile< 8, 4, T, DATA_LAYOUT_I_MAJOR_MIRRORED> tile_B;
|
||||
typedef tile<32, 8, float, DATA_LAYOUT_I_MAJOR> tile_C;
|
||||
#else
|
||||
if constexpr (rows_per_block != MMF_ROWS_PER_BLOCK) {NO_DEVICE_CODE;} else {
|
||||
typedef tile<16, 8, T> tile_A;
|
||||
typedef tile<8, 8, T> tile_B;
|
||||
typedef tile<16, 8, float> tile_C;
|
||||
@@ -57,7 +84,7 @@ static __global__ void mul_mat_f(
|
||||
}
|
||||
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
constexpr int tile_k_padded = warp_size + 4;
|
||||
constexpr int tile_k_padded = warp_size + mmf_get_padding();
|
||||
constexpr int ntA = rows_per_block / tile_A::I;
|
||||
constexpr int ntB = (cols_per_block + tile_B::I - 1) / tile_B::I;
|
||||
|
||||
@@ -198,7 +225,7 @@ static __global__ void mul_mat_f(
|
||||
}
|
||||
|
||||
float * buf_iw = (float *) compute_base;
|
||||
constexpr int kiw = nwarps*rows_per_block + 4;
|
||||
constexpr int kiw = nwarps*rows_per_block + mmf_get_padding();
|
||||
|
||||
if (nwarps > 1) {
|
||||
__syncthreads();
|
||||
@@ -228,27 +255,34 @@ static __global__ void mul_mat_f(
|
||||
return;
|
||||
}
|
||||
|
||||
float sum = 0.0f;
|
||||
static_assert(rows_per_block == warp_size, "need loop/check");
|
||||
float sum[rows_per_block/warp_size] = {0.0f};
|
||||
static_assert((rows_per_block % warp_size) == 0, "rows_per_block must be a multiple of warp_size.");
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < nwarps*rows_per_block; i0 += rows_per_block) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i1 = 0; i1 < sizeof(sum)/sizeof(sum[0]); ++i1) {
|
||||
const int i = i0 + i1*warp_size + threadIdx.x;
|
||||
|
||||
sum += buf_iw[j*kiw + i];
|
||||
sum[i1] += buf_iw[j*kiw + i];
|
||||
}
|
||||
}
|
||||
|
||||
if constexpr (!has_ids) {
|
||||
dst[j*stride_col_dst + row0 + threadIdx.x] = sum;
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < sizeof(sum)/sizeof(sum[0]); ++i0) {
|
||||
dst[j*stride_col_dst + row0 + i0*warp_size + threadIdx.x] = sum[i0];
|
||||
}
|
||||
} else {
|
||||
const int slot = (j < cols_per_block) ? slot_map[j] : -1;
|
||||
if (slot >= 0 && (col_base + j) < ncols_dst_total) {
|
||||
dst[slot*stride_channel_dst + j*stride_col_dst + row0 + threadIdx.x] = sum;
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < sizeof(sum)/sizeof(sum[0]); ++i0) {
|
||||
dst[slot*stride_channel_dst + j*stride_col_dst + row0 + i0*warp_size + threadIdx.x] = sum[i0];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#ifdef VOLTA_MMA_AVAILABLE
|
||||
}
|
||||
#endif //VOLTA_MMA_AVAILABLE
|
||||
#else
|
||||
GGML_UNUSED_VARS(x, y, ids, dst,
|
||||
ncols, ncols_dst_total, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
@@ -256,7 +290,7 @@ static __global__ void mul_mat_f(
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // (!defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)) || defined(AMD_WMMA_AVAILABLE)
|
||||
#endif // defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE)
|
||||
}
|
||||
|
||||
//This kernel is for larger batch sizes of mul_mat_id
|
||||
@@ -271,23 +305,25 @@ static __global__ void mul_mat_f_ids(
|
||||
const int sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst,
|
||||
const uint3 sis1_fd, const uint3 nch_fd) {
|
||||
// TODO: handle this in a consistent and simpler way after AMD MFMA support has been added
|
||||
#if (!defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)) || defined(AMD_WMMA_AVAILABLE)
|
||||
#if defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE)
|
||||
#if defined(AMD_WMMA_AVAILABLE)
|
||||
// Special case for tf32, just dummy mma layout as wmma doesn't support it.
|
||||
constexpr bool is_tf32 = std::is_same_v<T, float>;
|
||||
constexpr int tile_B_I = is_tf32 ? 8 : 16;
|
||||
constexpr int tile_C_J = is_tf32 ? 8 : 16;
|
||||
constexpr data_layout ab_layout = is_tf32 ? DATA_LAYOUT_I_MAJOR : get_input_data_layout();
|
||||
typedef tile<16, 8, T, ab_layout> tile_A;
|
||||
typedef tile<tile_B_I, 8, T, ab_layout> tile_B;
|
||||
typedef tile<16, tile_C_J, float, DATA_LAYOUT_J_MAJOR> tile_C;
|
||||
if constexpr (!(std::is_same_v<T, half2> || std::is_same_v<T, nv_bfloat162>) || rows_per_block != MMF_ROWS_PER_BLOCK) {NO_DEVICE_CODE;} else {
|
||||
typedef tile<16, 8, T, get_input_data_layout()> tile_A;
|
||||
typedef tile<16, 8, T, get_input_data_layout()> tile_B;
|
||||
typedef tile<16, 16, float, DATA_LAYOUT_J_MAJOR> tile_C;
|
||||
#elif defined(AMD_MFMA_AVAILABLE)
|
||||
if constexpr (rows_per_block != MMF_ROWS_PER_BLOCK_CDNA) {NO_DEVICE_CODE;} else {
|
||||
typedef tile<16, 8, T, DATA_LAYOUT_I_MAJOR> tile_A;
|
||||
typedef tile<16, 8, T, DATA_LAYOUT_I_MAJOR> tile_B;
|
||||
typedef tile<16, 16, float, DATA_LAYOUT_J_MAJOR> tile_C;
|
||||
#else
|
||||
#ifdef VOLTA_MMA_AVAILABLE
|
||||
if constexpr (!std::is_same_v<T, half2>) {NO_DEVICE_CODE;} else {
|
||||
if constexpr (!std::is_same_v<T, half2> || rows_per_block != MMF_ROWS_PER_BLOCK) {NO_DEVICE_CODE;} else {
|
||||
typedef tile<32, 4, T, DATA_LAYOUT_I_MAJOR> tile_A;
|
||||
typedef tile< 8, 4, T, DATA_LAYOUT_I_MAJOR_MIRRORED> tile_B;
|
||||
typedef tile<32, 8, float, DATA_LAYOUT_I_MAJOR> tile_C;
|
||||
#else
|
||||
if constexpr (rows_per_block != MMF_ROWS_PER_BLOCK) {NO_DEVICE_CODE;} else {
|
||||
typedef tile<16, 8, T> tile_A;
|
||||
typedef tile<8, 8, T> tile_B;
|
||||
typedef tile<16, 8, float> tile_C;
|
||||
@@ -300,7 +336,7 @@ static __global__ void mul_mat_f_ids(
|
||||
|
||||
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
constexpr int tile_k_padded = warp_size + 4;
|
||||
constexpr int tile_k_padded = warp_size + mmf_get_padding();
|
||||
constexpr int ntA = rows_per_block / tile_A::I;
|
||||
constexpr int ntB = (cols_per_block + tile_B::I - 1) / tile_B::I;
|
||||
|
||||
@@ -467,7 +503,7 @@ static __global__ void mul_mat_f_ids(
|
||||
}
|
||||
|
||||
float * buf_iw = (float *) compute_base;
|
||||
constexpr int kiw = nwarps*rows_per_block + 4;
|
||||
constexpr int kiw = nwarps*rows_per_block + mmf_get_padding();
|
||||
|
||||
if (nwarps > 1) {
|
||||
__syncthreads();
|
||||
@@ -497,13 +533,16 @@ static __global__ void mul_mat_f_ids(
|
||||
return;
|
||||
}
|
||||
|
||||
float sum = 0.0f;
|
||||
static_assert(rows_per_block == warp_size, "need loop/check");
|
||||
float sum[rows_per_block/warp_size] = {0.0f};
|
||||
static_assert((rows_per_block % warp_size) == 0, "rows_per_block must be a multiple of warp_size.");
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < nwarps*rows_per_block; i0 += rows_per_block) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i1 = 0; i1 < sizeof(sum)/sizeof(sum[0]); ++i1) {
|
||||
const int i = i0 + i1*warp_size + threadIdx.x;
|
||||
|
||||
sum += buf_iw[j*kiw + i];
|
||||
sum[i1] += buf_iw[j * kiw + i];
|
||||
}
|
||||
}
|
||||
|
||||
const int global_j = col_base + j;
|
||||
@@ -513,23 +552,24 @@ static __global__ void mul_mat_f_ids(
|
||||
const int token = (int) qrm.x;
|
||||
if (token < ncols_dst_total) {
|
||||
const int slot = (int) qrm.y;
|
||||
dst[slot*stride_channel_dst + token*stride_col_dst + row0 + threadIdx.x] = sum;
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < sizeof(sum)/sizeof(sum[0]); ++i0) {
|
||||
dst[slot * stride_channel_dst + token * stride_col_dst + row0 + i0*warp_size + threadIdx.x] = sum[i0];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#ifdef VOLTA_MMA_AVAILABLE
|
||||
}
|
||||
#endif // VOLTA_MMA_AVAILABLE
|
||||
#else
|
||||
GGML_UNUSED_VARS(x, y, ids_src_compact, ids_dst_compact, expert_bounds, dst,
|
||||
ncols, ncols_dst_total, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, sis1_fd, nch_fd);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // (!defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)) || defined(AMD_WMMA_AVAILABLE)
|
||||
#endif // defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE)
|
||||
}
|
||||
|
||||
template<typename T, int cols_per_block, int nwarps>
|
||||
template<typename T, int rows_per_block, int cols_per_block, int nwarps>
|
||||
static inline void mul_mat_f_switch_ids(
|
||||
const T * x, const float * y, const int32_t * ids, float * dst,
|
||||
const int64_t ncols_x, const int64_t ncols_dst, const int64_t nchannels_dst,
|
||||
@@ -553,7 +593,7 @@ static inline void mul_mat_f_switch_ids(
|
||||
const uint3 sis1_fd = ids_data->sis1 > 0 ? init_fastdiv_values((uint32_t) ids_data->sis1) : make_uint3(0, 0, 1);
|
||||
const uint3 nch_fd = init_fastdiv_values((uint32_t) nchannels_dst);
|
||||
|
||||
mul_mat_f_ids<T, MMF_ROWS_PER_BLOCK, cols_per_block, nwarps><<<block_nums_ids, block_dims, nbytes_shared_total, stream>>>
|
||||
mul_mat_f_ids<T, rows_per_block, cols_per_block, nwarps><<<block_nums_ids, block_dims, nbytes_shared_total, stream>>>
|
||||
(x, y, ids_data->ids_src_compact, ids_data->ids_dst_compact, ids_data->expert_bounds_dev, dst,
|
||||
ncols_x, ncols_dst, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
@@ -564,19 +604,19 @@ static inline void mul_mat_f_switch_ids(
|
||||
dim3 block_nums_ids = block_nums;
|
||||
block_nums_ids.y *= col_tiles;
|
||||
|
||||
mul_mat_f<T, MMF_ROWS_PER_BLOCK, cols_per_block, nwarps, true><<<block_nums_ids, block_dims, nbytes_shared_total, stream>>>
|
||||
mul_mat_f<T, rows_per_block, cols_per_block, nwarps, true><<<block_nums_ids, block_dims, nbytes_shared_total, stream>>>
|
||||
(x, y, ids, dst, ncols_x, ncols_dst, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
} else {
|
||||
mul_mat_f<T, MMF_ROWS_PER_BLOCK, cols_per_block, nwarps, false><<<block_nums, block_dims, nbytes_shared_total, stream>>>
|
||||
mul_mat_f<T, rows_per_block, cols_per_block, nwarps, false><<<block_nums, block_dims, nbytes_shared_total, stream>>>
|
||||
(x, y, ids, dst, ncols_x, cols_per_block, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, int cols_per_block>
|
||||
template <typename T, int rows_per_block, int cols_per_block>
|
||||
void mul_mat_f_cuda(
|
||||
const T * x, const float * y, const int32_t * ids, float * dst,
|
||||
const int64_t ncols_x, const int64_t nrows_x, const int64_t ncols_dst,
|
||||
@@ -605,7 +645,7 @@ void mul_mat_f_cuda(
|
||||
|
||||
int64_t nwarps_best = 1;
|
||||
int64_t niter_best = (ncols_x + warp_size*2 - 1) / (warp_size*2);
|
||||
int64_t max_block_size = 256;
|
||||
int64_t max_block_size = mmf_get_max_block_size(cc);
|
||||
for (int64_t nwarps = 2; nwarps <= max_block_size/warp_size; nwarps++) {
|
||||
const int64_t niter = (ncols_x + nwarps*warp_size*2 - 1) / (nwarps*warp_size*2);
|
||||
if (niter < niter_best) {
|
||||
@@ -614,10 +654,9 @@ void mul_mat_f_cuda(
|
||||
}
|
||||
}
|
||||
|
||||
constexpr int rows_per_block = MMF_ROWS_PER_BLOCK;
|
||||
const int nbytes_shared_iter = nwarps_best * (volta_mma_available(cc) ? tile_A_32::I : tile_A_16::I) * (warp_size + 4) * 4;
|
||||
const int nbytes_cols_per_block_pad = amd_wmma_available(cc) ? tile_B_16::I : tile_B_8::I;
|
||||
const int nbytes_shared_combine = GGML_PAD(cols_per_block, nbytes_cols_per_block_pad) * (nwarps_best*rows_per_block + 4) * 4;
|
||||
const int nbytes_shared_iter = nwarps_best * (volta_mma_available(cc) ? tile_A_32::I : tile_A_16::I) * (warp_size + mmf_get_padding(cc)) * 4;
|
||||
const int nbytes_cols_per_block_pad = (amd_wmma_available(cc) || amd_mfma_available(cc)) ? tile_B_16::I : tile_B_8::I;
|
||||
const int nbytes_shared_combine = GGML_PAD(cols_per_block, nbytes_cols_per_block_pad) * (nwarps_best*rows_per_block + mmf_get_padding(cc)) * 4;
|
||||
const int nbytes_shared = std::max(nbytes_shared_iter, nbytes_shared_combine);
|
||||
const int nbytes_slotmap = ids ? GGML_PAD(cols_per_block, 16) * sizeof(int) : 0;
|
||||
const int nbytes_shared_total = nbytes_shared + nbytes_slotmap;
|
||||
@@ -628,56 +667,56 @@ void mul_mat_f_cuda(
|
||||
|
||||
switch (nwarps_best) {
|
||||
case 1: {
|
||||
mul_mat_f_switch_ids<T, cols_per_block, 1>(
|
||||
mul_mat_f_switch_ids<T, rows_per_block, cols_per_block, 1>(
|
||||
x, y, ids, dst, ncols_x, ncols_dst, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream,
|
||||
ids_data);
|
||||
} break;
|
||||
case 2: {
|
||||
mul_mat_f_switch_ids<T, cols_per_block, 2>(
|
||||
mul_mat_f_switch_ids<T, rows_per_block, cols_per_block, 2>(
|
||||
x, y, ids, dst, ncols_x, ncols_dst, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream,
|
||||
ids_data);
|
||||
} break;
|
||||
case 3: {
|
||||
mul_mat_f_switch_ids<T, cols_per_block, 3>(
|
||||
mul_mat_f_switch_ids<T, rows_per_block, cols_per_block, 3>(
|
||||
x, y, ids, dst, ncols_x, ncols_dst, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream,
|
||||
ids_data);
|
||||
} break;
|
||||
case 4: {
|
||||
mul_mat_f_switch_ids<T, cols_per_block, 4>(
|
||||
mul_mat_f_switch_ids<T, rows_per_block, cols_per_block, 4>(
|
||||
x, y, ids, dst, ncols_x, ncols_dst, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream,
|
||||
ids_data);
|
||||
} break;
|
||||
case 5: {
|
||||
mul_mat_f_switch_ids<T, cols_per_block, 5>(
|
||||
mul_mat_f_switch_ids<T, rows_per_block, cols_per_block, 5>(
|
||||
x, y, ids, dst, ncols_x, ncols_dst, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream,
|
||||
ids_data);
|
||||
} break;
|
||||
case 6: {
|
||||
mul_mat_f_switch_ids<T, cols_per_block, 6>(
|
||||
mul_mat_f_switch_ids<T, rows_per_block, cols_per_block, 6>(
|
||||
x, y, ids, dst, ncols_x, ncols_dst, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream,
|
||||
ids_data);
|
||||
} break;
|
||||
case 7: {
|
||||
mul_mat_f_switch_ids<T, cols_per_block, 7>(
|
||||
mul_mat_f_switch_ids<T, rows_per_block, cols_per_block, 7>(
|
||||
x, y, ids, dst, ncols_x, ncols_dst, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream,
|
||||
ids_data);
|
||||
} break;
|
||||
case 8: {
|
||||
mul_mat_f_switch_ids<T, cols_per_block, 8>(
|
||||
mul_mat_f_switch_ids<T, rows_per_block, cols_per_block, 8>(
|
||||
x, y, ids, dst, ncols_x, ncols_dst, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream,
|
||||
@@ -691,7 +730,7 @@ void mul_mat_f_cuda(
|
||||
GGML_UNUSED_VARS(nchannels_y);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
template <typename T, int rows_per_block>
|
||||
static void mul_mat_f_switch_cols_per_block(
|
||||
const T * x, const float * y, const int32_t * ids, float * dst,
|
||||
const int64_t ncols_x, const int64_t nrows_x, const int64_t ncols_dst,
|
||||
@@ -708,82 +747,82 @@ static void mul_mat_f_switch_cols_per_block(
|
||||
|
||||
switch (ncols_case) {
|
||||
case 1: {
|
||||
mul_mat_f_cuda<T, 1>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
mul_mat_f_cuda<T, rows_per_block, 1>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream, ids_data);
|
||||
} break;
|
||||
case 2: {
|
||||
mul_mat_f_cuda<T, 2>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
mul_mat_f_cuda<T, rows_per_block, 2>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream, ids_data);
|
||||
} break;
|
||||
case 3: {
|
||||
mul_mat_f_cuda<T, 3>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
mul_mat_f_cuda<T, rows_per_block, 3>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream, ids_data);
|
||||
} break;
|
||||
case 4: {
|
||||
mul_mat_f_cuda<T, 4>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
mul_mat_f_cuda<T, rows_per_block, 4>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream, ids_data);
|
||||
} break;
|
||||
case 5: {
|
||||
mul_mat_f_cuda<T, 5>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
mul_mat_f_cuda<T, rows_per_block, 5>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream, ids_data);
|
||||
} break;
|
||||
case 6: {
|
||||
mul_mat_f_cuda<T, 6>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
mul_mat_f_cuda<T, rows_per_block, 6>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream, ids_data);
|
||||
} break;
|
||||
case 7: {
|
||||
mul_mat_f_cuda<T, 7>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
mul_mat_f_cuda<T, rows_per_block, 7>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream, ids_data);
|
||||
} break;
|
||||
case 8: {
|
||||
mul_mat_f_cuda<T, 8>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
mul_mat_f_cuda<T, rows_per_block, 8>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream, ids_data);
|
||||
} break;
|
||||
case 9: {
|
||||
mul_mat_f_cuda<T, 9>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
mul_mat_f_cuda<T, rows_per_block, 9>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream, ids_data);
|
||||
} break;
|
||||
case 10: {
|
||||
mul_mat_f_cuda<T, 10>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
mul_mat_f_cuda<T, rows_per_block, 10>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream, ids_data);
|
||||
} break;
|
||||
case 11: {
|
||||
mul_mat_f_cuda<T, 11>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
mul_mat_f_cuda<T, rows_per_block, 11>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream, ids_data);
|
||||
} break;
|
||||
case 12: {
|
||||
mul_mat_f_cuda<T, 12>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
mul_mat_f_cuda<T, rows_per_block, 12>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream, ids_data);
|
||||
} break;
|
||||
case 13: {
|
||||
mul_mat_f_cuda<T, 13>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
mul_mat_f_cuda<T, rows_per_block, 13>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream, ids_data);
|
||||
} break;
|
||||
case 14: {
|
||||
mul_mat_f_cuda<T, 14>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
mul_mat_f_cuda<T, rows_per_block, 14>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream, ids_data);
|
||||
} break;
|
||||
case 15: {
|
||||
mul_mat_f_cuda<T, 15>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
mul_mat_f_cuda<T, rows_per_block, 15>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream, ids_data);
|
||||
} break;
|
||||
case 16: {
|
||||
mul_mat_f_cuda<T, 16>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
mul_mat_f_cuda<T, rows_per_block, 16>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream, ids_data);
|
||||
} break;
|
||||
@@ -793,8 +832,36 @@ static void mul_mat_f_switch_cols_per_block(
|
||||
}
|
||||
}
|
||||
|
||||
#define DECL_MMF_CASE_HELPER(T, ncols_dst) \
|
||||
template void mul_mat_f_cuda<T, ncols_dst>( \
|
||||
template <typename T>
|
||||
static void mul_mat_f_switch_rows_per_block(
|
||||
const int rows_per_block, const T * x, const float * y, const int32_t * ids, float * dst,
|
||||
const int64_t ncols_x, const int64_t nrows_x, const int64_t ncols_dst,
|
||||
const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst,
|
||||
const int64_t stride_col_id, const int stride_row_id,
|
||||
const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst,
|
||||
const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
|
||||
const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
|
||||
cudaStream_t stream, const mmf_ids_data * ids_data) {
|
||||
switch (rows_per_block) {
|
||||
case MMF_ROWS_PER_BLOCK: {
|
||||
mul_mat_f_switch_cols_per_block<T, MMF_ROWS_PER_BLOCK>(
|
||||
x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream, ids_data);
|
||||
} break;
|
||||
case MMF_ROWS_PER_BLOCK_CDNA: {
|
||||
mul_mat_f_switch_cols_per_block<T, MMF_ROWS_PER_BLOCK_CDNA>(
|
||||
x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream, ids_data);
|
||||
} break;
|
||||
default:
|
||||
GGML_ABORT("unsupported rows_per_block: %i", rows_per_block);
|
||||
}
|
||||
}
|
||||
|
||||
#define DECL_MMF_CASE_HELPER(T, nrows_dst, ncols_dst) \
|
||||
template void mul_mat_f_cuda<T, nrows_dst, ncols_dst>( \
|
||||
const T * x, const float * y, const int32_t * ids, float * dst, \
|
||||
const int64_t ncols_x, const int64_t nrows_x, int64_t ncols_dst_total, const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst, \
|
||||
const int64_t stride_col_id, const int64_t stride_row_id, \
|
||||
@@ -803,16 +870,22 @@ static void mul_mat_f_switch_cols_per_block(
|
||||
const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst, \
|
||||
cudaStream_t stream, const mmf_ids_data * ids_data);
|
||||
|
||||
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
#if !defined(GGML_USE_MUSA)
|
||||
#define DECL_MMF_CASE_EXTERN(ncols_dst) \
|
||||
extern DECL_MMF_CASE_HELPER(float, ncols_dst) \
|
||||
extern DECL_MMF_CASE_HELPER(half2, ncols_dst) \
|
||||
extern DECL_MMF_CASE_HELPER(nv_bfloat162, ncols_dst)
|
||||
extern DECL_MMF_CASE_HELPER(float, MMF_ROWS_PER_BLOCK, ncols_dst) \
|
||||
extern DECL_MMF_CASE_HELPER(half2, MMF_ROWS_PER_BLOCK, ncols_dst) \
|
||||
extern DECL_MMF_CASE_HELPER(nv_bfloat162, MMF_ROWS_PER_BLOCK, ncols_dst) \
|
||||
extern DECL_MMF_CASE_HELPER(float, MMF_ROWS_PER_BLOCK_CDNA, ncols_dst) \
|
||||
extern DECL_MMF_CASE_HELPER(half2, MMF_ROWS_PER_BLOCK_CDNA, ncols_dst) \
|
||||
extern DECL_MMF_CASE_HELPER(nv_bfloat162, MMF_ROWS_PER_BLOCK_CDNA, ncols_dst)
|
||||
|
||||
#define DECL_MMF_CASE(ncols_dst) \
|
||||
DECL_MMF_CASE_HELPER(float, ncols_dst) \
|
||||
DECL_MMF_CASE_HELPER(half2, ncols_dst) \
|
||||
DECL_MMF_CASE_HELPER(nv_bfloat162, ncols_dst)
|
||||
DECL_MMF_CASE_HELPER(float, MMF_ROWS_PER_BLOCK, ncols_dst) \
|
||||
DECL_MMF_CASE_HELPER(half2, MMF_ROWS_PER_BLOCK, ncols_dst) \
|
||||
DECL_MMF_CASE_HELPER(nv_bfloat162, MMF_ROWS_PER_BLOCK, ncols_dst) \
|
||||
DECL_MMF_CASE_HELPER(float, MMF_ROWS_PER_BLOCK_CDNA, ncols_dst) \
|
||||
DECL_MMF_CASE_HELPER(half2, MMF_ROWS_PER_BLOCK_CDNA, ncols_dst) \
|
||||
DECL_MMF_CASE_HELPER(nv_bfloat162, MMF_ROWS_PER_BLOCK_CDNA, ncols_dst)
|
||||
|
||||
DECL_MMF_CASE_EXTERN(1);
|
||||
DECL_MMF_CASE_EXTERN(2);
|
||||
|
||||
@@ -5,6 +5,13 @@
|
||||
#include <cmath>
|
||||
#include <initializer_list>
|
||||
|
||||
// Kernel config struct - passed by value to CUDA kernel
|
||||
struct topk_moe_config {
|
||||
bool use_sigmoid;
|
||||
bool with_norm;
|
||||
bool delayed_softmax;
|
||||
};
|
||||
|
||||
// Warp-local softmax used for both the pre-top-k logits and the post-top-k delayed path.
|
||||
template <int experts_per_thread, bool use_limit>
|
||||
__device__ void softmax_warp_inplace(float (&vals)[experts_per_thread], const int limit, const int lane) {
|
||||
@@ -50,6 +57,16 @@ __device__ void softmax_warp_inplace(float (&vals)[experts_per_thread], const in
|
||||
}
|
||||
}
|
||||
|
||||
template <int experts_per_thread, bool use_limit>
|
||||
__device__ void sigmoid_warp_inplace(float (&vals)[experts_per_thread], const int limit, const int lane) {
|
||||
#pragma unroll
|
||||
for (int i = 0; i < experts_per_thread; i++) {
|
||||
const int idx = lane + i * WARP_SIZE;
|
||||
const bool active = !use_limit || (idx < limit);
|
||||
vals[i] = active ? 1.f / (1.f + expf(-vals[i])) : -INFINITY;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
This kernel does the following:
|
||||
1. optionally softmax over the logits per token [n_experts, n_tokens]
|
||||
@@ -59,13 +76,16 @@ __device__ void softmax_warp_inplace(float (&vals)[experts_per_thread], const in
|
||||
|
||||
It is intended as fusion of softmax->top-k->get_rows pipeline for MoE models
|
||||
*/
|
||||
template <int n_experts, bool with_norm, bool delayed_softmax = false>
|
||||
__launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float * logits,
|
||||
float * weights,
|
||||
int32_t * ids,
|
||||
const int n_rows,
|
||||
const int n_expert_used,
|
||||
const float clamp_val) {
|
||||
template <int n_experts, bool has_bias>
|
||||
__launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float * logits,
|
||||
float * weights,
|
||||
int32_t * ids,
|
||||
float * bias,
|
||||
const int n_rows,
|
||||
const int n_expert_used,
|
||||
const float clamp_val,
|
||||
const float scale_val,
|
||||
const topk_moe_config config) {
|
||||
const int row = blockIdx.x * blockDim.y + threadIdx.y;
|
||||
if (row >= n_rows) {
|
||||
return;
|
||||
@@ -79,14 +99,41 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float *
|
||||
|
||||
float wt[experts_per_thread];
|
||||
|
||||
// Initialize all slots to -INFINITY
|
||||
#pragma unroll
|
||||
for (int i = 0; i < experts_per_thread; i++) {
|
||||
wt[i] = -INFINITY;
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < n_experts; i += WARP_SIZE) {
|
||||
const int expert = i + threadIdx.x;
|
||||
wt[i / WARP_SIZE] = (n_experts % WARP_SIZE == 0 || expert < n_experts) ? logits[expert] : -INFINITY;
|
||||
}
|
||||
|
||||
if constexpr (!delayed_softmax) {
|
||||
softmax_warp_inplace<experts_per_thread, false>(wt, n_experts, threadIdx.x);
|
||||
if (!config.delayed_softmax) {
|
||||
if (config.use_sigmoid) {
|
||||
sigmoid_warp_inplace<experts_per_thread, false>(wt, n_experts, threadIdx.x);
|
||||
} else {
|
||||
softmax_warp_inplace<experts_per_thread, false>(wt, n_experts, threadIdx.x);
|
||||
}
|
||||
}
|
||||
|
||||
// selection_wt is only needed when bias is present (selection uses wt + bias)
|
||||
// when no bias, we use wt directly for both selection and weight values
|
||||
float selection_wt[has_bias ? experts_per_thread : 1];
|
||||
|
||||
if constexpr (has_bias) {
|
||||
#pragma unroll
|
||||
for (int i = 0; i < experts_per_thread; i++) {
|
||||
selection_wt[i] = -INFINITY;
|
||||
}
|
||||
#pragma unroll
|
||||
for (int i = 0; i < n_experts; i += WARP_SIZE) {
|
||||
const int expert = i + threadIdx.x;
|
||||
selection_wt[i / WARP_SIZE] =
|
||||
(n_experts % WARP_SIZE == 0 || expert < n_experts) ? wt[i / WARP_SIZE] + bias[expert] : -INFINITY;
|
||||
}
|
||||
}
|
||||
|
||||
//at this point, each thread holds either a portion of the softmax distribution
|
||||
@@ -106,22 +153,56 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float *
|
||||
float max_val = wt[0];
|
||||
int max_expert = threadIdx.x;
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 1; i < experts_per_thread; i++) {
|
||||
const int expert = threadIdx.x + i * WARP_SIZE;
|
||||
if ((n_experts % WARP_SIZE == 0 || expert < n_experts) && wt[i] > max_val) {
|
||||
max_val = wt[i];
|
||||
max_expert = expert;
|
||||
}
|
||||
}
|
||||
if constexpr (has_bias) {
|
||||
float max_val_s = selection_wt[0];
|
||||
|
||||
#pragma unroll
|
||||
for (int mask = WARP_SIZE / 2; mask > 0; mask /= 2) {
|
||||
const float val = __shfl_xor_sync(0xFFFFFFFF, max_val, mask, WARP_SIZE);
|
||||
const int expert = __shfl_xor_sync(0xFFFFFFFF, max_expert, mask, WARP_SIZE);
|
||||
if (val > max_val || (val == max_val && expert < max_expert)) {
|
||||
max_val = val;
|
||||
max_expert = expert;
|
||||
for (int i = 1; i < experts_per_thread; i++) {
|
||||
const int expert = threadIdx.x + i * WARP_SIZE;
|
||||
if ((n_experts % WARP_SIZE == 0 || expert < n_experts) && selection_wt[i] > max_val_s) {
|
||||
max_val = wt[i];
|
||||
max_val_s = selection_wt[i];
|
||||
max_expert = expert;
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int mask = WARP_SIZE / 2; mask > 0; mask /= 2) {
|
||||
const float val = __shfl_xor_sync(0xFFFFFFFF, max_val, mask, WARP_SIZE);
|
||||
const float val_s = __shfl_xor_sync(0xFFFFFFFF, max_val_s, mask, WARP_SIZE);
|
||||
const int expert = __shfl_xor_sync(0xFFFFFFFF, max_expert, mask, WARP_SIZE);
|
||||
if (val_s > max_val_s || (val_s == max_val_s && expert < max_expert)) {
|
||||
max_val = val;
|
||||
max_val_s = val_s;
|
||||
max_expert = expert;
|
||||
}
|
||||
}
|
||||
|
||||
if ((max_expert & (WARP_SIZE - 1)) == threadIdx.x) {
|
||||
selection_wt[max_expert / WARP_SIZE] = -INFINITY;
|
||||
}
|
||||
} else {
|
||||
#pragma unroll
|
||||
for (int i = 1; i < experts_per_thread; i++) {
|
||||
const int expert = threadIdx.x + i * WARP_SIZE;
|
||||
if ((n_experts % WARP_SIZE == 0 || expert < n_experts) && wt[i] > max_val) {
|
||||
max_val = wt[i];
|
||||
max_expert = expert;
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int mask = WARP_SIZE / 2; mask > 0; mask /= 2) {
|
||||
const float val = __shfl_xor_sync(0xFFFFFFFF, max_val, mask, WARP_SIZE);
|
||||
const int expert = __shfl_xor_sync(0xFFFFFFFF, max_expert, mask, WARP_SIZE);
|
||||
if (val > max_val || (val == max_val && expert < max_expert)) {
|
||||
max_val = val;
|
||||
max_expert = expert;
|
||||
}
|
||||
}
|
||||
|
||||
if ((max_expert & (WARP_SIZE - 1)) == threadIdx.x) {
|
||||
wt[max_expert / WARP_SIZE] = -INFINITY;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -130,16 +211,14 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float *
|
||||
}
|
||||
|
||||
if ((max_expert & (WARP_SIZE - 1)) == threadIdx.x) {
|
||||
wt[max_expert / WARP_SIZE] = -INFINITY;
|
||||
|
||||
ids[k] = max_expert;
|
||||
if constexpr (with_norm) {
|
||||
if (config.with_norm) {
|
||||
wt_sum += max_val;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if constexpr (with_norm) {
|
||||
if (config.with_norm) {
|
||||
wt_sum = warp_reduce_sum(wt_sum);
|
||||
wt_sum = max(wt_sum, clamp_val);
|
||||
const float inv_sum = 1.0f / wt_sum;
|
||||
@@ -149,7 +228,7 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float *
|
||||
}
|
||||
}
|
||||
|
||||
if constexpr (delayed_softmax) {
|
||||
if (config.delayed_softmax) {
|
||||
softmax_warp_inplace<experts_per_thread, true>(output_weights, n_expert_used, threadIdx.x);
|
||||
}
|
||||
|
||||
@@ -157,25 +236,25 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float *
|
||||
for (int i = 0; i < experts_per_thread; i++) {
|
||||
const int idx = i * WARP_SIZE + threadIdx.x;
|
||||
if (idx < n_expert_used) {
|
||||
weights[idx] = output_weights[i];
|
||||
weights[idx] = output_weights[i] * scale_val;
|
||||
}
|
||||
}
|
||||
|
||||
if (!with_norm) {
|
||||
GGML_UNUSED(clamp_val);
|
||||
}
|
||||
}
|
||||
|
||||
template <bool with_norm, bool delayed_softmax = false>
|
||||
template<bool has_bias>
|
||||
static void launch_topk_moe_cuda(ggml_backend_cuda_context & ctx,
|
||||
const float * logits,
|
||||
float * weights,
|
||||
int32_t * ids,
|
||||
float * bias,
|
||||
const int n_rows,
|
||||
const int n_expert,
|
||||
const int n_expert_used,
|
||||
const float clamp_val) {
|
||||
static_assert(!(with_norm && delayed_softmax), "delayed softmax is not supported with weight normalization");
|
||||
const float clamp_val,
|
||||
const float scale_val,
|
||||
const topk_moe_config config) {
|
||||
GGML_ASSERT(!(config.with_norm && config.delayed_softmax) &&
|
||||
"delayed softmax is not supported with weight normalization");
|
||||
const int rows_per_block = 4;
|
||||
dim3 grid_dims((n_rows + rows_per_block - 1) / rows_per_block, 1, 1);
|
||||
dim3 block_dims(WARP_SIZE, rows_per_block, 1);
|
||||
@@ -183,44 +262,48 @@ static void launch_topk_moe_cuda(ggml_backend_cuda_context & ctx,
|
||||
|
||||
switch (n_expert) {
|
||||
case 1:
|
||||
topk_moe_cuda<1, with_norm, delayed_softmax>
|
||||
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
|
||||
topk_moe_cuda<1, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
|
||||
clamp_val, scale_val, config);
|
||||
break;
|
||||
case 2:
|
||||
topk_moe_cuda<2, with_norm, delayed_softmax>
|
||||
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
|
||||
topk_moe_cuda<2, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
|
||||
clamp_val, scale_val, config);
|
||||
break;
|
||||
case 4:
|
||||
topk_moe_cuda<4, with_norm, delayed_softmax>
|
||||
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
|
||||
topk_moe_cuda<4, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
|
||||
clamp_val, scale_val, config);
|
||||
break;
|
||||
case 8:
|
||||
topk_moe_cuda<8, with_norm, delayed_softmax>
|
||||
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
|
||||
topk_moe_cuda<8, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
|
||||
clamp_val, scale_val, config);
|
||||
break;
|
||||
case 16:
|
||||
topk_moe_cuda<16, with_norm, delayed_softmax>
|
||||
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
|
||||
topk_moe_cuda<16, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
|
||||
clamp_val, scale_val, config);
|
||||
break;
|
||||
case 32:
|
||||
topk_moe_cuda<32, with_norm, delayed_softmax>
|
||||
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
|
||||
topk_moe_cuda<32, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
|
||||
clamp_val, scale_val, config);
|
||||
break;
|
||||
case 64:
|
||||
topk_moe_cuda<64, with_norm, delayed_softmax>
|
||||
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
|
||||
topk_moe_cuda<64, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
|
||||
clamp_val, scale_val, config);
|
||||
break;
|
||||
case 128:
|
||||
topk_moe_cuda<128, with_norm, delayed_softmax>
|
||||
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
|
||||
topk_moe_cuda<128, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
|
||||
clamp_val, scale_val, config);
|
||||
break;
|
||||
case 256:
|
||||
topk_moe_cuda<256, with_norm, delayed_softmax>
|
||||
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
|
||||
topk_moe_cuda<256, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
|
||||
clamp_val, scale_val, config);
|
||||
break;
|
||||
case 512:
|
||||
topk_moe_cuda<512, with_norm, delayed_softmax>
|
||||
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
|
||||
topk_moe_cuda<512, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
|
||||
clamp_val, scale_val, config);
|
||||
break;
|
||||
case 576:
|
||||
topk_moe_cuda<576, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
|
||||
clamp_val, scale_val, config);
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false && "fatal error");
|
||||
@@ -228,13 +311,14 @@ static void launch_topk_moe_cuda(ggml_backend_cuda_context & ctx,
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
|
||||
const ggml_tensor * logits,
|
||||
ggml_tensor * weights,
|
||||
ggml_tensor * ids,
|
||||
const bool with_norm,
|
||||
const bool delayed_softmax,
|
||||
ggml_tensor * clamp) {
|
||||
void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
|
||||
const ggml_tensor * logits,
|
||||
ggml_tensor * weights,
|
||||
ggml_tensor * ids,
|
||||
const ggml_tensor * clamp,
|
||||
const ggml_tensor * scale,
|
||||
const ggml_tensor * bias,
|
||||
const ggml_cuda_topk_moe_args & args) {
|
||||
GGML_ASSERT(logits->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(weights->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(ids->type == GGML_TYPE_I32);
|
||||
@@ -245,107 +329,75 @@ void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
|
||||
const float * logits_d = (const float *) logits->data;
|
||||
float * weights_d = (float *) weights->data;
|
||||
int32_t * ids_d = (int32_t *) ids->data;
|
||||
float * bias_d = bias ? (float *) bias->data : nullptr;
|
||||
|
||||
float scale_val = scale ? ggml_get_op_params_f32(scale, 0) : 1.0f;
|
||||
|
||||
GGML_ASSERT(ids->nb[1] / ggml_type_size(ids->type) == (size_t) n_experts);
|
||||
|
||||
const int n_expert_used = weights->ne[1];
|
||||
|
||||
const bool with_norm = clamp != nullptr;
|
||||
|
||||
float clamp_val = -INFINITY;
|
||||
if (with_norm) {
|
||||
if (clamp) {
|
||||
clamp_val = ggml_get_op_params_f32(clamp, 0);
|
||||
}
|
||||
launch_topk_moe_cuda<true>(ctx, logits_d, weights_d, ids_d, n_rows, n_experts, n_expert_used, clamp_val);
|
||||
if (clamp) {
|
||||
clamp_val = ggml_get_op_params_f32(clamp, 0);
|
||||
}
|
||||
|
||||
topk_moe_config config;
|
||||
config.use_sigmoid = args.sigmoid;
|
||||
config.with_norm = with_norm;
|
||||
config.delayed_softmax = args.delayed_softmax;
|
||||
|
||||
if (bias) {
|
||||
launch_topk_moe_cuda<true>(ctx, logits_d, weights_d, ids_d, bias_d, n_rows, n_experts, n_expert_used, clamp_val,
|
||||
scale_val, config);
|
||||
} else {
|
||||
GGML_ASSERT(clamp == nullptr);
|
||||
if (delayed_softmax) {
|
||||
launch_topk_moe_cuda<false, true>(ctx, logits_d, weights_d, ids_d, n_rows, n_experts, n_expert_used,
|
||||
clamp_val);
|
||||
} else {
|
||||
launch_topk_moe_cuda<false, false>(ctx, logits_d, weights_d, ids_d, n_rows, n_experts, n_expert_used,
|
||||
clamp_val);
|
||||
}
|
||||
launch_topk_moe_cuda<false>(ctx, logits_d, weights_d, ids_d, bias_d, n_rows, n_experts, n_expert_used, clamp_val,
|
||||
scale_val, config);
|
||||
}
|
||||
}
|
||||
|
||||
bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax,
|
||||
bool ggml_cuda_should_use_topk_moe(const ggml_tensor * gating_op,
|
||||
const ggml_tensor * weights,
|
||||
const ggml_tensor * get_rows,
|
||||
const ggml_tensor * argsort,
|
||||
const ggml_tensor * clamp,
|
||||
int n_expert) {
|
||||
ggml_tensor * probs = get_rows->src[0];
|
||||
if (probs->op != GGML_OP_RESHAPE) {
|
||||
return false;
|
||||
}
|
||||
probs = probs->src[0];
|
||||
ggml_tensor * selection_probs = argsort->src[0];
|
||||
|
||||
if (probs != selection_probs) {
|
||||
const ggml_tensor * logits,
|
||||
const ggml_tensor * ids) {
|
||||
const int n_expert = ids->nb[1] / ids->nb[0];
|
||||
if (((n_expert & (n_expert - 1)) != 0 || n_expert > 512) && n_expert != 576) {
|
||||
return false;
|
||||
}
|
||||
|
||||
float scale = 1.0f;
|
||||
float max_bias = 0.0f;
|
||||
|
||||
memcpy(&scale, (const float *) softmax->op_params + 0, sizeof(float));
|
||||
memcpy(&max_bias, (const float *) softmax->op_params + 1, sizeof(float));
|
||||
|
||||
if (!ggml_is_contiguous(softmax->src[0]) || !ggml_is_contiguous(weights)) {
|
||||
if (!ggml_is_contiguous(weights) || !ggml_is_contiguous(logits)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (scale != 1.0f || max_bias != 0.0f) {
|
||||
return false;
|
||||
}
|
||||
if (gating_op->op == GGML_OP_SOFT_MAX) {
|
||||
const ggml_tensor * softmax = gating_op;
|
||||
float scale = 1.0f;
|
||||
float max_bias = 0.0f;
|
||||
|
||||
// don't fuse when masks or sinks are present
|
||||
if (softmax->src[1] || softmax->src[2]) {
|
||||
return false;
|
||||
}
|
||||
memcpy(&scale, (const float *) softmax->op_params + 0, sizeof(float));
|
||||
memcpy(&max_bias, (const float *) softmax->op_params + 1, sizeof(float));
|
||||
|
||||
// n_expert must be a power of 2
|
||||
if ((n_expert & (n_expert - 1)) != 0 || n_expert > 512) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (clamp) {
|
||||
if (clamp->op != GGML_OP_CLAMP) {
|
||||
if (!ggml_is_contiguous(softmax->src[0])) {
|
||||
return false;
|
||||
}
|
||||
float max_val = ggml_get_op_params_f32(clamp, 1);
|
||||
|
||||
if (max_val != INFINITY) {
|
||||
if (scale != 1.0f || max_bias != 0.0f) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// don't fuse when masks or sinks are present
|
||||
if (softmax->src[1] || softmax->src[2]) {
|
||||
return false;
|
||||
}
|
||||
} else if (gating_op->op == GGML_OP_UNARY) {
|
||||
ggml_unary_op op = ggml_get_unary_op(gating_op);
|
||||
|
||||
if (op != GGML_UNARY_OP_SIGMOID) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
std::initializer_list<enum ggml_op> ggml_cuda_topk_moe_ops(bool norm, bool delayed_softmax) {
|
||||
static std::initializer_list<enum ggml_op> norm_ops = { GGML_OP_SOFT_MAX, GGML_OP_RESHAPE, GGML_OP_ARGSORT,
|
||||
GGML_OP_VIEW, GGML_OP_GET_ROWS, GGML_OP_RESHAPE,
|
||||
GGML_OP_SUM_ROWS, GGML_OP_CLAMP, GGML_OP_DIV,
|
||||
GGML_OP_RESHAPE };
|
||||
|
||||
static std::initializer_list<enum ggml_op> no_norm_ops = { GGML_OP_SOFT_MAX, GGML_OP_RESHAPE, GGML_OP_ARGSORT,
|
||||
GGML_OP_VIEW, GGML_OP_GET_ROWS };
|
||||
|
||||
static std::initializer_list<enum ggml_op> delayed_softmax_ops = { GGML_OP_ARGSORT, GGML_OP_VIEW,
|
||||
GGML_OP_GET_ROWS, GGML_OP_RESHAPE,
|
||||
GGML_OP_SOFT_MAX, GGML_OP_RESHAPE };
|
||||
|
||||
GGML_ASSERT(!norm || !delayed_softmax);
|
||||
|
||||
if (delayed_softmax) {
|
||||
return delayed_softmax_ops;
|
||||
}
|
||||
|
||||
if (norm) {
|
||||
return norm_ops;
|
||||
}
|
||||
|
||||
return no_norm_ops;
|
||||
}
|
||||
|
||||
@@ -3,19 +3,25 @@
|
||||
|
||||
#include <initializer_list>
|
||||
|
||||
void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
|
||||
const ggml_tensor * logits,
|
||||
ggml_tensor * weights,
|
||||
ggml_tensor * ids,
|
||||
const bool with_norm,
|
||||
const bool delayed_softmax = false,
|
||||
ggml_tensor * weight_clamp = nullptr);
|
||||
struct ggml_cuda_topk_moe_args {
|
||||
bool sigmoid{};
|
||||
bool softmax{};
|
||||
bool delayed_softmax{};
|
||||
bool prob_bias{};
|
||||
bool norm{};
|
||||
bool scale{};
|
||||
};
|
||||
|
||||
bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax,
|
||||
void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
|
||||
const ggml_tensor * logits,
|
||||
ggml_tensor * weights,
|
||||
ggml_tensor * ids,
|
||||
const ggml_tensor * clamp,
|
||||
const ggml_tensor * scale,
|
||||
const ggml_tensor * bias,
|
||||
const ggml_cuda_topk_moe_args & args);
|
||||
|
||||
bool ggml_cuda_should_use_topk_moe(const ggml_tensor * gating_op,
|
||||
const ggml_tensor * weights,
|
||||
const ggml_tensor * get_rows,
|
||||
const ggml_tensor * argsort,
|
||||
const ggml_tensor * clamp,
|
||||
int n_expert);
|
||||
|
||||
std::initializer_list<enum ggml_op> ggml_cuda_topk_moe_ops(bool with_norm, bool delayed_softmax = false);
|
||||
const ggml_tensor * logits,
|
||||
const ggml_tensor * ids);
|
||||
|
||||
@@ -1,7 +1,17 @@
|
||||
file(TO_CMAKE_PATH "${HEXAGON_SDK_ROOT}" HEXAGON_SDK_ROOT)
|
||||
file(TO_CMAKE_PATH "${HEXAGON_TOOLS_ROOT}" HEXAGON_TOOLS_ROOT)
|
||||
|
||||
if (NOT IS_DIRECTORY "${HEXAGON_SDK_ROOT}" OR NOT IS_DIRECTORY "${HEXAGON_TOOLS_ROOT}")
|
||||
message(FATAL_ERROR "Make sure HEXAGON_SDK_ROOT and HEXAGON_TOOLS_ROOT point to the correct Hexagon SDK installation.")
|
||||
endif()
|
||||
|
||||
message(STATUS "hexagon: using ${HEXAGON_SDK_ROOT} and ${HEXAGON_TOOLS_ROOT} for building libggml-htp skels")
|
||||
|
||||
include(${HEXAGON_SDK_ROOT}/build/cmake/hexagon_fun.cmake)
|
||||
include(ExternalProject)
|
||||
|
||||
option(GGML_HEXAGON_HTP_DEBUG "ggml-hexagon: enable HTP debug output" OFF)
|
||||
set(GGML_HEXAGON_HTP_CERT "$ENV{HEXAGON_HTP_CERT}" CACHE PATH "ggml-hexagon: enable HTP library signing using certificate")
|
||||
set(GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE 128 CACHE STRING "ggml-hexagon: quantize group size (32, 64, or 128)")
|
||||
|
||||
add_library(htp_iface OBJECT
|
||||
@@ -25,56 +35,71 @@ else()
|
||||
target_link_options(htp_iface PUBLIC -ldl)
|
||||
endif()
|
||||
|
||||
link_custom_library(htp_iface cdsprpc)
|
||||
link_custom_library(htp_iface rpcmem)
|
||||
|
||||
set(TARGET_NAME ggml-hexagon)
|
||||
ggml_add_backend_library(${TARGET_NAME}
|
||||
ggml-hexagon.cpp htp-utils.c htp-utils.h ../../include/ggml-hexagon.h)
|
||||
ggml-hexagon.cpp
|
||||
htp-drv.cpp
|
||||
htp-drv.h
|
||||
libdl.h
|
||||
../../include/ggml-hexagon.h)
|
||||
|
||||
target_link_libraries(${TARGET_NAME} PRIVATE htp_iface)
|
||||
target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/htp ${CMAKE_CURRENT_BINARY_DIR})
|
||||
|
||||
# Build HTP bits
|
||||
set(HTP_CMAKE_ARGS
|
||||
-DCMAKE_TOOLCHAIN_FILE=${CMAKE_CURRENT_SOURCE_DIR}/htp/cmake-toolchain.cmake
|
||||
-DCMAKE_BUILD_TYPE=Release
|
||||
-DCMAKE_INSTALL_LIBDIR=${CMAKE_CURRENT_BINARY_DIR}
|
||||
-DHEXAGON_SDK_ROOT=$ENV{HEXAGON_SDK_ROOT}
|
||||
-DHEXAGON_TOOLS_ROOT=$ENV{HEXAGON_TOOLS_ROOT}
|
||||
-DHEXAGON_HTP_DEBUG=${GGML_HEXAGON_HTP_DEBUG}
|
||||
-DGGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE=${GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE})
|
||||
# Build HTP skels
|
||||
set(HTP_SKELS)
|
||||
function(build_htp_skel V)
|
||||
ExternalProject_Add(htp-${V}
|
||||
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON
|
||||
BUILD_BYPRODUCTS ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-${V}.so
|
||||
CMAKE_ARGS
|
||||
-DCMAKE_BUILD_TYPE=Release
|
||||
-DCMAKE_TOOLCHAIN_FILE=${CMAKE_CURRENT_SOURCE_DIR}/htp/cmake-toolchain.cmake
|
||||
-DCMAKE_INSTALL_LIBDIR=${CMAKE_CURRENT_BINARY_DIR}
|
||||
-DHEXAGON_SDK_ROOT=${HEXAGON_SDK_ROOT}
|
||||
-DHEXAGON_TOOLS_ROOT=${HEXAGON_TOOLS_ROOT}
|
||||
-DHEXAGON_HTP_DEBUG=${GGML_HEXAGON_HTP_DEBUG}
|
||||
-DGGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE=${GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE}
|
||||
-DDSP_VERSION=${V}
|
||||
-DPREBUILT_LIB_DIR="toolv19_${V}")
|
||||
list(APPEND HTP_SKELS ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-${V}.so)
|
||||
set(HTP_SKELS ${HTP_SKELS} PARENT_SCOPE)
|
||||
endfunction()
|
||||
|
||||
ExternalProject_Add(htp-v68
|
||||
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON
|
||||
CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v68 -DPREBUILT_LIB_DIR="toolv19_v68")
|
||||
|
||||
ExternalProject_Add(htp-v69
|
||||
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON
|
||||
CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v69 -DPREBUILT_LIB_DIR="toolv19_v69")
|
||||
|
||||
ExternalProject_Add(htp-v73
|
||||
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON
|
||||
CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v73 -DPREBUILT_LIB_DIR="toolv19_v73")
|
||||
|
||||
ExternalProject_Add(htp-v75
|
||||
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON
|
||||
CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v75 -DPREBUILT_LIB_DIR="toolv19_v75")
|
||||
|
||||
ExternalProject_Add(htp-v79
|
||||
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON
|
||||
CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v79 -DPREBUILT_LIB_DIR="toolv19_v79")
|
||||
|
||||
ExternalProject_Add(htp-v81
|
||||
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON
|
||||
CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v81 -DPREBUILT_LIB_DIR="toolv19_v81")
|
||||
build_htp_skel(v68)
|
||||
build_htp_skel(v69)
|
||||
build_htp_skel(v73)
|
||||
build_htp_skel(v75)
|
||||
build_htp_skel(v79)
|
||||
build_htp_skel(v81)
|
||||
|
||||
# Install Hexagon skels required at runtime
|
||||
install(FILES
|
||||
${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v68.so
|
||||
${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v69.so
|
||||
${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v73.so
|
||||
${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v75.so
|
||||
${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v79.so
|
||||
${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v81.so
|
||||
TYPE LIB)
|
||||
install(FILES ${HTP_SKELS} TYPE LIB)
|
||||
|
||||
if (CMAKE_SYSTEM_NAME MATCHES Windows AND GGML_HEXAGON_HTP_CERT)
|
||||
file(TO_CMAKE_PATH "$ENV{WINDOWS_SDK_BIN}/arm64" WINSDK_BIN0_ARM64)
|
||||
file(TO_CMAKE_PATH "$ENV{WINDOWS_SDK_BIN}/x86" WINSDK_BIN0_X86)
|
||||
file(TO_CMAKE_PATH "$ENV{WindowsSdkVerBinPath}/arm64" WINSDK_BIN1_ARM64)
|
||||
file(TO_CMAKE_PATH "$ENV{WindowsSdkVerBinPath}/x86" WINSDK_BIN1_X86)
|
||||
|
||||
set(WINSDK_PATHS ${WINSDK_BIN0_ARM64} ${WINSDK_BIN0_X86} ${WINSDK_BIN1_ARM64} ${WINSDK_BIN1_X86})
|
||||
|
||||
find_program(INF2CAT NAMES inf2cat.exe PATHS ${WINSDK_PATHS} REQUIRED)
|
||||
find_program(SIGNTOOL NAMES signtool.exe PATHS ${WINSDK_PATHS} REQUIRED)
|
||||
|
||||
message(STATUS "hexagon: using ${GGML_HEXAGON_HTP_CERT} to sign libggml-htp skels")
|
||||
|
||||
set(LIBGGML_HTP_CAT ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp.cat)
|
||||
add_custom_target(libggml-htp-cat
|
||||
BYPRODUCTS ${LIBGGML_HTP_CAT}
|
||||
DEPENDS libggml-htp.inf ${HTP_SKELS}
|
||||
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/libggml-htp.inf ${CMAKE_CURRENT_BINARY_DIR}
|
||||
COMMAND ${INF2CAT} /driver:${CMAKE_CURRENT_BINARY_DIR} /os:10_25H2_ARM64
|
||||
COMMAND ${SIGNTOOL} sign /fd sha256 /f ${GGML_HEXAGON_HTP_CERT} ${LIBGGML_HTP_CAT}
|
||||
COMMENT "generating and signing libggml-htp.cat file"
|
||||
VERBATIM
|
||||
)
|
||||
|
||||
add_dependencies(${TARGET_NAME} libggml-htp-cat)
|
||||
install(FILES ${LIBGGML_HTP_CAT} TYPE LIB)
|
||||
endif()
|
||||
|
||||
@@ -14,9 +14,6 @@
|
||||
|
||||
#ifdef _WIN32
|
||||
# include <sal.h>
|
||||
# ifndef _WINDOWS
|
||||
# define _WINDOWS
|
||||
# endif
|
||||
#else
|
||||
# include <semaphore.h>
|
||||
# include <unistd.h>
|
||||
@@ -25,8 +22,6 @@
|
||||
#pragma clang diagnostic ignored "-Wnested-anon-types"
|
||||
#pragma clang diagnostic ignored "-Wgnu-anonymous-struct"
|
||||
|
||||
#include "htp-utils.h"
|
||||
|
||||
#include <AEEStdErr.h>
|
||||
#include <dspqueue.h>
|
||||
#include <rpcmem.h>
|
||||
@@ -40,6 +35,7 @@
|
||||
#include "op-desc.h"
|
||||
#include "htp-msg.h"
|
||||
#include "htp_iface.h"
|
||||
#include "htp-drv.h"
|
||||
|
||||
static size_t opt_ndev = 1;
|
||||
static size_t opt_nhvx = 0; // use all
|
||||
@@ -150,9 +146,9 @@ void ggml_hexagon_session::enqueue(struct htp_general_req &req, struct dspqueue_
|
||||
0, // flags - the framework will autoset this
|
||||
n_bufs, // number of buffers
|
||||
bufs, // buffer references
|
||||
sizeof(req),
|
||||
sizeof(req), // Message length
|
||||
(const uint8_t *) &req, // Message
|
||||
1000000 // Timeout
|
||||
DSPQUEUE_TIMEOUT // Timeout
|
||||
);
|
||||
|
||||
if (err != 0) {
|
||||
@@ -182,13 +178,13 @@ void ggml_hexagon_session::flush() {
|
||||
|
||||
// Read response packet from queue
|
||||
int err = dspqueue_read(q, &flags,
|
||||
HTP_MAX_PACKET_BUFFERS, // Maximum number of buffer references
|
||||
&n_bufs, // Number of buffer references
|
||||
bufs, // Buffer references
|
||||
sizeof(rsp), // Max message length
|
||||
&rsp_size, // Message length
|
||||
(uint8_t *) &rsp,
|
||||
1000000); // Timeout
|
||||
HTP_MAX_PACKET_BUFFERS, // Maximum number of buffer references
|
||||
&n_bufs, // Number of buffer references
|
||||
bufs, // Buffer references
|
||||
sizeof(rsp), // Max message length
|
||||
&rsp_size, // Message length
|
||||
(uint8_t *) &rsp, // Message
|
||||
DSPQUEUE_TIMEOUT); // Timeout
|
||||
|
||||
if (err == AEE_EEXPIRED) {
|
||||
// TODO: might need to bail out if the HTP is stuck on something
|
||||
@@ -269,13 +265,7 @@ struct ggml_backend_hexagon_buffer_context {
|
||||
ggml_backend_hexagon_buffer_context(ggml_hexagon_session * sess, size_t size, bool repack) {
|
||||
size += 4 * 1024; // extra page for padding
|
||||
|
||||
if (rpcmem_alloc2) {
|
||||
this->base = (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size);
|
||||
} else {
|
||||
GGML_LOG_INFO("ggml-hex: %s rpcmem_alloc2 not found, falling back to rpcmem_alloc\n", sess->name.c_str());
|
||||
this->base = (uint8_t *) rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size);
|
||||
}
|
||||
|
||||
this->base = (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size);
|
||||
if (!this->base) {
|
||||
GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer : size %zu\n", sess->name.c_str(), size);
|
||||
throw std::runtime_error("ggml-hex: rpcmem_alloc failed (see log for details)");
|
||||
@@ -2461,12 +2451,12 @@ static void ggml_backend_hexagon_free(ggml_backend_t backend) {
|
||||
}
|
||||
|
||||
static inline bool op_reuse_src1(const ggml_tensor * op1, const ggml_tensor * op0) {
|
||||
return (op0 && op0->src[1] == op1->src[1] && ggml_is_quantized(op0->src[0]->type) && ggml_is_quantized(op1->src[1]->type));
|
||||
return (op0 && op0->src[1] == op1->src[1] && ggml_is_quantized(op0->src[0]->type));
|
||||
}
|
||||
|
||||
static inline bool is_compute_op(ggml_tensor *node)
|
||||
{
|
||||
return !(ggml_op_is_empty(node->op) || ggml_is_empty(node));
|
||||
return !ggml_op_is_empty(node->op) && !ggml_is_empty(node) && (node->flags & GGML_TENSOR_FLAG_COMPUTE);
|
||||
}
|
||||
|
||||
// scan the graph and figure out last compute op index
|
||||
@@ -2488,7 +2478,7 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
|
||||
|
||||
const int last = last_compute_op(graph);
|
||||
|
||||
const struct ggml_tensor * prev_quant_op = nullptr; // prev executed op with quantizer
|
||||
const struct ggml_tensor * prev_op = nullptr; // prev executed op
|
||||
|
||||
for (int i = 0; i < graph->n_nodes; ++i) {
|
||||
ggml_tensor * node = graph->nodes[i];
|
||||
@@ -2497,17 +2487,15 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
|
||||
continue;
|
||||
}
|
||||
|
||||
if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
uint32_t flags = 0;
|
||||
|
||||
// skip quantizer if src1 is reused
|
||||
if (op_reuse_src1(node, prev_quant_op)) {
|
||||
if (op_reuse_src1(node, prev_op)) {
|
||||
flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
|
||||
}
|
||||
|
||||
prev_op = node;
|
||||
|
||||
// ask for early notification for the last Op
|
||||
if (i == last) {
|
||||
flags |= HTP_OPFLAGS_EARLY_WAKEUP;
|
||||
@@ -2520,7 +2508,6 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
|
||||
} else {
|
||||
ggml_hexagon_dispatch_op<init_binary_req<false>>(sess, node, flags);
|
||||
}
|
||||
prev_quant_op = node;
|
||||
break;
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
if (ggml_is_quantized(node->src[0]->type)) {
|
||||
@@ -2528,7 +2515,6 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
|
||||
} else {
|
||||
ggml_hexagon_dispatch_op<init_binary_id_req<false>>(sess, node, flags);
|
||||
}
|
||||
prev_quant_op = node;
|
||||
break;
|
||||
case GGML_OP_MUL:
|
||||
case GGML_OP_ADD:
|
||||
@@ -2670,7 +2656,7 @@ static std::vector<int> ggml_hexagon_graph_optimize_reorder(const std::vector<no
|
||||
}
|
||||
|
||||
// that many nodes forward to search for stackable nodes that can reuse VTCM
|
||||
constexpr int N_FORWARD = 8;
|
||||
constexpr int N_FORWARD = 16;
|
||||
|
||||
for (int i1 = i0 + 1; i1 < i0 + N_FORWARD && i1 < n; i1++) {
|
||||
if (used[i1]) {
|
||||
@@ -3056,10 +3042,12 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(__ANDROID__)
|
||||
if (opt_arch < 75) {
|
||||
opt_ndev = 1;
|
||||
GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for SoCs archs lower than v75.\n");
|
||||
}
|
||||
#endif
|
||||
|
||||
GGML_LOG_INFO("ggml-hex: Hexagon Arch version v%d\n", opt_arch);
|
||||
|
||||
@@ -3156,6 +3144,8 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) {
|
||||
opt_arch = strtoul(str_arch, NULL, 0);
|
||||
}
|
||||
|
||||
opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : 1;
|
||||
|
||||
reg->context = new ggml_hexagon_registry(reg);
|
||||
|
||||
HEX_VERBOSE("ggml-hex: size-of-general-req %zu size-of-general-rsp %zu\n", sizeof(struct htp_general_req),
|
||||
@@ -3180,6 +3170,11 @@ ggml_backend_reg_t ggml_backend_hexagon_reg(void) {
|
||||
static std::mutex mutex;
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
if (!initialized) {
|
||||
auto nErr = htpdrv_init();
|
||||
if (nErr != AEE_SUCCESS) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
ggml_hexagon_init(®);
|
||||
}
|
||||
|
||||
|
||||
418
ggml/src/ggml-hexagon/htp-drv.cpp
Normal file
418
ggml/src/ggml-hexagon/htp-drv.cpp
Normal file
@@ -0,0 +1,418 @@
|
||||
// sample drv interface
|
||||
|
||||
#pragma clang diagnostic ignored "-Wgnu-anonymous-struct"
|
||||
#pragma clang diagnostic ignored "-Wmissing-prototypes"
|
||||
#pragma clang diagnostic ignored "-Wsign-compare"
|
||||
|
||||
#include <filesystem>
|
||||
#include <set>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#ifdef _WIN32
|
||||
# define WIN32_LEAN_AND_MEAN
|
||||
# ifndef NOMINMAX
|
||||
# define NOMINMAX
|
||||
# endif
|
||||
# include <windows.h>
|
||||
# include <winevt.h>
|
||||
#else
|
||||
# include <dlfcn.h>
|
||||
# include <unistd.h>
|
||||
#endif
|
||||
#include "ggml-impl.h"
|
||||
#include "htp-drv.h"
|
||||
#include "libdl.h"
|
||||
|
||||
#include <domain.h>
|
||||
|
||||
//
|
||||
// Driver API types
|
||||
//
|
||||
|
||||
typedef void * (*rpcmem_alloc_pfn_t)(int heapid, uint32_t flags, int size);
|
||||
typedef void * (*rpcmem_alloc2_pfn_t)(int heapid, uint32_t flags, size_t size);
|
||||
typedef void (*rpcmem_free_pfn_t)(void * po);
|
||||
typedef int (*rpcmem_to_fd_pfn_t)(void * po);
|
||||
|
||||
typedef AEEResult (*dspqueue_create_pfn_t)(int domain,
|
||||
uint32_t flags,
|
||||
uint32_t req_queue_size,
|
||||
uint32_t resp_queue_size,
|
||||
dspqueue_callback_t packet_callback,
|
||||
dspqueue_callback_t error_callback,
|
||||
void * callback_context,
|
||||
dspqueue_t * queue);
|
||||
typedef AEEResult (*dspqueue_close_pfn_t)(dspqueue_t queue);
|
||||
typedef AEEResult (*dspqueue_export_pfn_t)(dspqueue_t queue, uint64_t *queue_id);
|
||||
typedef AEEResult (*dspqueue_write_pfn_t)(dspqueue_t queue, uint32_t flags,
|
||||
uint32_t num_buffers,
|
||||
struct dspqueue_buffer *buffers,
|
||||
uint32_t message_length,
|
||||
const uint8_t *message,
|
||||
uint32_t timeout_us);
|
||||
typedef AEEResult (*dspqueue_read_pfn_t)(dspqueue_t queue, uint32_t *flags,
|
||||
uint32_t max_buffers, uint32_t *num_buffers,
|
||||
struct dspqueue_buffer *buffers,
|
||||
uint32_t max_message_length,
|
||||
uint32_t *message_length, uint8_t *message,
|
||||
uint32_t timeout_us);
|
||||
|
||||
typedef int (*fastrpc_mmap_pfn_t)(int domain, int fd, void *addr, int offset, size_t length, enum fastrpc_map_flags flags);
|
||||
typedef int (*fastrpc_munmap_pfn_t)(int domain, int fd, void *addr, size_t length);
|
||||
|
||||
typedef int (*remote_handle64_open_pfn_t)(const char* name, remote_handle64 *ph);
|
||||
typedef int (*remote_handle64_invoke_pfn_t)(remote_handle64 h, uint32_t dwScalars, remote_arg *pra);
|
||||
typedef int (*remote_handle64_close_pfn_t)(remote_handle h);
|
||||
typedef int (*remote_handle_control_pfn_t)(uint32_t req, void* data, uint32_t datalen);
|
||||
typedef int (*remote_handle64_control_pfn_t)(remote_handle64 h, uint32_t req, void* data, uint32_t datalen);
|
||||
typedef int (*remote_session_control_pfn_t)(uint32_t req, void *data, uint32_t datalen);
|
||||
|
||||
//
|
||||
// Driver API pfns
|
||||
//
|
||||
|
||||
rpcmem_alloc_pfn_t rpcmem_alloc_pfn = nullptr;
|
||||
rpcmem_alloc2_pfn_t rpcmem_alloc2_pfn = nullptr;
|
||||
rpcmem_free_pfn_t rpcmem_free_pfn = nullptr;
|
||||
rpcmem_to_fd_pfn_t rpcmem_to_fd_pfn = nullptr;
|
||||
|
||||
fastrpc_mmap_pfn_t fastrpc_mmap_pfn = nullptr;
|
||||
fastrpc_munmap_pfn_t fastrpc_munmap_pfn = nullptr;
|
||||
|
||||
dspqueue_create_pfn_t dspqueue_create_pfn = nullptr;
|
||||
dspqueue_close_pfn_t dspqueue_close_pfn = nullptr;
|
||||
dspqueue_export_pfn_t dspqueue_export_pfn = nullptr;
|
||||
dspqueue_write_pfn_t dspqueue_write_pfn = nullptr;
|
||||
dspqueue_read_pfn_t dspqueue_read_pfn = nullptr;
|
||||
|
||||
remote_handle64_open_pfn_t remote_handle64_open_pfn = nullptr;
|
||||
remote_handle64_invoke_pfn_t remote_handle64_invoke_pfn = nullptr;
|
||||
remote_handle64_close_pfn_t remote_handle64_close_pfn = nullptr;
|
||||
remote_handle_control_pfn_t remote_handle_control_pfn = nullptr;
|
||||
remote_handle64_control_pfn_t remote_handle64_control_pfn = nullptr;
|
||||
remote_session_control_pfn_t remote_session_control_pfn = nullptr;
|
||||
|
||||
//
|
||||
// Driver API
|
||||
//
|
||||
|
||||
void * rpcmem_alloc(int heapid, uint32_t flags, int size) {
|
||||
return rpcmem_alloc_pfn(heapid, flags, size);
|
||||
}
|
||||
|
||||
void * rpcmem_alloc2(int heapid, uint32_t flags, size_t size) {
|
||||
if (rpcmem_alloc2_pfn) {
|
||||
return rpcmem_alloc2_pfn(heapid, flags, size);
|
||||
} else {
|
||||
GGML_LOG_INFO("ggml-hex: rpcmem_alloc2 not found, falling back to rpcmem_alloc\n");
|
||||
return rpcmem_alloc_pfn(heapid, flags, size);
|
||||
}
|
||||
}
|
||||
|
||||
void rpcmem_free(void * po) {
|
||||
return rpcmem_free_pfn(po);
|
||||
}
|
||||
|
||||
int rpcmem_to_fd(void * po) {
|
||||
return rpcmem_to_fd_pfn(po);
|
||||
}
|
||||
|
||||
HTPDRV_API int fastrpc_mmap(int domain, int fd, void * addr, int offset, size_t length, enum fastrpc_map_flags flags) {
|
||||
return fastrpc_mmap_pfn(domain, fd, addr, offset, length, flags);
|
||||
}
|
||||
|
||||
HTPDRV_API int fastrpc_munmap(int domain, int fd, void * addr, size_t length) {
|
||||
return fastrpc_munmap_pfn(domain, fd, addr, length);
|
||||
}
|
||||
|
||||
AEEResult dspqueue_create(int domain,
|
||||
uint32_t flags,
|
||||
uint32_t req_queue_size,
|
||||
uint32_t resp_queue_size,
|
||||
dspqueue_callback_t packet_callback,
|
||||
dspqueue_callback_t error_callback,
|
||||
void * callback_context,
|
||||
dspqueue_t * queue) {
|
||||
return dspqueue_create_pfn(domain, flags, req_queue_size, resp_queue_size, packet_callback, error_callback,
|
||||
callback_context, queue);
|
||||
}
|
||||
|
||||
AEEResult dspqueue_close(dspqueue_t queue) {
|
||||
return dspqueue_close_pfn(queue);
|
||||
}
|
||||
|
||||
AEEResult dspqueue_export(dspqueue_t queue, uint64_t * queue_id) {
|
||||
return dspqueue_export_pfn(queue, queue_id);
|
||||
}
|
||||
|
||||
AEEResult dspqueue_write(dspqueue_t queue,
|
||||
uint32_t flags,
|
||||
uint32_t num_buffers,
|
||||
struct dspqueue_buffer * buffers,
|
||||
uint32_t message_length,
|
||||
const uint8_t * message,
|
||||
uint32_t timeout_us) {
|
||||
return dspqueue_write_pfn(queue, flags, num_buffers, buffers, message_length, message, timeout_us);
|
||||
}
|
||||
|
||||
AEEResult dspqueue_read(dspqueue_t queue,
|
||||
uint32_t * flags,
|
||||
uint32_t max_buffers,
|
||||
uint32_t * num_buffers,
|
||||
struct dspqueue_buffer * buffers,
|
||||
uint32_t max_message_length,
|
||||
uint32_t * message_length,
|
||||
uint8_t * message,
|
||||
uint32_t timeout_us) {
|
||||
return dspqueue_read_pfn(queue, flags, max_buffers, num_buffers, buffers, max_message_length, message_length,
|
||||
message, timeout_us);
|
||||
}
|
||||
|
||||
HTPDRV_API int remote_handle64_open(const char * name, remote_handle64 * ph) {
|
||||
return remote_handle64_open_pfn(name, ph);
|
||||
}
|
||||
|
||||
HTPDRV_API int remote_handle64_invoke(remote_handle64 h, uint32_t dwScalars, remote_arg * pra) {
|
||||
return remote_handle64_invoke_pfn(h, dwScalars, pra);
|
||||
}
|
||||
|
||||
HTPDRV_API int remote_handle64_close(remote_handle64 h) {
|
||||
return remote_handle64_close_pfn(h);
|
||||
}
|
||||
|
||||
HTPDRV_API int remote_handle_control(uint32_t req, void * data, uint32_t datalen) {
|
||||
return remote_handle_control_pfn(req, data, datalen);
|
||||
}
|
||||
|
||||
HTPDRV_API int remote_handle64_control(remote_handle64 h, uint32_t req, void * data, uint32_t datalen) {
|
||||
return remote_handle64_control_pfn(h, req, data, datalen);
|
||||
}
|
||||
|
||||
HTPDRV_API int remote_session_control(uint32_t req, void * data, uint32_t datalen) {
|
||||
return remote_session_control_pfn(req, data, datalen);
|
||||
}
|
||||
|
||||
#ifdef _WIN32
|
||||
|
||||
static std::string wstr_to_str(std::wstring_view wstr) {
|
||||
std::string result;
|
||||
if (wstr.empty()) {
|
||||
return result;
|
||||
}
|
||||
auto bytes_needed = WideCharToMultiByte(CP_UTF8, WC_ERR_INVALID_CHARS,
|
||||
wstr.data(), (int) wstr.size(),
|
||||
nullptr, 0, nullptr, nullptr);
|
||||
if (bytes_needed == 0) {
|
||||
GGML_LOG_ERROR("ggml-hex: WideCharToMultiByte failed. Error %lu\n", GetLastError());
|
||||
throw std::runtime_error("Invalid wstring input");
|
||||
}
|
||||
|
||||
result.resize(bytes_needed, '\0');
|
||||
int bytes_written = WideCharToMultiByte(CP_UTF8, WC_ERR_INVALID_CHARS,
|
||||
wstr.data(), (int) wstr.size(),
|
||||
result.data(), bytes_needed,
|
||||
nullptr, nullptr);
|
||||
if (bytes_written == 0) {
|
||||
GGML_LOG_ERROR("ggml-hex: WideCharToMultiByte failed. Error %lu\n", GetLastError());
|
||||
throw std::runtime_error("Wstring conversion failed");
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static std::string get_driver_path() {
|
||||
std::wstring serviceName = L"qcnspmcdm";
|
||||
std::string result;
|
||||
|
||||
// Get a handle to the SCM database.
|
||||
SC_HANDLE schSCManager = OpenSCManagerW(NULL, NULL, STANDARD_RIGHTS_READ);
|
||||
if (nullptr == schSCManager) {
|
||||
GGML_LOG_ERROR("ggml-hex: Failed to open SCManager. Error: %lu\n", GetLastError());
|
||||
return result;
|
||||
}
|
||||
|
||||
// Get a handle to the service.
|
||||
SC_HANDLE schService = OpenServiceW(schSCManager, // SCM database
|
||||
serviceName.c_str(), // name of service
|
||||
SERVICE_QUERY_CONFIG); // need query config access
|
||||
|
||||
if (nullptr == schService) {
|
||||
GGML_LOG_ERROR("ggml-hex: Failed to open qcnspmcdm service. Error: %lu\n", GetLastError());
|
||||
CloseServiceHandle(schSCManager);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Store the size of buffer used as an output.
|
||||
DWORD bufferSize;
|
||||
if (!QueryServiceConfigW(schService, NULL, 0, &bufferSize) &&
|
||||
(GetLastError() != ERROR_INSUFFICIENT_BUFFER)) {
|
||||
GGML_LOG_ERROR("ggml-hex: Failed to query service config. Error: %lu\n", GetLastError());
|
||||
CloseServiceHandle(schService);
|
||||
CloseServiceHandle(schSCManager);
|
||||
return result;
|
||||
}
|
||||
// Get the configuration of the service.
|
||||
LPQUERY_SERVICE_CONFIGW serviceConfig =
|
||||
static_cast<LPQUERY_SERVICE_CONFIGW>(LocalAlloc(LMEM_FIXED, bufferSize));
|
||||
if (!QueryServiceConfigW(schService, serviceConfig, bufferSize, &bufferSize)) {
|
||||
fprintf(stderr, "ggml-hex: Failed to query service config. Error: %lu\n", GetLastError());
|
||||
LocalFree(serviceConfig);
|
||||
CloseServiceHandle(schService);
|
||||
CloseServiceHandle(schSCManager);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Read the driver file path get its parent directory
|
||||
std::wstring driverPath = std::wstring(serviceConfig->lpBinaryPathName);
|
||||
driverPath = driverPath.substr(0, driverPath.find_last_of(L"\\"));
|
||||
|
||||
// Clean up resources
|
||||
LocalFree(serviceConfig);
|
||||
CloseServiceHandle(schService);
|
||||
CloseServiceHandle(schSCManager);
|
||||
|
||||
// Driver path would contain invalid path string, like:
|
||||
// \SystemRoot\System32\DriverStore\FileRepository\qcadsprpc8280.inf_arm64_c2b9460c9a072f37
|
||||
// "\SystemRoot" should be replace with a correct one (e.g. C:\Windows)
|
||||
const std::wstring systemRootPlaceholder = L"\\SystemRoot";
|
||||
if (0 != driverPath.compare(0, systemRootPlaceholder.length(), systemRootPlaceholder)) {
|
||||
GGML_LOG_ERROR("ggml-hex: String pattern not found in driver path.\n");
|
||||
return result;
|
||||
}
|
||||
|
||||
// Replace \SystemRoot with an absolute path from system ENV windir
|
||||
const std::wstring systemRootEnv = L"windir";
|
||||
|
||||
// Query the number of wide charactors this variable requires
|
||||
DWORD numWords = GetEnvironmentVariableW(systemRootEnv.c_str(), NULL, 0);
|
||||
if (numWords == 0) {
|
||||
GGML_LOG_ERROR("ggml-hex: Failed get systemRoot environment variable\n");
|
||||
return result;
|
||||
}
|
||||
|
||||
// Query the actual system root name from environment variable
|
||||
std::vector<wchar_t> systemRoot(numWords + 1);
|
||||
numWords = GetEnvironmentVariableW(systemRootEnv.c_str(), systemRoot.data(), numWords + 1);
|
||||
if (numWords == 0) {
|
||||
GGML_LOG_ERROR("ggml-hex: Failed to read windir environment variable\n");
|
||||
return result;
|
||||
}
|
||||
driverPath.replace(0, systemRootPlaceholder.length(), std::wstring(systemRoot.data()));
|
||||
|
||||
return wstr_to_str(driverPath);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
using dl_handle_ptr = std::unique_ptr<dl_handle, dl_handle_deleter>;
|
||||
|
||||
int htpdrv_init() {
|
||||
static dl_handle_ptr lib_cdsp_rpc_handle = nullptr;
|
||||
static bool initialized = false;
|
||||
#ifdef _WIN32
|
||||
std::string drv_path = get_driver_path() + "\\" + "libcdsprpc.dll";
|
||||
#else
|
||||
std::string drv_path = "libcdsprpc.so";
|
||||
#endif
|
||||
if (initialized) {
|
||||
GGML_LOG_INFO("ggml-hex: Driver already loaded\n");
|
||||
return AEE_SUCCESS;
|
||||
}
|
||||
GGML_LOG_INFO("ggml-hex: Loading driver %s\n", drv_path.c_str());
|
||||
|
||||
fs::path path{ drv_path.c_str() };
|
||||
dl_handle_ptr handle { dl_load_library(path) };
|
||||
if (!handle) {
|
||||
GGML_LOG_ERROR("ggml-hex: failed to load %s: %s\n", path.u8string().c_str(), dl_error());
|
||||
return AEE_EUNABLETOLOAD;
|
||||
}
|
||||
|
||||
#define dlsym(drv, type, pfn, symbol, ignore) \
|
||||
do { \
|
||||
pfn = (type) dl_get_sym(drv, #symbol); \
|
||||
if (!ignore && nullptr == pfn) { \
|
||||
GGML_LOG_ERROR("ggml-hex: failed to dlsym %s\n", #symbol); \
|
||||
return AEE_EUNABLETOLOAD; \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
dlsym(handle.get(), rpcmem_alloc_pfn_t, rpcmem_alloc_pfn, rpcmem_alloc, false);
|
||||
dlsym(handle.get(), rpcmem_alloc2_pfn_t, rpcmem_alloc2_pfn, rpcmem_alloc2, true);
|
||||
dlsym(handle.get(), rpcmem_free_pfn_t, rpcmem_free_pfn, rpcmem_free, false);
|
||||
dlsym(handle.get(), rpcmem_to_fd_pfn_t, rpcmem_to_fd_pfn, rpcmem_to_fd, false);
|
||||
dlsym(handle.get(), fastrpc_mmap_pfn_t, fastrpc_mmap_pfn, fastrpc_mmap, false);
|
||||
dlsym(handle.get(), fastrpc_munmap_pfn_t, fastrpc_munmap_pfn, fastrpc_munmap, false);
|
||||
dlsym(handle.get(), dspqueue_create_pfn_t, dspqueue_create_pfn, dspqueue_create, false);
|
||||
dlsym(handle.get(), dspqueue_close_pfn_t, dspqueue_close_pfn, dspqueue_close, false);
|
||||
dlsym(handle.get(), dspqueue_export_pfn_t, dspqueue_export_pfn, dspqueue_export, false);
|
||||
dlsym(handle.get(), dspqueue_write_pfn_t, dspqueue_write_pfn, dspqueue_write, false);
|
||||
dlsym(handle.get(), dspqueue_read_pfn_t, dspqueue_read_pfn, dspqueue_read, false);
|
||||
dlsym(handle.get(), remote_handle64_open_pfn_t, remote_handle64_open_pfn, remote_handle64_open, false);
|
||||
dlsym(handle.get(), remote_handle64_invoke_pfn_t, remote_handle64_invoke_pfn, remote_handle64_invoke, false);
|
||||
dlsym(handle.get(), remote_handle_control_pfn_t, remote_handle_control_pfn, remote_handle_control, false);
|
||||
dlsym(handle.get(), remote_handle64_control_pfn_t, remote_handle64_control_pfn, remote_handle64_control, false);
|
||||
dlsym(handle.get(), remote_session_control_pfn_t, remote_session_control_pfn, remote_session_control, false);
|
||||
dlsym(handle.get(), remote_handle64_close_pfn_t, remote_handle64_close_pfn, remote_handle64_close, false);
|
||||
|
||||
lib_cdsp_rpc_handle = std::move(handle);
|
||||
initialized = true;
|
||||
|
||||
return AEE_SUCCESS;
|
||||
}
|
||||
|
||||
domain * get_domain(int domain_id) {
|
||||
int i = 0;
|
||||
int size = sizeof(supported_domains) / sizeof(domain);
|
||||
|
||||
for (i = 0; i < size; i++) {
|
||||
if (supported_domains[i].id == domain_id) {
|
||||
return &supported_domains[i];
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
int get_hex_arch_ver(int domain, int * arch) {
|
||||
if (!remote_handle_control_pfn) {
|
||||
GGML_LOG_ERROR("ggml-hex: remote_handle_control is not supported on this device\n");
|
||||
return AEE_EUNSUPPORTEDAPI;
|
||||
}
|
||||
|
||||
struct remote_dsp_capability arch_ver;
|
||||
arch_ver.domain = (uint32_t) domain;
|
||||
arch_ver.attribute_ID = ARCH_VER;
|
||||
arch_ver.capability = (uint32_t) 0;
|
||||
|
||||
int err = remote_handle_control(DSPRPC_GET_DSP_INFO, &arch_ver, sizeof(arch_ver));
|
||||
if ((err & 0xff) == (AEE_EUNSUPPORTEDAPI & 0xff)) {
|
||||
GGML_LOG_ERROR("ggml-hex: FastRPC capability API is not supported on this device\n");
|
||||
return AEE_EUNSUPPORTEDAPI;
|
||||
}
|
||||
|
||||
if (err != AEE_SUCCESS) {
|
||||
GGML_LOG_ERROR("ggml-hex: FastRPC capability query failed (err %d)\n", err);
|
||||
return err;
|
||||
}
|
||||
|
||||
switch (arch_ver.capability & 0xff) {
|
||||
case 0x68:
|
||||
*arch = 68;
|
||||
return 0;
|
||||
case 0x69:
|
||||
*arch = 69;
|
||||
return 0;
|
||||
case 0x73:
|
||||
*arch = 73;
|
||||
return 0;
|
||||
case 0x75:
|
||||
*arch = 75;
|
||||
return 0;
|
||||
case 0x79:
|
||||
*arch = 79;
|
||||
return 0;
|
||||
case 0x81:
|
||||
*arch = 81;
|
||||
return 0;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
121
ggml/src/ggml-hexagon/htp-drv.h
Normal file
121
ggml/src/ggml-hexagon/htp-drv.h
Normal file
@@ -0,0 +1,121 @@
|
||||
#pragma once
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#ifdef _WIN32
|
||||
# pragma clang diagnostic ignored "-Wignored-attributes"
|
||||
#endif
|
||||
|
||||
#include <AEEStdErr.h>
|
||||
#include <rpcmem.h>
|
||||
#include <remote.h>
|
||||
#include <dspqueue.h>
|
||||
|
||||
#if defined(_WIN32) && !defined(__MINGW32__)
|
||||
# ifdef GGML_BACKEND_BUILD
|
||||
# define HTPDRV_API __declspec(dllexport) extern
|
||||
# else
|
||||
# define HTPDRV_API __declspec(dllimport) extern
|
||||
# endif
|
||||
#else
|
||||
# define HTPDRV_API __attribute__ ((visibility ("default"))) extern
|
||||
#endif
|
||||
|
||||
/* Offset to differentiate HLOS and Hexagon error codes.
|
||||
Stores the value of AEE_EOFFSET for Hexagon. */
|
||||
#ifndef DSP_OFFSET
|
||||
# define DSP_OFFSET 0x80000400
|
||||
#endif
|
||||
|
||||
/* Errno for connection reset by peer. */
|
||||
#ifndef ECONNRESET
|
||||
# ifdef __hexagon__
|
||||
# define ECONNRESET 104
|
||||
# endif
|
||||
#endif
|
||||
|
||||
/* Abstraction of different OS specific sleep APIs.
|
||||
SLEEP accepts input in seconds. */
|
||||
#ifndef SLEEP
|
||||
# ifdef __hexagon__
|
||||
# define SLEEP(x) \
|
||||
{ /* Do nothing for simulator. */ \
|
||||
}
|
||||
# else
|
||||
# ifdef _WIN32
|
||||
# define SLEEP(x) Sleep(1000 * x) /* Sleep accepts input in milliseconds. */
|
||||
# else
|
||||
# define SLEEP(x) sleep(x) /* sleep accepts input in seconds. */
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
|
||||
/* Include windows specific header files. */
|
||||
#ifdef _WIN32
|
||||
# include <windows.h>
|
||||
# include <sysinfoapi.h>
|
||||
# define _CRT_SECURE_NO_WARNINGS 1
|
||||
# define _WINSOCK_DEPRECATED_NO_WARNINGS 1
|
||||
#endif
|
||||
|
||||
/* Includes and defines for all HLOS except windows */
|
||||
#if !defined(__hexagon__) && !defined(_WIN32)
|
||||
# include "unistd.h"
|
||||
|
||||
# include <sys/time.h>
|
||||
#endif
|
||||
|
||||
/* Includes and defines for Hexagon and all HLOS except Windows. */
|
||||
#if !defined(_WIN32)
|
||||
/* Weak reference to remote symbol for compilation. */
|
||||
# pragma weak remote_session_control
|
||||
# pragma weak remote_handle_control
|
||||
# pragma weak remote_handle64_control
|
||||
# pragma weak fastrpc_mmap
|
||||
# pragma weak fastrpc_munmap
|
||||
# pragma weak rpcmem_alloc2
|
||||
#endif
|
||||
|
||||
#if !defined(_WIN32)
|
||||
# pragma weak remote_system_request
|
||||
#endif
|
||||
|
||||
#ifdef _WIN32
|
||||
# define DSPQUEUE_TIMEOUT DSPQUEUE_TIMEOUT_NONE
|
||||
#else
|
||||
# define DSPQUEUE_TIMEOUT 1000000
|
||||
#endif
|
||||
|
||||
/**
|
||||
* htpdrv_init API: driver interface entry point
|
||||
*
|
||||
* @return Return AEE error codes as defined in Hexagon SDK.
|
||||
*/
|
||||
HTPDRV_API int htpdrv_init(void);
|
||||
|
||||
/**
|
||||
* get_domain API: get domain struct from domain value.
|
||||
*
|
||||
* @param[in] domain value of a domain
|
||||
* @return Returns domain struct of the domain if it is supported or else
|
||||
* returns NULL.
|
||||
*
|
||||
*/
|
||||
HTPDRV_API domain * get_domain(int domain_id);
|
||||
|
||||
/**
|
||||
* get_hex_arch_ver API: query the Hexagon processor architecture version information
|
||||
*
|
||||
* @param[in] domain_id value of a domain
|
||||
* @param[out] Arch version (73, 75, ...)
|
||||
* @return 0 if query is successful.
|
||||
* non-zero if error, return value points to the error.
|
||||
*
|
||||
*/
|
||||
HTPDRV_API int get_hex_arch_ver(int domain, int * arch);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
@@ -1,454 +0,0 @@
|
||||
|
||||
#pragma clang diagnostic ignored "-Wgnu-anonymous-struct"
|
||||
#pragma clang diagnostic ignored "-Wmissing-prototypes"
|
||||
#pragma clang diagnostic ignored "-Wsign-compare"
|
||||
|
||||
#define GGML_COMMON_IMPL_C
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-common.h"
|
||||
#include "ggml-hexagon.h"
|
||||
#include "ggml-impl.h"
|
||||
|
||||
#include "htp-utils.h"
|
||||
|
||||
#include <domain.h>
|
||||
#include <remote.h>
|
||||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
domain * get_domain(int domain_id) {
|
||||
int i = 0;
|
||||
int size = sizeof(supported_domains) / sizeof(domain);
|
||||
|
||||
for (i = 0; i < size; i++) {
|
||||
if (supported_domains[i].id == domain_id) {
|
||||
return &supported_domains[i];
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
bool is_valid_domain_id(int domain_id, int compute_only) {
|
||||
int i = 0;
|
||||
int size = sizeof(supported_domains) / sizeof(domain);
|
||||
|
||||
if (compute_only) {
|
||||
return is_CDSP(domain_id);
|
||||
}
|
||||
|
||||
for (i = 0; i < size; i++) {
|
||||
if (supported_domains[i].id == domain_id) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
int get_domains_info(char * domain_type, int * num_domains, fastrpc_domain ** domains_info) {
|
||||
int nErr = AEE_SUCCESS;
|
||||
int ss_info = 0;
|
||||
if (domain_type != NULL) {
|
||||
if (strcmp(domain_type, "LPASS") == 0) {
|
||||
ss_info = FASTRPC_LPASS;
|
||||
} else if (strcmp(domain_type, "HPASS") == 0) {
|
||||
ss_info = FASTRPC_HPASS;
|
||||
} else {
|
||||
ss_info = FASTRPC_NSP;
|
||||
}
|
||||
}
|
||||
system_req_payload req = { 0 };
|
||||
req.id = FASTRPC_GET_DOMAINS;
|
||||
req.sys.domains = NULL;
|
||||
fastrpc_domain * domain = NULL;
|
||||
if (ss_info != 0) {
|
||||
req.sys.flags = DOMAINS_LIST_FLAGS_SET_TYPE(req.sys.flags, ss_info);
|
||||
} else {
|
||||
req.sys.flags = 0;
|
||||
}
|
||||
#ifdef _WIN32
|
||||
nErr = AEE_EUNSUPPORTED;
|
||||
goto bail;
|
||||
#endif
|
||||
if (remote_system_request) {
|
||||
nErr = remote_system_request(&req);
|
||||
if (nErr != AEE_SUCCESS) {
|
||||
GGML_LOG_ERROR("Failure in remote_system_request call: %d.\n", nErr);
|
||||
goto bail;
|
||||
}
|
||||
// Allocate memory for domain-info array
|
||||
req.sys.max_domains = req.sys.num_domains;
|
||||
if ((req.sys.domains = calloc(req.sys.num_domains, sizeof(fastrpc_domain))) == NULL) {
|
||||
nErr = AEE_ENOMEMORY;
|
||||
GGML_LOG_ERROR("Unable to allocate memory for req.sys.domains");
|
||||
goto bail;
|
||||
}
|
||||
|
||||
nErr = remote_system_request(&req);
|
||||
if (nErr != AEE_SUCCESS) {
|
||||
GGML_LOG_ERROR("Failure in remote_system_request call: %d.\n", nErr);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
for (int i = 0; i < req.sys.num_domains; i++) {
|
||||
// Verify that only requested type domains were returned
|
||||
domain = &req.sys.domains[i];
|
||||
if (domain->type != ss_info && domain_type != NULL) {
|
||||
nErr = -1;
|
||||
GGML_LOG_ERROR("Incorrect data received from remote_system_request.\n");
|
||||
goto bail;
|
||||
}
|
||||
}
|
||||
*domains_info = req.sys.domains;
|
||||
*num_domains = req.sys.num_domains;
|
||||
} else {
|
||||
nErr = AEE_EUNSUPPORTED;
|
||||
goto bail;
|
||||
}
|
||||
bail:
|
||||
if (nErr && !req.sys.domains) {
|
||||
free(req.sys.domains);
|
||||
}
|
||||
return nErr;
|
||||
}
|
||||
|
||||
int get_effective_domain_id(char * domain_name, int session_id, int * effec_domain_id) {
|
||||
int err = 0;
|
||||
remote_rpc_effective_domain_id_t sess = { 0 };
|
||||
|
||||
sess.domain_name = domain_name;
|
||||
sess.domain_name_len = strlen(domain_name);
|
||||
sess.session_id = session_id;
|
||||
|
||||
err = remote_session_control(FASTRPC_GET_EFFECTIVE_DOMAIN_ID, &sess, sizeof(sess));
|
||||
if (err) {
|
||||
GGML_LOG_ERROR("Error 0x%x: failed to get effective domain id for %s, session id %d\n", err, sess.domain_name,
|
||||
session_id);
|
||||
return err;
|
||||
}
|
||||
|
||||
*effec_domain_id = sess.effective_domain_id;
|
||||
return err;
|
||||
}
|
||||
|
||||
int get_dsp_support(int * domain) {
|
||||
int nErr = AEE_SUCCESS;
|
||||
*domain = CDSP_DOMAIN_ID; // DSP domain default value is CDSP_DOMAIN_ID
|
||||
|
||||
if (remote_handle_control) {
|
||||
struct remote_dsp_capability dsp_capability_domain = { CDSP_DOMAIN_ID, DOMAIN_SUPPORT, 0 };
|
||||
nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain, sizeof(struct remote_dsp_capability));
|
||||
if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
|
||||
GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n");
|
||||
goto bail;
|
||||
}
|
||||
|
||||
if (dsp_capability_domain.capability == 0) {
|
||||
dsp_capability_domain.domain = ADSP_DOMAIN_ID; // Check for ADSP support.
|
||||
dsp_capability_domain.attribute_ID = DOMAIN_SUPPORT;
|
||||
dsp_capability_domain.capability = 0;
|
||||
nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain,
|
||||
sizeof(struct remote_dsp_capability));
|
||||
if (dsp_capability_domain.capability) {
|
||||
*domain = ADSP_DOMAIN_ID; // For targets like Agatti (not having cDSP), domain is ADSP_DOMAIN_ID
|
||||
}
|
||||
}
|
||||
|
||||
if (nErr != AEE_SUCCESS) {
|
||||
GGML_LOG_ERROR("\nget_dsp_support failed with Error 0x%x\n", nErr);
|
||||
goto bail;
|
||||
}
|
||||
} else {
|
||||
nErr = AEE_EUNSUPPORTEDAPI;
|
||||
GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n");
|
||||
}
|
||||
|
||||
bail:
|
||||
return nErr;
|
||||
}
|
||||
|
||||
int get_vtcm_info(int domain, uint32_t * capability, uint32_t attr) {
|
||||
int nErr = AEE_SUCCESS;
|
||||
*capability = 0;
|
||||
|
||||
if (attr == VTCM_PAGE || attr == VTCM_COUNT) {
|
||||
} else {
|
||||
nErr = AEE_EBADPARM;
|
||||
GGML_LOG_ERROR("Unsupported attr. Only VTCM_PAGE and VTCM_COUNT supported\n");
|
||||
goto bail;
|
||||
}
|
||||
if (remote_handle_control) {
|
||||
if (domain == ADSP_DOMAIN_ID || domain == CDSP_DOMAIN_ID) {
|
||||
/*
|
||||
* Query the DSP for VTCM information
|
||||
* Since the ADSP does not have a dedicated VTCM, we expect the output to be 0
|
||||
*/
|
||||
struct remote_dsp_capability dsp_capability_vtcm_dsp;
|
||||
dsp_capability_vtcm_dsp.domain = (uint32_t) domain;
|
||||
dsp_capability_vtcm_dsp.attribute_ID = attr;
|
||||
dsp_capability_vtcm_dsp.capability = (uint32_t) 0;
|
||||
nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_vtcm_dsp,
|
||||
sizeof(struct remote_dsp_capability));
|
||||
if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
|
||||
GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n");
|
||||
GGML_LOG_ERROR("Running the usecase without checking the capability\n");
|
||||
nErr = AEE_SUCCESS;
|
||||
goto bail;
|
||||
} else if (nErr == AEE_SUCCESS) {
|
||||
*capability = dsp_capability_vtcm_dsp.capability;
|
||||
} else {
|
||||
GGML_LOG_ERROR("\nget_vtcm_info failed with Error 0x%x\n", nErr);
|
||||
goto bail;
|
||||
}
|
||||
} else {
|
||||
nErr = AEE_EUNSUPPORTED;
|
||||
GGML_LOG_ERROR("Unsupported domain %d\n", domain);
|
||||
goto bail;
|
||||
}
|
||||
} else {
|
||||
nErr = AEE_EUNSUPPORTEDAPI;
|
||||
GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n");
|
||||
}
|
||||
|
||||
bail:
|
||||
return nErr;
|
||||
}
|
||||
|
||||
bool is_unsignedpd_supported(int domain_id) {
|
||||
int nErr = AEE_SUCCESS;
|
||||
if (remote_handle_control) {
|
||||
struct remote_dsp_capability dsp_capability_domain = { domain_id, UNSIGNED_PD_SUPPORT, 0 };
|
||||
nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain, sizeof(struct remote_dsp_capability));
|
||||
if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
|
||||
GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device. Falling back to signed pd.\n");
|
||||
return false;
|
||||
}
|
||||
if (nErr) {
|
||||
GGML_LOG_ERROR("\nERROR 0x%x: FastRPC Capability API failed. Falling back to signed pd.", nErr);
|
||||
return false;
|
||||
}
|
||||
if (dsp_capability_domain.capability == 1) {
|
||||
return true;
|
||||
}
|
||||
} else {
|
||||
nErr = AEE_EUNSUPPORTEDAPI;
|
||||
GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device. Falling back to signed pd.\n");
|
||||
return false;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool get_unsignedpd_support(void) {
|
||||
return is_unsignedpd_supported(CDSP_DOMAIN_ID);
|
||||
}
|
||||
|
||||
bool is_async_fastrpc_supported(int domain) {
|
||||
int nErr = AEE_SUCCESS;
|
||||
if (remote_handle_control) {
|
||||
if (domain == CDSP_DOMAIN_ID) {
|
||||
/*
|
||||
* Query the DSP for ASYNC_FASTRPC_SUPPORT information
|
||||
* Async fastrpc is supported only on CDSP
|
||||
*/
|
||||
struct remote_dsp_capability dsp_capability_async_support;
|
||||
dsp_capability_async_support.domain = (uint32_t) domain;
|
||||
dsp_capability_async_support.attribute_ID = ASYNC_FASTRPC_SUPPORT;
|
||||
dsp_capability_async_support.capability = (uint32_t) 0;
|
||||
nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_async_support,
|
||||
sizeof(struct remote_dsp_capability));
|
||||
if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
|
||||
GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n");
|
||||
GGML_LOG_ERROR("Running the usecase without checking the capability\n");
|
||||
nErr = AEE_SUCCESS;
|
||||
goto bail;
|
||||
} else if (dsp_capability_async_support.capability == 1) {
|
||||
return true;
|
||||
}
|
||||
if (nErr != AEE_SUCCESS) {
|
||||
GGML_LOG_ERROR("\nis_async_fastrpc_supported failed with Error 0x%x\n", nErr);
|
||||
goto bail;
|
||||
}
|
||||
} else {
|
||||
nErr = AEE_EUNSUPPORTED;
|
||||
GGML_LOG_ERROR("Async fastrpc is not supported on domain %d\n", domain);
|
||||
goto bail;
|
||||
}
|
||||
} else {
|
||||
nErr = AEE_EUNSUPPORTEDAPI;
|
||||
GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n");
|
||||
}
|
||||
|
||||
bail:
|
||||
return false;
|
||||
}
|
||||
|
||||
bool is_status_notification_supported(int domain) {
|
||||
int nErr = AEE_SUCCESS;
|
||||
|
||||
if (remote_handle_control) {
|
||||
/*
|
||||
* Query the DSP for STATUS_NOTIFICATION_SUPPORT information
|
||||
* DSP User PD status notification Support
|
||||
*/
|
||||
struct remote_dsp_capability dsp_capability_status_notification_support;
|
||||
dsp_capability_status_notification_support.domain = (uint32_t) domain;
|
||||
dsp_capability_status_notification_support.attribute_ID = STATUS_NOTIFICATION_SUPPORT;
|
||||
dsp_capability_status_notification_support.capability = (uint32_t) 0;
|
||||
nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_status_notification_support,
|
||||
sizeof(struct remote_dsp_capability));
|
||||
if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
|
||||
GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n");
|
||||
GGML_LOG_ERROR("Running the usecase without checking the capability\n");
|
||||
nErr = AEE_SUCCESS;
|
||||
goto bail;
|
||||
} else if (dsp_capability_status_notification_support.capability == 1) {
|
||||
return true;
|
||||
}
|
||||
if (nErr != AEE_SUCCESS) {
|
||||
GGML_LOG_ERROR("\nis_status_notification_supported failed with Error 0x%x\n", nErr);
|
||||
goto bail;
|
||||
}
|
||||
} else {
|
||||
nErr = AEE_EUNSUPPORTEDAPI;
|
||||
GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n");
|
||||
}
|
||||
|
||||
bail:
|
||||
return false;
|
||||
}
|
||||
|
||||
int get_hmx_support_info(int domain, uint32_t * capability, uint32_t attr) {
|
||||
int nErr = AEE_SUCCESS;
|
||||
*capability = 0;
|
||||
|
||||
if (attr != HMX_SUPPORT_SPATIAL && attr != HMX_SUPPORT_DEPTH) {
|
||||
nErr = AEE_EBADPARM;
|
||||
GGML_LOG_ERROR("Unsupported attr. Only HMX_SUPPORT_SPATIAL and HMX_SUPPORT_DEPTH supported\n");
|
||||
goto bail;
|
||||
}
|
||||
if (remote_handle_control) {
|
||||
if (domain == CDSP_DOMAIN_ID) {
|
||||
/*
|
||||
* Query the DSP for HMX SUPPORT information
|
||||
* HMX is supported on CDSP only
|
||||
*/
|
||||
struct remote_dsp_capability dsp_capability_hmx_dsp;
|
||||
dsp_capability_hmx_dsp.domain = (uint32_t) domain;
|
||||
dsp_capability_hmx_dsp.attribute_ID = attr;
|
||||
dsp_capability_hmx_dsp.capability = (uint32_t) 0;
|
||||
nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_hmx_dsp,
|
||||
sizeof(struct remote_dsp_capability));
|
||||
if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
|
||||
GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n");
|
||||
GGML_LOG_ERROR("Running the usecase without checking the capability\n");
|
||||
nErr = AEE_SUCCESS;
|
||||
goto bail;
|
||||
} else if (nErr == AEE_SUCCESS) {
|
||||
*capability = dsp_capability_hmx_dsp.capability;
|
||||
} else {
|
||||
GGML_LOG_ERROR("\nget_hmx_support_info failed with Error 0x%x\n", nErr);
|
||||
goto bail;
|
||||
}
|
||||
} else {
|
||||
nErr = AEE_EUNSUPPORTED;
|
||||
GGML_LOG_ERROR("HMX support is not there for domain %d\n", domain);
|
||||
goto bail;
|
||||
}
|
||||
} else {
|
||||
nErr = AEE_EUNSUPPORTEDAPI;
|
||||
GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n");
|
||||
}
|
||||
|
||||
bail:
|
||||
return nErr;
|
||||
}
|
||||
|
||||
int get_hex_arch_ver(int domain, int * arch) {
|
||||
if (!remote_handle_control) {
|
||||
GGML_LOG_ERROR("ggml-hex: remote_handle_control is not supported on this device\n");
|
||||
return AEE_EUNSUPPORTEDAPI;
|
||||
}
|
||||
|
||||
struct remote_dsp_capability arch_ver;
|
||||
arch_ver.domain = (uint32_t) domain;
|
||||
arch_ver.attribute_ID = ARCH_VER;
|
||||
arch_ver.capability = (uint32_t) 0;
|
||||
|
||||
int err = remote_handle_control(DSPRPC_GET_DSP_INFO, &arch_ver, sizeof(arch_ver));
|
||||
if ((err & 0xff) == (AEE_EUNSUPPORTEDAPI & 0xff)) {
|
||||
GGML_LOG_ERROR("ggml-hex: FastRPC capability API is not supported on this device\n");
|
||||
return AEE_EUNSUPPORTEDAPI;
|
||||
}
|
||||
|
||||
if (err != AEE_SUCCESS) {
|
||||
GGML_LOG_ERROR("ggml-hex: FastRPC capability query failed (err %d)\n", err);
|
||||
return err;
|
||||
}
|
||||
|
||||
switch (arch_ver.capability & 0xff) {
|
||||
case 0x68:
|
||||
*arch = 68;
|
||||
return 0;
|
||||
case 0x69:
|
||||
*arch = 69;
|
||||
return 0;
|
||||
case 0x73:
|
||||
*arch = 73;
|
||||
return 0;
|
||||
case 0x75:
|
||||
*arch = 75;
|
||||
return 0;
|
||||
case 0x79:
|
||||
*arch = 79;
|
||||
return 0;
|
||||
case 0x81:
|
||||
*arch = 81;
|
||||
return 0;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
int get_hvx_support_info(int domain, uint32_t * capability, uint32_t attr) {
|
||||
int nErr = AEE_SUCCESS;
|
||||
*capability = 0;
|
||||
|
||||
if (remote_handle_control) {
|
||||
if (domain == CDSP_DOMAIN_ID) {
|
||||
/*
|
||||
* Query the DSP for HVX SUPPORT information
|
||||
* HVX is supported on CDSP only
|
||||
*/
|
||||
struct remote_dsp_capability dsp_capability_hvx_dsp;
|
||||
dsp_capability_hvx_dsp.domain = (uint32_t) domain;
|
||||
dsp_capability_hvx_dsp.attribute_ID = attr;
|
||||
dsp_capability_hvx_dsp.capability = (uint32_t) 0;
|
||||
nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_hvx_dsp,
|
||||
sizeof(struct remote_dsp_capability));
|
||||
if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
|
||||
GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n");
|
||||
GGML_LOG_ERROR("Running the usecase without checking the capability\n");
|
||||
nErr = AEE_SUCCESS;
|
||||
goto bail;
|
||||
} else if (nErr == AEE_SUCCESS) {
|
||||
*capability = dsp_capability_hvx_dsp.capability;
|
||||
} else {
|
||||
GGML_LOG_ERROR("\nget_hvx_support_info failed with Error 0x%x\n", nErr);
|
||||
goto bail;
|
||||
}
|
||||
} else {
|
||||
nErr = AEE_EUNSUPPORTED;
|
||||
GGML_LOG_ERROR("HVX support is not available on domain %d\n", domain);
|
||||
goto bail;
|
||||
}
|
||||
} else {
|
||||
nErr = AEE_EUNSUPPORTEDAPI;
|
||||
GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n");
|
||||
}
|
||||
|
||||
bail:
|
||||
return nErr;
|
||||
}
|
||||
@@ -1,221 +0,0 @@
|
||||
#ifndef HTP_UTILS_H
|
||||
#define HTP_UTILS_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include <AEEStdErr.h>
|
||||
#include <inttypes.h>
|
||||
#include <remote.h>
|
||||
#include <rpcmem.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
/* Offset to differentiate HLOS and Hexagon error codes.
|
||||
Stores the value of AEE_EOFFSET for Hexagon. */
|
||||
#ifndef DSP_OFFSET
|
||||
# define DSP_OFFSET 0x80000400
|
||||
#endif
|
||||
|
||||
/* Errno for connection reset by peer. */
|
||||
#ifndef ECONNRESET
|
||||
# ifdef __hexagon__
|
||||
# define ECONNRESET 104
|
||||
# endif
|
||||
#endif
|
||||
|
||||
/* Abstraction of different OS specific sleep APIs.
|
||||
SLEEP accepts input in seconds. */
|
||||
#ifndef SLEEP
|
||||
# ifdef __hexagon__
|
||||
# define SLEEP(x) \
|
||||
{ /* Do nothing for simulator. */ \
|
||||
}
|
||||
# else
|
||||
# ifdef _WINDOWS
|
||||
# define SLEEP(x) Sleep(1000 * x) /* Sleep accepts input in milliseconds. */
|
||||
# else
|
||||
# define SLEEP(x) sleep(x) /* sleep accepts input in seconds. */
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
|
||||
/* Include windows specific header files. */
|
||||
#ifdef _WINDOWS
|
||||
# include <sysinfoapi.h>
|
||||
# include <windows.h>
|
||||
# define _CRT_SECURE_NO_WARNINGS 1
|
||||
# define _WINSOCK_DEPRECATED_NO_WARNINGS 1
|
||||
/* Including this file for custom implementation of getopt function. */
|
||||
# include "getopt_custom.h"
|
||||
#endif
|
||||
|
||||
/* Includes and defines for all HLOS except windows */
|
||||
#if !defined(__hexagon__) && !defined(_WINDOWS)
|
||||
# include "unistd.h"
|
||||
|
||||
# include <sys/time.h>
|
||||
#endif
|
||||
|
||||
/* Includes and defines for Hexagon and all HLOS except Windows. */
|
||||
#if !defined(_WINDOWS)
|
||||
/* Weak reference to remote symbol for compilation. */
|
||||
# pragma weak remote_session_control
|
||||
# pragma weak remote_handle_control
|
||||
# pragma weak remote_handle64_control
|
||||
# pragma weak fastrpc_mmap
|
||||
# pragma weak fastrpc_munmap
|
||||
# pragma weak rpcmem_alloc2
|
||||
#endif
|
||||
|
||||
#if !defined(_WINDOWS)
|
||||
# pragma weak remote_system_request
|
||||
#endif
|
||||
/**
|
||||
* Wrapper for FastRPC Capability API: query DSP support.
|
||||
*
|
||||
* @param[out] domain pointer to supported domain.
|
||||
* @return 0 if query is successful.
|
||||
* non-zero if error, return value points to the error.
|
||||
*/
|
||||
int get_dsp_support(int * domain);
|
||||
|
||||
/**
|
||||
* Wrapper for FastRPC Capability API: query VTCM information.
|
||||
*
|
||||
* @param[in] domain value of domain in the queried.
|
||||
* @param[out] capability capability value of the attribute queried.
|
||||
* @param[in] attr value of the attribute to the queried.
|
||||
* @return 0 if query is successful.
|
||||
* non-zero if error, return value points to the error.
|
||||
*/
|
||||
int get_vtcm_info(int domain, uint32_t * capability, uint32_t attr);
|
||||
|
||||
/**
|
||||
* Wrapper for FastRPC Capability API: query unsigned pd support on CDSP domain.
|
||||
*
|
||||
* @return true if unsigned pd is supported.
|
||||
* false if unsigned pd is not supported, capability query failed.
|
||||
*/
|
||||
|
||||
bool get_unsignedpd_support(void);
|
||||
|
||||
/**
|
||||
* Wrapper for FastRPC Capability API: query unsigned pd support.
|
||||
*
|
||||
* @param[in] domain value of domain in the queried.
|
||||
* @return true if unsigned pd is supported.
|
||||
* false if unsigned pd is not supported, capability query failed.
|
||||
*/
|
||||
|
||||
bool is_unsignedpd_supported(int domain_id);
|
||||
|
||||
/**
|
||||
* is_valid_domain_id API: query a domain id is valid.
|
||||
*
|
||||
* @param[in] domain value of domain in the queried.
|
||||
* @param[in] compute_only value of domain is only compared with CDSP domains supported by the target when enabled.
|
||||
* @return true if value of domain is valid.
|
||||
* false if value of domain is not valid.
|
||||
*/
|
||||
|
||||
bool is_valid_domain_id(int domain_id, int compute_only);
|
||||
|
||||
/**
|
||||
* get_domain API: get domain struct from domain value.
|
||||
*
|
||||
* @param[in] domain value of a domain
|
||||
* @return Returns domain struct of the domain if it is supported or else
|
||||
* returns NULL.
|
||||
*
|
||||
*/
|
||||
|
||||
domain * get_domain(int domain_id);
|
||||
|
||||
/**
|
||||
* get_domains_info API: get information for all the domains available on the device
|
||||
*
|
||||
* @param[in] domain_type pointer to domain type
|
||||
* @param[in] num_domains pointer to number of domains
|
||||
* @param[in] domains_info pointer to save discovered domains information.
|
||||
* @return 0 if query is successful.
|
||||
* non-zero if error, return value points to the error.
|
||||
*
|
||||
* It is user's responsibility to free the memory used to store the domains info whose address is present in domains_info before closing the application.
|
||||
*
|
||||
*/
|
||||
|
||||
int get_domains_info(char * domain_type, int * num_domains, fastrpc_domain ** domains_info);
|
||||
|
||||
/**
|
||||
* get_effective_domain_id API: get effective domain id for given session id
|
||||
*
|
||||
* @param[in] domain_name pointer to domain name
|
||||
* @param[in] session_id
|
||||
* @param[in] effec_domain_id pointer to save obtained effective domain id.
|
||||
* @return 0 if query is successful.
|
||||
* non-zero if error, return value points to the error.
|
||||
*
|
||||
*/
|
||||
|
||||
int get_effective_domain_id(char * domain_name, int session_id, int * effec_domain_id);
|
||||
|
||||
/**
|
||||
* is_async_fastrpc_supported API: query a domain id has async fastrpc supported or not
|
||||
*
|
||||
* @param[in] domain_id value of a domain
|
||||
* @return Returns true or false stating support of Async FastRPC
|
||||
*
|
||||
*/
|
||||
|
||||
bool is_async_fastrpc_supported(int domain_id);
|
||||
|
||||
/**
|
||||
* is_status_notification_supported API: query the DSP for STATUS_NOTIFICATION_SUPPORT information
|
||||
*
|
||||
* @param[in] domain_id value of a domain
|
||||
* @return Returns true or false stating status notification support information
|
||||
*
|
||||
*/
|
||||
bool is_status_notification_supported(int domain_id);
|
||||
|
||||
/**
|
||||
* get_hmx_support_info API: query the DSP for HMX SUPPORT information
|
||||
*
|
||||
* @param[in] domain_id value of a domain
|
||||
* @param[out] capability capability value of the attribute queried.
|
||||
* @param[in] attr value of the attribute to the queried.
|
||||
* @return 0 if query is successful.
|
||||
* non-zero if error, return value points to the error.
|
||||
*
|
||||
*/
|
||||
int get_hmx_support_info(int domain, uint32_t * capability, uint32_t attr);
|
||||
|
||||
/**
|
||||
* get_hex_arch_ver API: query the Hexagon processor architecture version information
|
||||
*
|
||||
* @param[in] domain_id value of a domain
|
||||
* @param[out] Arch version (73, 75, ...)
|
||||
* @return 0 if query is successful.
|
||||
* non-zero if error, return value points to the error.
|
||||
*
|
||||
*/
|
||||
int get_hex_arch_ver(int domain, int * arch);
|
||||
|
||||
/**
|
||||
* get_hvx_support_info API: query the DSP for HVX SUPPORT information
|
||||
*
|
||||
* @param[in] domain_id value of a domain
|
||||
* @param[out] capability capability value of the attribute queried.
|
||||
* @param[in] attr value of the attribute to the queried.
|
||||
* @return 0 if query is successful.
|
||||
* non-zero if error, return value points to the error.
|
||||
*
|
||||
*/
|
||||
int get_hvx_support_info(int domain, uint32_t * capability, uint32_t attr);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif //DSP_CAPABILITIES_UTILS_H
|
||||
79
ggml/src/ggml-hexagon/libdl.h
Normal file
79
ggml/src/ggml-hexagon/libdl.h
Normal file
@@ -0,0 +1,79 @@
|
||||
#pragma once
|
||||
|
||||
#ifdef _WIN32
|
||||
# define WIN32_LEAN_AND_MEAN
|
||||
# ifndef NOMINMAX
|
||||
# define NOMINMAX
|
||||
# endif
|
||||
# include <windows.h>
|
||||
# include <winevt.h>
|
||||
#else
|
||||
# include <dlfcn.h>
|
||||
# include <unistd.h>
|
||||
#endif
|
||||
#include <filesystem>
|
||||
|
||||
namespace fs = std::filesystem;
|
||||
|
||||
#ifdef _WIN32
|
||||
|
||||
using dl_handle = std::remove_pointer_t<HMODULE>;
|
||||
|
||||
struct dl_handle_deleter {
|
||||
void operator()(HMODULE handle) {
|
||||
FreeLibrary(handle);
|
||||
}
|
||||
};
|
||||
|
||||
static inline dl_handle * dl_load_library(const fs::path & path) {
|
||||
// suppress error dialogs for missing DLLs
|
||||
DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
|
||||
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
|
||||
|
||||
HMODULE handle = LoadLibraryW(path.wstring().c_str());
|
||||
|
||||
SetErrorMode(old_mode);
|
||||
|
||||
return handle;
|
||||
}
|
||||
|
||||
static inline void * dl_get_sym(dl_handle * handle, const char * name) {
|
||||
DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
|
||||
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
|
||||
|
||||
void * p = (void *) GetProcAddress(handle, name);
|
||||
|
||||
SetErrorMode(old_mode);
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
static inline const char * dl_error() {
|
||||
return "";
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
using dl_handle = void;
|
||||
|
||||
struct dl_handle_deleter {
|
||||
void operator()(void * handle) {
|
||||
dlclose(handle);
|
||||
}
|
||||
};
|
||||
|
||||
static inline dl_handle * dl_load_library(const fs::path & path) {
|
||||
dl_handle * handle = dlopen(path.string().c_str(), RTLD_NOW | RTLD_LOCAL);
|
||||
return handle;
|
||||
}
|
||||
|
||||
static inline void * dl_get_sym(dl_handle * handle, const char * name) {
|
||||
return dlsym(handle, name);
|
||||
}
|
||||
|
||||
static inline const char * dl_error() {
|
||||
const char *rslt = dlerror();
|
||||
return rslt != nullptr ? rslt : "";
|
||||
}
|
||||
|
||||
#endif
|
||||
38
ggml/src/ggml-hexagon/libggml-htp.inf
Normal file
38
ggml/src/ggml-hexagon/libggml-htp.inf
Normal file
@@ -0,0 +1,38 @@
|
||||
[Version]
|
||||
Signature = "$WINDOWS NT$"
|
||||
Class = ComputeAccelerator
|
||||
ClassGuid = {F01A9D53-3FF6-48D2-9F97-C8A7004BE10C}
|
||||
Provider = %GGML%
|
||||
DriverVer = 01/01/2026,1.0.0.0
|
||||
CatalogFile = libggml-htp.cat
|
||||
PnpLockDown = 1
|
||||
|
||||
[DestinationDirs]
|
||||
Drivers_Dir = 6
|
||||
|
||||
[SourceDisksNames]
|
||||
1 = %DiskId%
|
||||
|
||||
[SourceDisksFiles]
|
||||
libggml-htp-v68.so = 1
|
||||
libggml-htp-v69.so = 1
|
||||
libggml-htp-v73.so = 1
|
||||
libggml-htp-v75.so = 1
|
||||
libggml-htp-v81.so = 1
|
||||
|
||||
[ControlFlags]
|
||||
ExcludeFromSelect = *
|
||||
|
||||
[DefaultInstall.NTarm64]
|
||||
CopyFiles=Drivers_Dir
|
||||
|
||||
[Drivers_Dir]
|
||||
libggml-htp-v68.so,,,0x10 ;COPYFLG_NO_OVERWRITE
|
||||
libggml-htp-v69.so,,,0x10 ;COPYFLG_NO_OVERWRITE
|
||||
libggml-htp-v73.so,,,0x10 ;COPYFLG_NO_OVERWRITE
|
||||
libggml-htp-v75.so,,,0x10 ;COPYFLG_NO_OVERWRITE
|
||||
libggml-htp-v81.so,,,0x10 ;COPYFLG_NO_OVERWRITE
|
||||
|
||||
[Strings]
|
||||
GGML = 'GGML'
|
||||
DiskId = 'GGML HTP library'
|
||||
@@ -62,6 +62,8 @@ file(GLOB SRCS "../ggml-cuda/template-instances/fattn-mma*.cu")
|
||||
list(APPEND GGML_SOURCES_ROCM ${SRCS})
|
||||
file(GLOB SRCS "../ggml-cuda/template-instances/mmq*.cu")
|
||||
list(APPEND GGML_SOURCES_ROCM ${SRCS})
|
||||
file(GLOB SRCS "../ggml-cuda/template-instances/mmf*.cu")
|
||||
list(APPEND GGML_SOURCES_ROCM ${SRCS})
|
||||
|
||||
if (GGML_CUDA_FA_ALL_QUANTS)
|
||||
file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*.cu")
|
||||
|
||||
@@ -101,6 +101,8 @@ set(GGML_OPENCL_KERNELS
|
||||
mul_mm_f32_f32_l4_lm
|
||||
mul_mm_f16_f32_l4_lm
|
||||
mul_mm_q8_0_f32_l4_lm
|
||||
mul_mm_q8_0_f32_8x4
|
||||
gemv_noshuffle_general_q8_0_f32
|
||||
mul
|
||||
norm
|
||||
relu
|
||||
|
||||
@@ -226,7 +226,8 @@ static ADRENO_GPU_GEN get_adreno_gpu_gen(const char *device_name) {
|
||||
return ADRENO_GPU_GEN::A7X;
|
||||
}
|
||||
|
||||
if (strstr(device_name, "830")) {
|
||||
if (strstr(device_name, "830") ||
|
||||
strstr(device_name, "840")) {
|
||||
return ADRENO_GPU_GEN::A8X;
|
||||
}
|
||||
|
||||
@@ -529,7 +530,7 @@ struct ggml_backend_opencl_context {
|
||||
cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
|
||||
cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0;
|
||||
cl_kernel kernel_convert_block_mxfp4, kernel_convert_block_mxfp4_trans, kernel_restore_block_mxfp4, kernel_restore_block_mxfp4_trans;
|
||||
cl_kernel kernel_convert_block_q8_0, kernel_restore_block_q8_0;
|
||||
cl_kernel kernel_convert_block_q8_0, kernel_restore_block_q8_0, kernel_restore_block_q8_0_trans;
|
||||
cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
|
||||
cl_kernel kernel_convert_block_q4_0_noshuffle;
|
||||
cl_kernel kernel_restore_block_q4_0_noshuffle;
|
||||
@@ -696,6 +697,8 @@ struct ggml_backend_opencl_context {
|
||||
cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096;
|
||||
cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096;
|
||||
cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096;
|
||||
cl_kernel kernel_mul_mm_q8_0_f32_8x4;
|
||||
cl_kernel CL_mul_mat_vec_q8_0_f32;
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
|
||||
void free() {
|
||||
@@ -894,6 +897,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_mxfp4", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_q8_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q8_0", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_q8_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q8_0", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_q8_0_trans = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q8_0_trans", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_q6_K = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q6_K", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_q6_K = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q6_K", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
@@ -2290,6 +2294,46 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// mul_mm_q8_0_f32_8x4
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src_q8_8x4_gemm {
|
||||
#include "mul_mm_q8_0_f32_8x4.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src_q8_8x4_gemm = read_file("mul_mm_q8_0_f32_8x4.cl");
|
||||
#endif
|
||||
backend_ctx->program_CL_gemm = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_q8_8x4_gemm.c_str(), compile_opts);
|
||||
CL_CHECK((backend_ctx->kernel_mul_mm_q8_0_f32_8x4 = clCreateKernel(backend_ctx->program_CL_gemm, "kernel_mul_mm_q8_0_f32_8x4", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// gemv_noshuffle_general_q8_0_f32
|
||||
{
|
||||
std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
|
||||
" -cl-mad-enable "
|
||||
" -DSIMDGROUP_WIDTH=" +
|
||||
std::to_string(backend_ctx->adreno_wave_size);
|
||||
if (backend_ctx->has_vector_subgroup_broadcast) {
|
||||
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
|
||||
}
|
||||
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src_CL_gemv_general {
|
||||
#include "gemv_noshuffle_general_q8_0_f32.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src_CL_gemv_general = read_file("gemv_noshuffle_general_q8_0_f32.cl");
|
||||
#endif
|
||||
|
||||
cl_program prog = build_program_from_source(
|
||||
backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv_general.c_str(), CL_gemv_compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->CL_mul_mat_vec_q8_0_f32 = clCreateKernel(prog, "kernel_gemv_noshuffle", &err), err));
|
||||
CL_CHECK(clReleaseProgram(prog));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
std::string CL_moe_compile_opts = std::string("-cl-std=") + opencl_c_std +
|
||||
" -cl-mad-enable "
|
||||
" -cl-fast-relaxed-math";
|
||||
@@ -3745,6 +3789,15 @@ inline bool use_adreno_moe_kernels(const ggml_backend_opencl_context *backend_ct
|
||||
return ((strstr(tensor->name, "ffn") != NULL) || (strstr(tensor->name, "as") != NULL)) && (ne01 % 64 == 0);
|
||||
}
|
||||
|
||||
inline bool enable_adreno_trans_weight(const ggml_backend_opencl_context *backend_ctx, const ggml_tensor *tensor) {
|
||||
|
||||
bool adreno_kernel = use_adreno_kernels(backend_ctx, tensor);
|
||||
|
||||
size_t elem_num = tensor->ne[0] * tensor->ne[1] * tensor->ne[2] * tensor->ne[3];
|
||||
|
||||
return ((elem_num < 128 * 1024 * 1024) && adreno_kernel); // max element num: 2**27
|
||||
}
|
||||
|
||||
static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer->buft->device);
|
||||
|
||||
@@ -4159,6 +4212,130 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||
|
||||
tensor->extra = extra;
|
||||
|
||||
// Transpose the weights and scales
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
if (enable_adreno_trans_weight(backend_ctx, tensor)) {
|
||||
|
||||
int M = tensor->ne[1]; // ne01
|
||||
int K = tensor->ne[0]; // ne00
|
||||
|
||||
GGML_ASSERT(K % 32 == 0);
|
||||
GGML_ASSERT(M % 4 == 0);
|
||||
GGML_ASSERT(tensor->ne[2] == 1);
|
||||
GGML_ASSERT(tensor->ne[3] == 1);
|
||||
|
||||
// Transpose weights
|
||||
size_t q_size_bytes = K * M / 4 * sizeof(float);
|
||||
cl_buffer_region region;
|
||||
region.origin = 0;
|
||||
region.size = q_size_bytes;
|
||||
cl_mem qT_d = clCreateSubBuffer(
|
||||
backend_ctx->prealloc_quant_trans.buffer,
|
||||
0,
|
||||
CL_BUFFER_CREATE_TYPE_REGION,
|
||||
®ion,
|
||||
&err);
|
||||
CL_CHECK(err);
|
||||
|
||||
cl_mem q_d_image1D;
|
||||
cl_mem qT_d_image1D;
|
||||
|
||||
cl_image_format img_fmt_1d;
|
||||
cl_image_desc img_desc_1d;
|
||||
|
||||
img_fmt_1d = { CL_RGBA, CL_FLOAT };
|
||||
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
|
||||
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc_1d.image_width = M * K / 4 / 4;
|
||||
img_desc_1d.buffer = extra->q;
|
||||
q_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
|
||||
CL_CHECK(err);
|
||||
|
||||
img_fmt_1d = { CL_RGBA, CL_FLOAT };
|
||||
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
|
||||
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc_1d.image_width = M * K / 4 / 4;
|
||||
img_desc_1d.buffer = qT_d;
|
||||
qT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
|
||||
CL_CHECK(err);
|
||||
|
||||
int height_q = M / 4;
|
||||
int width_q = K / 4 / 4;
|
||||
kernel = backend_ctx->kernel_transpose_32;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q_d_image1D));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &qT_d_image1D));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_q));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_q));
|
||||
|
||||
size_t local_size_q[3] = {4, 16, 1};
|
||||
size_t global_size_q[3] = {static_cast<size_t>(width_q), static_cast<size_t>(height_q), 1};
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_size_q, local_size_q, 0, NULL, &evt));
|
||||
CL_CHECK(clWaitForEvents(1, &evt));
|
||||
|
||||
// Transpose scales
|
||||
size_t d_size_bytes = M * (K / 32) * 2;
|
||||
region.origin = 0;
|
||||
region.size = d_size_bytes;
|
||||
cl_mem dT_d = clCreateSubBuffer(
|
||||
backend_ctx->prealloc_scales_trans.buffer,
|
||||
0,
|
||||
CL_BUFFER_CREATE_TYPE_REGION,
|
||||
®ion,
|
||||
&err);
|
||||
CL_CHECK(err);
|
||||
|
||||
cl_mem d_d_image1D;
|
||||
cl_mem dT_d_image1D;
|
||||
|
||||
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
|
||||
img_fmt_1d = { CL_R, CL_HALF_FLOAT };
|
||||
img_desc_1d.image_width = M * K / 32;
|
||||
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc_1d.buffer = extra->d;
|
||||
d_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
|
||||
CL_CHECK(err);
|
||||
|
||||
img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
|
||||
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
|
||||
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc_1d.image_width = M * K / 32 / 4;
|
||||
img_desc_1d.buffer = dT_d;
|
||||
dT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
|
||||
CL_CHECK(err);
|
||||
|
||||
int height_s = M / 4;
|
||||
int width_s = K / 32;
|
||||
|
||||
kernel = backend_ctx->kernel_transpose_16_4x1;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_d_image1D));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &dT_d_image1D));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_s));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_s));
|
||||
|
||||
size_t local_size_s[3] = {4, 16, 1};
|
||||
size_t global_size_s[3] = {static_cast<size_t>(width_s), static_cast<size_t>(height_s), 1};
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_size_s, local_size_s, 0, NULL, &evt));
|
||||
CL_CHECK(clWaitForEvents(1, &evt));
|
||||
|
||||
// copy transposed buffer contents to original buffers
|
||||
CL_CHECK(clEnqueueCopyBuffer(queue, qT_d, extra->q, 0, 0, q_size_bytes, 0, NULL, &evt));
|
||||
CL_CHECK(clWaitForEvents(1, &evt));
|
||||
|
||||
CL_CHECK(clEnqueueCopyBuffer(queue, dT_d, extra->d, 0, 0, d_size_bytes, 0, NULL, &evt));
|
||||
CL_CHECK(clWaitForEvents(1, &evt));
|
||||
|
||||
CL_CHECK(clReleaseMemObject(qT_d));
|
||||
CL_CHECK(clReleaseMemObject(dT_d));
|
||||
|
||||
CL_CHECK(clReleaseMemObject(q_d_image1D));
|
||||
CL_CHECK(clReleaseMemObject(d_d_image1D));
|
||||
CL_CHECK(clReleaseMemObject(qT_d_image1D));
|
||||
CL_CHECK(clReleaseMemObject(dT_d_image1D));
|
||||
} // end transpose
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
|
||||
return;
|
||||
}
|
||||
if (tensor->type == GGML_TYPE_Q6_K) {
|
||||
@@ -4448,6 +4625,36 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
||||
ggml_nbytes(tensor), NULL, &err);
|
||||
CL_CHECK(err);
|
||||
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
if (enable_adreno_trans_weight(backend_ctx, tensor)) {
|
||||
cl_kernel kernel = backend_ctx->kernel_restore_block_q8_0_trans;
|
||||
|
||||
int ne00 = tensor->ne[0];
|
||||
int ne01 = tensor->ne[1];
|
||||
GGML_ASSERT(tensor->ne[2] == 1); // ???
|
||||
GGML_ASSERT(tensor->ne[3] == 1); // ???
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_int), &ne01));
|
||||
|
||||
size_t global_work_size[3] = {static_cast<size_t>(((ne01 + 63) / 64) * 64), 1, 1};
|
||||
size_t local_work_size[3] = {64, 1, 1};
|
||||
|
||||
cl_event evt;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
|
||||
global_work_size, local_work_size, 0, NULL, &evt));
|
||||
CL_CHECK(clWaitForEvents(1, &evt));
|
||||
|
||||
CL_CHECK(clEnqueueReadBuffer(
|
||||
queue, data_device, CL_TRUE, offset,
|
||||
size, data, 0, NULL, NULL));
|
||||
CL_CHECK(clReleaseMemObject(data_device));
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
cl_kernel kernel = backend_ctx->kernel_restore_block_q8_0;
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->d));
|
||||
@@ -7947,6 +8154,252 @@ static void ggml_cl_mul_mat_kq_kqv_adreno(ggml_backend_t backend, const ggml_ten
|
||||
CL_CHECK(clReleaseMemObject(D_sub_buffer));
|
||||
}
|
||||
|
||||
static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
GGML_ASSERT(src1);
|
||||
GGML_ASSERT(src1->extra);
|
||||
GGML_ASSERT(dst);
|
||||
GGML_ASSERT(dst->extra);
|
||||
|
||||
const enum ggml_type src0t = src0->type;
|
||||
const enum ggml_type src1t = src1->type;
|
||||
|
||||
GGML_ASSERT(src0t == GGML_TYPE_Q8_0);
|
||||
GGML_ASSERT(src1t == GGML_TYPE_F32);
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
|
||||
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
||||
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
||||
|
||||
ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra;
|
||||
|
||||
GGML_ASSERT(src1->view_offs == 0);
|
||||
GGML_ASSERT(dst->view_offs == 0);
|
||||
|
||||
const int ne00 = src0->ne[0];
|
||||
const int ne01 = src0->ne[1];
|
||||
const int ne02 = src0->ne[2];
|
||||
|
||||
const int ne10 = src1->ne[0];
|
||||
const int ne12 = src1->ne[2];
|
||||
|
||||
const int ne0 = dst->ne[0];
|
||||
const int ne1 = dst->ne[1];
|
||||
|
||||
GGML_ASSERT(ne00 == ne10);
|
||||
GGML_ASSERT((ne00 % 32) == 0);
|
||||
GGML_ASSERT(ne0 == ne01);
|
||||
|
||||
cl_context context = backend_ctx->context;
|
||||
cl_kernel kernel;
|
||||
|
||||
// init CL objects
|
||||
cl_int status;
|
||||
cl_image_format img_fmt_1d;
|
||||
cl_image_desc img_desc_1d;
|
||||
cl_buffer_region region;
|
||||
cl_mem A_image1d;
|
||||
cl_mem B_image1d;
|
||||
cl_mem B_sub_buffer;
|
||||
cl_mem S_image1d;
|
||||
|
||||
cl_mem D_image1d;
|
||||
cl_mem D_sub_buffer;
|
||||
|
||||
int M = ne01;
|
||||
int N = ne1;
|
||||
int K = ne00;
|
||||
|
||||
// create an image for A
|
||||
img_fmt_1d = { CL_R, CL_FLOAT};
|
||||
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
|
||||
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc_1d.image_width = M * K / 4; // Divide by 4 for char -> float
|
||||
img_desc_1d.buffer = extra0_q8_0->q;
|
||||
A_image1d = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
|
||||
CL_CHECK(status);
|
||||
|
||||
// create an image for Scale
|
||||
img_fmt_1d = { CL_R, CL_HALF_FLOAT};
|
||||
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
|
||||
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc_1d.image_width = M * K / 32; // Block size is 32
|
||||
img_desc_1d.buffer = extra0_q8_0->d;
|
||||
S_image1d = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
|
||||
CL_CHECK(status);
|
||||
|
||||
// create a sub_buffer for B
|
||||
region.origin = (extra1->offset); // + src1->view_offs);
|
||||
region.size = K * N * sizeof(float);
|
||||
B_sub_buffer = clCreateSubBuffer((extra1->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status);
|
||||
CL_CHECK(status);
|
||||
|
||||
// create an image for B from sub_buffer: RGBA (OCL)
|
||||
img_fmt_1d = {CL_RGBA, CL_FLOAT};
|
||||
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
|
||||
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc_1d.image_width = K * N / 4;
|
||||
img_desc_1d.buffer = B_sub_buffer;
|
||||
B_image1d = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
|
||||
CL_CHECK(status);
|
||||
|
||||
// Create subbuffer and image1d_buffer for dst
|
||||
region.origin = (extrad->offset); // + dst->view_offs;
|
||||
region.size = M * N * sizeof(float);
|
||||
D_sub_buffer = clCreateSubBuffer((extrad->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status);
|
||||
CL_CHECK(status);
|
||||
|
||||
img_fmt_1d = {CL_R, CL_FLOAT};
|
||||
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
|
||||
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc_1d.image_width = M * N;
|
||||
img_desc_1d.buffer = D_sub_buffer;
|
||||
D_image1d = clCreateImage(context, CL_MEM_WRITE_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
|
||||
CL_CHECK(status);
|
||||
|
||||
size_t local_work_size[3] = {1, 1, 1};
|
||||
size_t global_work_size[3] = {1, 1, 1};
|
||||
|
||||
if (N == 1) {
|
||||
kernel = backend_ctx->CL_mul_mat_vec_q8_0_f32;
|
||||
|
||||
int r2 = 1;
|
||||
int r3 = 1;
|
||||
cl_uint k_arg = 0;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &A_image1d));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &extra0_q8_0->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &B_image1d));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_ulong), &extra1->offset));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_ulong), &extrad->offset));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne02));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne10));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne12));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne0));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne1));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &r2));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &r3));
|
||||
|
||||
size_t wavesize = backend_ctx->adreno_wave_size;
|
||||
local_work_size[0] = wavesize;
|
||||
local_work_size[1] = 4; // reduce factor
|
||||
local_work_size[2] = 1;
|
||||
|
||||
global_work_size[0] = ((M + wavesize - 1) / wavesize) * wavesize;
|
||||
global_work_size[1] = 4; // reduce factor
|
||||
global_work_size[2] = 1;
|
||||
} else {
|
||||
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
||||
cl_mem B_image1d_trans = nullptr;
|
||||
// for B transpose
|
||||
cl_mem B_d = nullptr;
|
||||
int padding;
|
||||
|
||||
//how many extra elements beyond multiple of 8
|
||||
int extra_elements = N % 8;
|
||||
|
||||
//how much padding to add
|
||||
padding = 0;
|
||||
if (extra_elements > 0){
|
||||
padding = 8 - extra_elements;
|
||||
}
|
||||
|
||||
// Specify the starting offset (in bytes)
|
||||
region.origin = 0;
|
||||
// Specify the size of the sub-buffer (divide by 2 for FP16)
|
||||
region.size = K * (N + padding) * sizeof(float)/2;
|
||||
backend_ctx->prealloc_act_trans.allocate(context, region.size);
|
||||
B_d = clCreateSubBuffer(
|
||||
backend_ctx->prealloc_act_trans.buffer,
|
||||
0,
|
||||
CL_BUFFER_CREATE_TYPE_REGION,
|
||||
®ion,
|
||||
&status);
|
||||
CL_CHECK(status);
|
||||
|
||||
cl_image_format image_format_B_d_output = { CL_RGBA, CL_HALF_FLOAT }; //(CL_HALF_FLOAT for FP16)
|
||||
cl_image_desc image_desc_B_d_output = {
|
||||
CL_MEM_OBJECT_IMAGE1D_BUFFER,
|
||||
static_cast<size_t>(K * (N + padding)/4),
|
||||
0, 0, 0, 0, 0, 0, 0, { B_d }
|
||||
};
|
||||
B_image1d_trans = clCreateImage(
|
||||
context,
|
||||
0,
|
||||
&image_format_B_d_output,
|
||||
&image_desc_B_d_output,
|
||||
NULL,
|
||||
&status);
|
||||
CL_CHECK(status);
|
||||
|
||||
int height_B = N/4;
|
||||
if (height_B == 0) {
|
||||
height_B = 1;
|
||||
}
|
||||
int width_B = K/4;
|
||||
int padded_height_B = (N + padding)/4;
|
||||
|
||||
kernel = backend_ctx->kernel_transpose_32_16;
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &B_image1d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &B_image1d_trans));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_B));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_B));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &padded_height_B));
|
||||
|
||||
size_t local_size_t[2] = { 1, 16 };
|
||||
size_t global_size_t[2] = {
|
||||
static_cast<size_t>(width_B),
|
||||
static_cast<size_t>(padded_height_B)
|
||||
};
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_size_t, local_size_t, dst);
|
||||
|
||||
kernel = backend_ctx->kernel_mul_mm_q8_0_f32_8x4;
|
||||
|
||||
int N_with_padding = N + padding;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q8_0->q));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q8_0->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &B_image1d_trans));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &K));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &M));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &N_with_padding));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &N));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &offsetd));
|
||||
|
||||
global_work_size[0] = (size_t)(N + 7) / 8;
|
||||
global_work_size[1] = (size_t)(M + 3) / 4;
|
||||
global_work_size[2] = 1;
|
||||
|
||||
local_work_size[0] = 2;
|
||||
local_work_size[1] = 128;
|
||||
local_work_size[2] = 1;
|
||||
}
|
||||
|
||||
// enqueue kernel with profiling
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
|
||||
// deallocate sub buffers and images
|
||||
CL_CHECK(clReleaseMemObject(A_image1d));
|
||||
CL_CHECK(clReleaseMemObject(B_sub_buffer));
|
||||
CL_CHECK(clReleaseMemObject(B_image1d));
|
||||
CL_CHECK(clReleaseMemObject(S_image1d));
|
||||
CL_CHECK(clReleaseMemObject(D_sub_buffer));
|
||||
CL_CHECK(clReleaseMemObject(D_image1d));
|
||||
#else
|
||||
GGML_UNUSED(src0);
|
||||
GGML_UNUSED(src1);
|
||||
GGML_UNUSED(dst);
|
||||
#endif
|
||||
}
|
||||
|
||||
static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
@@ -8064,6 +8517,13 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
||||
int padding;
|
||||
// <--------------------------------------------> //
|
||||
|
||||
// q8_0 x fp32
|
||||
if (src0t == GGML_TYPE_Q8_0 && src1t == GGML_TYPE_F32 &&
|
||||
enable_adreno_trans_weight(backend_ctx, src0)) {
|
||||
ggml_cl_mul_mat_q8_0_f32_adreno(backend, src0, src1, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
// q4_0 x fp32
|
||||
if(src0t == GGML_TYPE_Q4_0 && src1t == GGML_TYPE_F32) {
|
||||
// TODO: remove duplicate definitions of image description + format -- move to top
|
||||
|
||||
@@ -274,6 +274,37 @@ kernel void kernel_restore_block_q8_0(
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_restore_block_q8_0_trans(
|
||||
global uchar * src_q,
|
||||
global half * src_d,
|
||||
global block_q8_0 * dst,
|
||||
uint ne00,
|
||||
uint ne01
|
||||
){
|
||||
uint num_blk_per_row = ne00 / QK8_0;
|
||||
|
||||
global block_q8_0 * b = (global block_q8_0 *) dst + get_global_id(0) * num_blk_per_row;
|
||||
global uchar * q = (global uchar *) src_q + get_global_id(0) * 4; // 4 8-bit packed
|
||||
global half * d = (global half *) src_d + get_global_id(0);
|
||||
|
||||
for (uint blk = 0; blk < num_blk_per_row; blk++) {
|
||||
b->d = *d;
|
||||
|
||||
for (uint i = 0; i < QK8_0; i+=4) {
|
||||
b->qs[i] = q[0];
|
||||
b->qs[i+1] = q[1];
|
||||
b->qs[i+2] = q[2];
|
||||
b->qs[i+3] = q[3];
|
||||
|
||||
q += 4 * ne01; // M stride
|
||||
}
|
||||
|
||||
d += ne01;
|
||||
|
||||
b++;
|
||||
}
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// kernel_convert_block_q6_K
|
||||
// Convert the block_q6_K format to 3 separate arrays (AOS -> SOA).
|
||||
|
||||
195
ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl
Normal file
195
ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl
Normal file
@@ -0,0 +1,195 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||
|
||||
#ifdef cl_qcom_reqd_sub_group_size
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
#define ADRENO_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
||||
#endif
|
||||
|
||||
#define QK8_0 32
|
||||
#define N_SIMDGROUP 4
|
||||
|
||||
#define dequantizeBlockAccum_ns_sgbroadcast_1(total_sums, bits8, scale, y) \
|
||||
float shared_y; \
|
||||
char elem; \
|
||||
\
|
||||
shared_y = sub_group_broadcast(y.s0, 0); \
|
||||
elem = (char)(bits8.s0 & 0x000000FF); \
|
||||
total_sums += convert_int(elem) * scale * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s1, 0); \
|
||||
elem = (char)((bits8.s0 & 0x0000FF00) >> 8); \
|
||||
total_sums += convert_int(elem) * scale * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s2, 0); \
|
||||
elem = (char)((bits8.s0 & 0x00FF0000) >> 16); \
|
||||
total_sums += convert_int(elem) * scale * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s3, 0); \
|
||||
elem = (char)((bits8.s0 & 0xFF000000) >> 24); \
|
||||
total_sums += convert_int(elem) * scale * shared_y; \
|
||||
\
|
||||
shared_y = sub_group_broadcast(y.s4, 0); \
|
||||
elem = (char)(bits8.s1 & 0x000000FF); \
|
||||
total_sums += convert_int(elem) * scale * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s5, 0); \
|
||||
elem = (char)((bits8.s1 & 0x0000FF00) >> 8); \
|
||||
total_sums += convert_int(elem) * scale * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s6, 0); \
|
||||
elem = (char)((bits8.s1 & 0x00FF0000) >> 16); \
|
||||
total_sums += convert_int(elem) * scale * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s7, 0); \
|
||||
elem = (char)((bits8.s1 & 0xFF000000) >> 24); \
|
||||
total_sums += convert_int(elem) * scale * shared_y; \
|
||||
\
|
||||
shared_y = sub_group_broadcast(y.s0, 1); \
|
||||
elem = (char)(bits8.s2 & 0x000000FF); \
|
||||
total_sums += convert_int(elem) * scale * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s1, 1); \
|
||||
elem = (char)((bits8.s2 & 0x0000FF00) >> 8); \
|
||||
total_sums += convert_int(elem) * scale * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s2, 1); \
|
||||
elem = (char)((bits8.s2 & 0x00FF0000) >> 16); \
|
||||
total_sums += convert_int(elem) * scale * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s3, 1); \
|
||||
elem = (char)((bits8.s2 & 0xFF000000) >> 24); \
|
||||
total_sums += convert_int(elem) * scale * shared_y; \
|
||||
\
|
||||
shared_y = sub_group_broadcast(y.s4, 1); \
|
||||
elem = (char)(bits8.s3 & 0x000000FF); \
|
||||
total_sums += convert_int(elem) * scale * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s5, 1); \
|
||||
elem = (char)((bits8.s3 & 0x0000FF00) >> 8); \
|
||||
total_sums += convert_int(elem) * scale * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s6, 1); \
|
||||
elem = (char)((bits8.s3 & 0x00FF0000) >> 16); \
|
||||
total_sums += convert_int(elem) * scale * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s7, 1); \
|
||||
elem = (char)((bits8.s3 & 0xFF000000) >> 24); \
|
||||
total_sums += convert_int(elem) * scale * shared_y; \
|
||||
\
|
||||
shared_y = sub_group_broadcast(y.s0, 2); \
|
||||
elem = (char)(bits8.s4 & 0x000000FF); \
|
||||
total_sums += convert_int(elem) * scale * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s1, 2); \
|
||||
elem = (char)((bits8.s4 & 0x0000FF00) >> 8); \
|
||||
total_sums += convert_int(elem) * scale * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s2, 2); \
|
||||
elem = (char)((bits8.s4 & 0x00FF0000) >> 16); \
|
||||
total_sums += convert_int(elem) * scale * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s3, 2); \
|
||||
elem = (char)((bits8.s4 & 0xFF000000) >> 24); \
|
||||
total_sums += convert_int(elem) * scale * shared_y; \
|
||||
\
|
||||
shared_y = sub_group_broadcast(y.s4, 2); \
|
||||
elem = (char)(bits8.s5 & 0x000000FF); \
|
||||
total_sums += convert_int(elem) * scale * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s5, 2); \
|
||||
elem = (char)((bits8.s5 & 0x0000FF00) >> 8); \
|
||||
total_sums += convert_int(elem) * scale * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s6, 2); \
|
||||
elem = (char)((bits8.s5 & 0x00FF0000) >> 16); \
|
||||
total_sums += convert_int(elem) * scale * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s7, 2); \
|
||||
elem = (char)((bits8.s5 & 0xFF000000) >> 24); \
|
||||
total_sums += convert_int(elem) * scale * shared_y; \
|
||||
\
|
||||
shared_y = sub_group_broadcast(y.s0, 3); \
|
||||
elem = (char)(bits8.s6 & 0x000000FF); \
|
||||
total_sums += convert_int(elem) * scale * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s1, 3); \
|
||||
elem = (char)((bits8.s6 & 0x0000FF00) >> 8); \
|
||||
total_sums += convert_int(elem) * scale * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s2, 3); \
|
||||
elem = (char)((bits8.s6 & 0x00FF0000) >> 16); \
|
||||
total_sums += convert_int(elem) * scale * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s3, 3); \
|
||||
elem = (char)((bits8.s6 & 0xFF000000) >> 24); \
|
||||
total_sums += convert_int(elem) * scale * shared_y; \
|
||||
\
|
||||
shared_y = sub_group_broadcast(y.s4, 3); \
|
||||
elem = (char)(bits8.s7 & 0x000000FF); \
|
||||
total_sums += convert_int(elem) * scale * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s5, 3); \
|
||||
elem = (char)((bits8.s7 & 0x0000FF00) >> 8); \
|
||||
total_sums += convert_int(elem) * scale * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s6, 3); \
|
||||
elem = (char)((bits8.s7 & 0x00FF0000) >> 16); \
|
||||
total_sums += convert_int(elem) * scale * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s7, 3); \
|
||||
elem = (char)((bits8.s7 & 0xFF000000) >> 24); \
|
||||
total_sums += convert_int(elem) * scale * shared_y; \
|
||||
|
||||
#ifdef ADRENO_GPU
|
||||
REQD_SUBGROUP_SIZE_64
|
||||
#endif
|
||||
__kernel void kernel_gemv_noshuffle(
|
||||
__read_only image1d_buffer_t src0_q, // quantized A
|
||||
global half * src0_d, // A scales
|
||||
__read_only image1d_buffer_t src1, // B
|
||||
ulong offset1, // offset to B (0)
|
||||
global float * dst, // C
|
||||
ulong offsetd, // offset to C
|
||||
int ne00, // K
|
||||
int ne01, // M
|
||||
int ne02, // 1
|
||||
int ne10, // K
|
||||
int ne12, // 1
|
||||
int ne0, // M
|
||||
int ne1, // N
|
||||
int r2, // 1
|
||||
int r3)
|
||||
{
|
||||
uint groupId = get_local_id(1);
|
||||
uint gid = get_global_id(0);
|
||||
ushort slid = get_sub_group_local_id();
|
||||
|
||||
uint K = ne00;
|
||||
uint M = ne01;
|
||||
|
||||
uint LINE_STRIDE_A = M;
|
||||
uint BLOCK_STRIDE_A = 8 * M; // 32 / 4 = 8
|
||||
|
||||
__private uint8 regA;
|
||||
__private half regS;
|
||||
__private float8 regB;
|
||||
|
||||
__private float totalSum = (float)(0.0f);
|
||||
|
||||
// loop along K in block granularity, skip 4 blocks every iter
|
||||
#pragma unroll 1 /* tell compiler not to unroll */
|
||||
for (uint k = groupId; k < (K / QK8_0); k += N_SIMDGROUP) {
|
||||
regS = src0_d[gid + k * LINE_STRIDE_A]; // each fiber loads scale of one rows
|
||||
// first 4 fibers in each wave load 8 B values to its private scope
|
||||
if (slid < 4) {
|
||||
regB.s0123 = read_imagef(src1, (slid * 2 + k * 8));
|
||||
regB.s4567 = read_imagef(src1, (1 + slid * 2 + k * 8));
|
||||
}
|
||||
|
||||
// load weights for one block in consecutive rows
|
||||
regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 0)).x;
|
||||
regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x;
|
||||
regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x;
|
||||
regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x;
|
||||
regA.s4 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x;
|
||||
regA.s5 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x;
|
||||
regA.s6 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x;
|
||||
regA.s7 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x;
|
||||
|
||||
dequantizeBlockAccum_ns_sgbroadcast_1(totalSum, regA, regS, regB);
|
||||
}
|
||||
|
||||
// reduction in local memory, assumes #wave=4
|
||||
__local float reduceLM[SIMDGROUP_WIDTH * 3];
|
||||
if (groupId == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = totalSum;
|
||||
if (groupId == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = totalSum;
|
||||
if (groupId == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = totalSum;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 0 + slid];
|
||||
if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 1 + slid];
|
||||
if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 2 + slid];
|
||||
|
||||
// 1 outputs per fiber in wave 0
|
||||
if (groupId == 0) {
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
dst[gid] = totalSum;
|
||||
}
|
||||
}
|
||||
129
ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl
Normal file
129
ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl
Normal file
@@ -0,0 +1,129 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
|
||||
#ifdef cl_qcom_reqd_sub_group_size
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
#define ADRENO_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
||||
#endif
|
||||
|
||||
#ifdef ADRENO_GPU
|
||||
REQD_SUBGROUP_SIZE_128
|
||||
#endif
|
||||
|
||||
kernel void kernel_mul_mm_q8_0_f32_8x4(
|
||||
global const uint * src0_q,
|
||||
global const half * src0_d,
|
||||
__read_only image1d_buffer_t src1,
|
||||
global float * dst,
|
||||
int k,
|
||||
int m,
|
||||
int n,
|
||||
int n_no_padding,
|
||||
ulong offsetd
|
||||
) {
|
||||
|
||||
int m_4 = m >> 2;
|
||||
int n_4 = n >> 2;
|
||||
|
||||
int gy = get_global_id(0);
|
||||
int gx = get_global_id(1);
|
||||
int gx_2 = gx << 2;
|
||||
dst = (global float *)((global char*)dst + offsetd);
|
||||
|
||||
|
||||
half8 c0 = 0, c1 = 0, c2 = 0, c3 = 0;
|
||||
half8 B;
|
||||
half4 deq;
|
||||
|
||||
__global const uint* wptr = src0_q + gx_2;
|
||||
__global const half* sptr = src0_d + gx_2;
|
||||
|
||||
for (int i = 0; i < k; i += 4) {
|
||||
uint4 pack4 = vload4(0, wptr + (i / 4) * m);
|
||||
half4 scale = vload4(0, sptr + (i / 32) * m);
|
||||
|
||||
char4 p0 = as_char4(pack4.s0);
|
||||
char4 p1 = as_char4(pack4.s1);
|
||||
char4 p2 = as_char4(pack4.s2);
|
||||
char4 p3 = as_char4(pack4.s3);
|
||||
|
||||
// ------------------- j = 0 (k = i+0) -------------------
|
||||
B.s0123 = read_imageh(src1, gy * 2 + (i + 0) * n_4);
|
||||
B.s4567 = read_imageh(src1, gy * 2 + (i + 0) * n_4 + 1);
|
||||
|
||||
half4 wj0 = convert_half4((char4)(p0.s0, p1.s0, p2.s0, p3.s0)) * scale;
|
||||
|
||||
c0 += B * wj0.s0;
|
||||
c1 += B * wj0.s1;
|
||||
c2 += B * wj0.s2;
|
||||
c3 += B * wj0.s3;
|
||||
|
||||
// ------------------- j = 1 (k = i+1) -------------------
|
||||
B.s0123 = read_imageh(src1, gy * 2 + (i + 1) * n_4);
|
||||
B.s4567 = read_imageh(src1, gy * 2 + (i + 1) * n_4 + 1);
|
||||
|
||||
half4 wj1 = convert_half4((char4)(p0.s1, p1.s1, p2.s1, p3.s1)) * scale;
|
||||
|
||||
c0 += B * wj1.s0;
|
||||
c1 += B * wj1.s1;
|
||||
c2 += B * wj1.s2;
|
||||
c3 += B * wj1.s3;
|
||||
|
||||
// ------------------- j = 2 (k = i+2) -------------------
|
||||
B.s0123 = read_imageh(src1, gy * 2 + (i + 2) * n_4);
|
||||
B.s4567 = read_imageh(src1, gy * 2 + (i + 2) * n_4 + 1);
|
||||
|
||||
half4 wj2 = convert_half4((char4)(p0.s2, p1.s2, p2.s2, p3.s2)) * scale;
|
||||
|
||||
c0 += B * wj2.s0;
|
||||
c1 += B * wj2.s1;
|
||||
c2 += B * wj2.s2;
|
||||
c3 += B * wj2.s3;
|
||||
|
||||
// ------------------- j = 3 (k = i+3) -------------------
|
||||
B.s0123 = read_imageh(src1, gy * 2 + (i + 3) * n_4);
|
||||
B.s4567 = read_imageh(src1, gy * 2 + (i + 3) * n_4 + 1);
|
||||
|
||||
half4 wj3 = convert_half4((char4)(p0.s3, p1.s3, p2.s3, p3.s3)) * scale;
|
||||
|
||||
c0 += B * wj3.s0;
|
||||
c1 += B * wj3.s1;
|
||||
c2 += B * wj3.s2;
|
||||
c3 += B * wj3.s3;
|
||||
}
|
||||
|
||||
int idx = (gy << 3) * m + (gx << 2);
|
||||
|
||||
if(idx+3 < m*n_no_padding){
|
||||
vstore4((float4)(c0.s0, c1.s0, c2.s0, c3.s0), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if(idx+3 < m*n_no_padding){
|
||||
vstore4((float4)(c0.s1, c1.s1, c2.s1, c3.s1), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if(idx+3 < m*n_no_padding){
|
||||
vstore4((float4)(c0.s2, c1.s2, c2.s2, c3.s2), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if(idx+3 < m*n_no_padding){
|
||||
vstore4((float4)(c0.s3, c1.s3, c2.s3, c3.s3), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if(idx+3 < m*n_no_padding){
|
||||
vstore4((float4)(c0.s4, c1.s4, c2.s4, c3.s4), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if(idx+3 < m*n_no_padding){
|
||||
vstore4((float4)(c0.s5, c1.s5, c2.s5, c3.s5), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if(idx+3 < m*n_no_padding){
|
||||
vstore4((float4)(c0.s6, c1.s6, c2.s6, c3.s6), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if(idx+3 < m*n_no_padding){
|
||||
vstore4((float4)(c0.s7, c1.s7, c2.s7, c3.s7), 0, dst + idx);
|
||||
}
|
||||
}
|
||||
@@ -123,6 +123,15 @@ static __dpct_inline__ T op_log(T x) {
|
||||
return sycl::log(x);
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
static __dpct_inline__ T op_softplus(T x) {
|
||||
const float xf = (float) x;
|
||||
const float ax = sycl::fabs(xf);
|
||||
const float m = sycl::fmax(xf, 0.0f);
|
||||
const float y = m + sycl::log1p(sycl::exp(-ax));
|
||||
return (T) y;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
static __dpct_inline__ T op_neg(T x) {
|
||||
return -x;
|
||||
@@ -695,6 +704,12 @@ static inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor
|
||||
});
|
||||
}
|
||||
|
||||
static inline void ggml_sycl_op_softplus(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
|
||||
return op_softplus(x);
|
||||
});
|
||||
}
|
||||
|
||||
static inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
|
||||
return op_neg(x);
|
||||
@@ -1101,6 +1116,11 @@ void ggml_sycl_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
ggml_sycl_op_log(ctx, dst);
|
||||
}
|
||||
|
||||
void ggml_sycl_softplus(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
ggml_sycl_op_softplus(ctx, dst);
|
||||
}
|
||||
|
||||
void ggml_sycl_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
ggml_sycl_op_neg(ctx, dst);
|
||||
|
||||
@@ -61,6 +61,8 @@ void ggml_sycl_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_softplus(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
@@ -2263,6 +2263,65 @@ inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_ten
|
||||
diag_mask_inf_f32_sycl(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream);
|
||||
}
|
||||
|
||||
static void tri_f32_sycl(
|
||||
const float * src,
|
||||
float * dst,
|
||||
const int64_t ne0,
|
||||
const int64_t ne1,
|
||||
const int64_t ne2,
|
||||
const int64_t ne3,
|
||||
const ggml_tri_type ttype,
|
||||
dpct::queue_ptr main_stream
|
||||
) {
|
||||
const size_t total = (size_t) ne0 * (size_t) ne1 * (size_t) ne2 * (size_t) ne3;
|
||||
|
||||
main_stream->parallel_for(sycl::range<1>(total), [=](sycl::id<1> tid) {
|
||||
const int64_t idx = (int64_t) tid[0];
|
||||
|
||||
const int64_t i0 = idx % ne0;
|
||||
const int64_t t1 = idx / ne0;
|
||||
const int64_t i1 = t1 % ne1;
|
||||
|
||||
bool keep = false;
|
||||
switch (ttype) {
|
||||
case GGML_TRI_TYPE_LOWER: keep = (i0 < i1); break;
|
||||
case GGML_TRI_TYPE_LOWER_DIAG: keep = (i0 <= i1); break;
|
||||
case GGML_TRI_TYPE_UPPER: keep = (i0 > i1); break;
|
||||
case GGML_TRI_TYPE_UPPER_DIAG: keep = (i0 >= i1); break;
|
||||
default: keep = false; break;
|
||||
}
|
||||
|
||||
dst[idx] = keep ? src[idx] : 0.0f;
|
||||
});
|
||||
}
|
||||
|
||||
static void ggml_sycl_op_tri(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
GGML_ASSERT(src0);
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
GGML_ASSERT(ggml_is_contiguous(dst));
|
||||
GGML_ASSERT(ggml_are_same_shape(src0, dst));
|
||||
|
||||
dpct::queue_ptr main_stream = ctx.stream();
|
||||
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
||||
|
||||
const float * src0_dd = static_cast<const float *>(src0->data);
|
||||
float * dst_dd = static_cast<float *>(dst->data);
|
||||
|
||||
const ggml_tri_type ttype = (ggml_tri_type) ggml_get_op_params_i32(dst, 0);
|
||||
|
||||
const int64_t ne0 = src0->ne[0];
|
||||
const int64_t ne1 = src0->ne[1];
|
||||
const int64_t ne2 = src0->ne[2];
|
||||
const int64_t ne3 = src0->ne[3];
|
||||
|
||||
tri_f32_sycl(src0_dd, dst_dd, ne0, ne1, ne2, ne3, ttype, main_stream);
|
||||
}
|
||||
|
||||
|
||||
inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
@@ -3786,6 +3845,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
|
||||
case GGML_UNARY_OP_EXP:
|
||||
ggml_sycl_exp(ctx, dst);
|
||||
break;
|
||||
case GGML_UNARY_OP_SOFTPLUS:
|
||||
ggml_sycl_softplus(ctx, dst);
|
||||
break;
|
||||
case GGML_UNARY_OP_SGN:
|
||||
ggml_sycl_sgn(ctx, dst);
|
||||
break;
|
||||
@@ -3912,6 +3974,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
|
||||
case GGML_OP_TRANSPOSE:
|
||||
GGML_SYCL_DEBUG("%s: Tensor NO-OP\n", __func__);
|
||||
break;
|
||||
case GGML_OP_TRI:
|
||||
ggml_sycl_op_tri(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_DIAG_MASK_INF:
|
||||
ggml_sycl_diag_mask_inf(ctx, dst);
|
||||
break;
|
||||
@@ -4404,6 +4469,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_UNARY_OP_GELU_QUICK:
|
||||
case GGML_UNARY_OP_GELU_ERF:
|
||||
case GGML_UNARY_OP_EXP:
|
||||
case GGML_UNARY_OP_SOFTPLUS:
|
||||
case GGML_UNARY_OP_ELU:
|
||||
return true;
|
||||
case GGML_UNARY_OP_FLOOR:
|
||||
@@ -4616,6 +4682,13 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
return true;
|
||||
case GGML_OP_CONT:
|
||||
return op->src[0]->type != GGML_TYPE_BF16;
|
||||
case GGML_OP_TRI:
|
||||
{
|
||||
const ggml_tensor * src0 = op->src[0];
|
||||
return src0 &&
|
||||
op->type == GGML_TYPE_F32 &&
|
||||
ggml_is_contiguous(src0);
|
||||
}
|
||||
case GGML_OP_DIAG_MASK_INF:
|
||||
return true;
|
||||
case GGML_OP_SOFT_MAX:
|
||||
|
||||
@@ -11956,7 +11956,8 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
|
||||
}
|
||||
}
|
||||
if (mmq) {
|
||||
ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_quantize_q8_1, num_it);
|
||||
vk_pipeline pipeline_quantize_q8_1 = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1);
|
||||
ggml_pipeline_request_descriptor_sets(ctx, pipeline_quantize_q8_1, num_it);
|
||||
}
|
||||
|
||||
ggml_pipeline_allocate_descriptor_sets(ctx);
|
||||
|
||||
@@ -114,7 +114,7 @@ struct Params {
|
||||
#define PARAMS_BINDING 4
|
||||
#endif
|
||||
|
||||
@group(0) @binding(DST_BINDING) var<storage, read_write> dst: array<f32>;
|
||||
@group(0) @binding(DST_BINDING) var<storage, read_write> dst: array<vec4<f32>>;
|
||||
@group(0) @binding(PARAMS_BINDING) var<uniform> params: Params;
|
||||
|
||||
// Just a very small float value.
|
||||
@@ -160,14 +160,21 @@ fn calc_softmax_term(kv_idx: u32, q_tile_row: u32, slope: f32) -> f32 {
|
||||
return v;
|
||||
}
|
||||
|
||||
fn load_f32x4(buf: ptr<storage, array<vec4<f32>>, read_write>, scalar_index: u32) -> vec4<f32> {
|
||||
return (*buf)[scalar_index >> 2u];
|
||||
}
|
||||
|
||||
fn load_kvx4(buf: ptr<storage, array<vec4<KV_TYPE>>, read_write>, scalar_index: u32) -> vec4<KV_TYPE> {
|
||||
return (*buf)[scalar_index >> 2u];
|
||||
}
|
||||
|
||||
@compute @workgroup_size(WG_SIZE)
|
||||
fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
|
||||
@builtin(local_invocation_id) local_id: vec3<u32>,
|
||||
@builtin(subgroup_id) subgroup_id: u32,
|
||||
@builtin(subgroup_size) subgroup_size: u32,
|
||||
@builtin(num_subgroups) num_subgroups: u32,
|
||||
@builtin(subgroup_invocation_id) sg_inv_id: u32) {
|
||||
@builtin(local_invocation_id) local_id: vec3<u32>,
|
||||
@builtin(subgroup_id) subgroup_id: u32,
|
||||
@builtin(subgroup_size) subgroup_size: u32,
|
||||
@builtin(num_subgroups) num_subgroups: u32,
|
||||
@builtin(subgroup_invocation_id) sg_inv_id: u32) {
|
||||
|
||||
// initialize row max for online softmax
|
||||
for (var i = local_id.x; i < Q_TILE; i += WG_SIZE) {
|
||||
@@ -231,9 +238,9 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
|
||||
|
||||
for (var kv_tile = 0u; kv_tile < params.seq_len_kv; kv_tile += KV_TILE) {
|
||||
// clear inter_shmem to ensure zero-initialized accumulators
|
||||
for (var elem_idx = local_id.x; elem_idx < Q_TILE * KV_TILE; elem_idx += WG_SIZE) {
|
||||
inter_shmem[elem_idx] = 0.0;
|
||||
}
|
||||
for (var elem_idx = local_id.x; elem_idx < Q_TILE * KV_TILE; elem_idx += WG_SIZE) {
|
||||
inter_shmem[elem_idx] = 0.0;
|
||||
}
|
||||
|
||||
// load k tile into shared memory
|
||||
#if defined(KV_Q4_0)
|
||||
@@ -309,48 +316,77 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
|
||||
|
||||
// accumulate q block * k block into registers across the entire KV tile
|
||||
// TODO: this loop seems to be the current largest bottleneck
|
||||
for (var kv_block = subgroup_id; kv_block < KV_BLOCKS; kv_block += num_subgroups) {
|
||||
let inter_offset = kv_block * SG_MAT_N;
|
||||
var acc: subgroup_matrix_result<f16, SG_MAT_M, SG_MAT_N> = subgroupMatrixLoad<
|
||||
subgroup_matrix_result<f16, SG_MAT_M, SG_MAT_N>>(&inter_shmem, inter_offset, false, KV_TILE);
|
||||
// this bracket exists to scope the lifetime of variables, reducing register pressure
|
||||
{
|
||||
#ifdef KV_DIRECT
|
||||
let k_block_row = kv_tile + kv_block * SG_MAT_N;
|
||||
let k_global_offset = k_head_offset + k_block_row * params.stride_k1;
|
||||
let k_block_row = kv_tile + subgroup_id * SG_MAT_N;
|
||||
var k_global_offset = k_head_offset + k_block_row * params.stride_k1;
|
||||
#else
|
||||
let k_block_offset = kv_block * SG_MAT_N * HEAD_DIM_QK;
|
||||
var k_block_offset = subgroup_id * SG_MAT_N * HEAD_DIM_QK;
|
||||
#endif
|
||||
for (var head_dim_block = 0u; head_dim_block < HEAD_DIM_QK; head_dim_block += SG_MAT_K) {
|
||||
// load q submatrix from shared memory
|
||||
var q_sg_mat: subgroup_matrix_left<f16, SG_MAT_M, SG_MAT_K> = subgroupMatrixLoad<subgroup_matrix_left<f16, SG_MAT_M, SG_MAT_K>>(
|
||||
&q_shmem,
|
||||
head_dim_block,
|
||||
false,
|
||||
HEAD_DIM_QK
|
||||
);
|
||||
for (var kv_block = subgroup_id; kv_block < KV_BLOCKS; kv_block += num_subgroups) {
|
||||
let inter_offset = kv_block * SG_MAT_N;
|
||||
var acc: subgroup_matrix_result<f16, SG_MAT_M, SG_MAT_N> = subgroupMatrixLoad<subgroup_matrix_result<f16, SG_MAT_M, SG_MAT_N>>(&inter_shmem, inter_offset, false, KV_TILE);
|
||||
|
||||
var q_cur = subgroupMatrixLoad<subgroup_matrix_left<f16, SG_MAT_M, SG_MAT_K>>(&q_shmem, 0u, false, HEAD_DIM_QK);
|
||||
|
||||
// load k submatrix from device or shared memory
|
||||
#ifdef KV_DIRECT
|
||||
var k_sg_mat: subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N> = subgroupMatrixLoad<subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N>>(
|
||||
&K,
|
||||
k_global_offset + head_dim_block,
|
||||
true,
|
||||
params.stride_k1
|
||||
);
|
||||
var k_cur = subgroupMatrixLoad<subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N>>(&K, k_global_offset + 0u, true, params.stride_k1);
|
||||
#else
|
||||
var k_sg_mat: subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N> = subgroupMatrixLoad<subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N>>(
|
||||
&kv_shmem,
|
||||
k_block_offset + head_dim_block,
|
||||
true,
|
||||
HEAD_DIM_QK
|
||||
);
|
||||
var k_cur = subgroupMatrixLoad<subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N>>(&kv_shmem, k_block_offset + 0u, true, HEAD_DIM_QK);
|
||||
#endif
|
||||
acc = subgroupMatrixMultiplyAccumulate(q_sg_mat, k_sg_mat, acc);
|
||||
|
||||
var t: u32 = 1u;
|
||||
for (; t + 1u < HEAD_DIM_QK / SG_MAT_K; t += 2u) {
|
||||
let h0 = t * SG_MAT_K;
|
||||
var q0 = subgroupMatrixLoad<subgroup_matrix_left<f16, SG_MAT_M, SG_MAT_K>>(&q_shmem, h0, false, HEAD_DIM_QK);
|
||||
#ifdef KV_DIRECT
|
||||
var k0 = subgroupMatrixLoad<subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N>>(&K, k_global_offset + h0, true, params.stride_k1);
|
||||
#else
|
||||
var k0 = subgroupMatrixLoad<subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N>>(&kv_shmem, k_block_offset + h0, true, HEAD_DIM_QK);
|
||||
#endif
|
||||
acc = subgroupMatrixMultiplyAccumulate(q_cur, k_cur, acc);
|
||||
q_cur = q0;
|
||||
k_cur = k0;
|
||||
|
||||
let h1 = (t + 1u) * SG_MAT_K;
|
||||
var q1g = subgroupMatrixLoad<subgroup_matrix_left<f16, SG_MAT_M, SG_MAT_K>>(&q_shmem, h1, false, HEAD_DIM_QK);
|
||||
#ifdef KV_DIRECT
|
||||
var k1g = subgroupMatrixLoad<subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N>>(&K, k_global_offset + h1, true, params.stride_k1);
|
||||
#else
|
||||
var k1g = subgroupMatrixLoad<subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N>>(&kv_shmem, k_block_offset + h1, true, HEAD_DIM_QK);
|
||||
#endif
|
||||
acc = subgroupMatrixMultiplyAccumulate(q_cur, k_cur, acc);
|
||||
q_cur = q1g;
|
||||
k_cur = k1g;
|
||||
}
|
||||
|
||||
// handle odd tail
|
||||
if (t < HEAD_DIM_QK / SG_MAT_K) {
|
||||
let h = t * SG_MAT_K;
|
||||
var qn = subgroupMatrixLoad<subgroup_matrix_left<f16, SG_MAT_M, SG_MAT_K>>(&q_shmem, h, false, HEAD_DIM_QK);
|
||||
#ifdef KV_DIRECT
|
||||
var kn = subgroupMatrixLoad<subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N>>(&K, k_global_offset + h, true, params.stride_k1);
|
||||
#else
|
||||
var kn = subgroupMatrixLoad<subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N>>(&kv_shmem, k_block_offset + h, true, HEAD_DIM_QK);
|
||||
#endif
|
||||
acc = subgroupMatrixMultiplyAccumulate(q_cur, k_cur, acc);
|
||||
q_cur = qn;
|
||||
k_cur = kn;
|
||||
}
|
||||
|
||||
acc = subgroupMatrixMultiplyAccumulate(q_cur, k_cur, acc);
|
||||
|
||||
#ifdef KV_DIRECT
|
||||
k_global_offset += num_subgroups * SG_MAT_N * params.stride_k1;
|
||||
#else
|
||||
k_block_offset += num_subgroups * SG_MAT_N * HEAD_DIM_QK;
|
||||
#endif
|
||||
subgroupMatrixStore(&inter_shmem, inter_offset, acc, false, KV_TILE);
|
||||
}
|
||||
|
||||
// store acc to shared memory for softmax (S matrix from paper)
|
||||
subgroupMatrixStore(&inter_shmem, inter_offset, acc, false, KV_TILE);
|
||||
}
|
||||
|
||||
|
||||
#ifdef MASK
|
||||
// load mask tile into shared memory for this KV block
|
||||
// TODO: optimize and skip if mask is -INF for the entire tile
|
||||
@@ -495,7 +531,6 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
|
||||
false,
|
||||
HEAD_DIM_V
|
||||
);
|
||||
|
||||
for (var kv_block = 0u; kv_block < KV_BLOCKS; kv_block++) {
|
||||
let p_offset = kv_block * SG_MAT_N;
|
||||
var p_sg_mat: subgroup_matrix_left<f16, SG_MAT_M, SG_MAT_K> = subgroupMatrixLoad<subgroup_matrix_left<f16, SG_MAT_M, SG_MAT_K>>(
|
||||
@@ -527,11 +562,9 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
|
||||
// O += P * V
|
||||
o_sg_mat = subgroupMatrixMultiplyAccumulate(p_sg_mat, v_sg_mat, o_sg_mat);
|
||||
}
|
||||
|
||||
// store O back to shared memory
|
||||
subgroupMatrixStore(&o_shmem, head_dim_block, o_sg_mat, false, HEAD_DIM_V);
|
||||
}
|
||||
|
||||
workgroupBarrier();
|
||||
}
|
||||
|
||||
@@ -566,26 +599,38 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
|
||||
o_shmem[idx] = f16(val);
|
||||
}
|
||||
}
|
||||
|
||||
workgroupBarrier();
|
||||
#endif
|
||||
|
||||
// write output back to global memory
|
||||
for (var q_tile_row = subgroup_id;
|
||||
q_tile_row < Q_TILE;
|
||||
q_tile_row += num_subgroups) {
|
||||
let global_q_row = q_row_start + q_tile_row;
|
||||
if (global_q_row >= params.seq_len_q) {
|
||||
break;
|
||||
}
|
||||
q_tile_row < Q_TILE;
|
||||
q_tile_row += num_subgroups) {
|
||||
|
||||
let exp_sum = exp_sum_shmem[q_tile_row];
|
||||
let scale = select(0.0, 1.0 / exp_sum, exp_sum != 0);
|
||||
let global_q_row = q_row_start + q_tile_row;
|
||||
if (global_q_row >= params.seq_len_q) { break; }
|
||||
|
||||
for (var elem_idx = sg_inv_id; elem_idx < HEAD_DIM_V; elem_idx += subgroup_size) {
|
||||
let o_val = o_shmem[q_tile_row * HEAD_DIM_V + elem_idx];
|
||||
let scaled = f32(o_val) * scale;
|
||||
dst[dst_global_offset + q_tile_row * dst2_stride + elem_idx] = scaled;
|
||||
}
|
||||
let exp_sum = exp_sum_shmem[q_tile_row];
|
||||
let scale = select(0.0, 1.0 / exp_sum, exp_sum != 0.0);
|
||||
|
||||
let row_base: u32 = dst_global_offset + q_tile_row * dst2_stride;
|
||||
|
||||
for (var elem_base = sg_inv_id * 4u;
|
||||
elem_base < HEAD_DIM_V;
|
||||
elem_base += subgroup_size * 4u) {
|
||||
|
||||
let i0 = q_tile_row * HEAD_DIM_V + (elem_base + 0u);
|
||||
let i1 = q_tile_row * HEAD_DIM_V + (elem_base + 1u);
|
||||
let i2 = q_tile_row * HEAD_DIM_V + (elem_base + 2u);
|
||||
let i3 = q_tile_row * HEAD_DIM_V + (elem_base + 3u);
|
||||
|
||||
let v = vec4<f32>(
|
||||
f32(o_shmem[i0]) * scale,
|
||||
f32(o_shmem[i1]) * scale,
|
||||
f32(o_shmem[i2]) * scale,
|
||||
f32(o_shmem[i3]) * scale
|
||||
);
|
||||
|
||||
let dst_vec_index: u32 = (row_base + elem_base) >> 2u;
|
||||
dst[dst_vec_index] = v;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-cpu.h"
|
||||
#include "zendnnl.hpp"
|
||||
|
||||
#include <cstring>
|
||||
@@ -122,8 +121,8 @@ static void ggml_zendnn_compute_forward_mul_mat(
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
ggml_type const vec_dot_type = ggml_get_type_traits_cpu(src0->type)->vec_dot_type;
|
||||
ggml_from_float_t const from_float = ggml_get_type_traits_cpu(vec_dot_type)->from_float;
|
||||
ggml_type const vec_dot_type = src0->type;
|
||||
ggml_from_float_t const from_float = ggml_get_type_traits(vec_dot_type)->from_float_ref;
|
||||
|
||||
GGML_ASSERT(ne0 == ne01);
|
||||
GGML_ASSERT(ne1 == ne11);
|
||||
|
||||
156
models/templates/upstage-Solar-Open-100B.jinja
Normal file
156
models/templates/upstage-Solar-Open-100B.jinja
Normal file
@@ -0,0 +1,156 @@
|
||||
{#- ======== Template Parameters ======== #}
|
||||
{%- set add_generation_prompt = add_generation_prompt if add_generation_prompt is defined else true %}
|
||||
{%- set default_system_prompt = default_system_prompt if default_system_prompt is defined else true %}
|
||||
{%- set reasoning_effort = reasoning_effort if reasoning_effort is defined else "high" %}
|
||||
{%- set think_render_option = think_render_option if think_render_option is defined else "lastthink" %}
|
||||
|
||||
{#- ======== System Block State ======== #}
|
||||
{%- set sys_ns = namespace(is_first_block=true) -%}
|
||||
|
||||
{#- ======== Find last user message index ======== #}
|
||||
{%- set last_user_idx = namespace(value=-1) -%}
|
||||
{%- for message in messages -%}
|
||||
{%- if message.role == 'user' -%}
|
||||
{%- set last_user_idx.value = loop.index0 -%}
|
||||
{%- endif -%}
|
||||
{%- endfor -%}
|
||||
|
||||
{#- ======== System messages renderers ======== #}
|
||||
{%- macro render_system_message(user_system_messages) %}
|
||||
{%- if default_system_prompt %}
|
||||
{%- if not sys_ns.is_first_block %}{{- "\n\n" }}{%- endif %}
|
||||
{%- set sys_ns.is_first_block = false %}
|
||||
{{- "## Provider System Prompt\n\nYou are Solar Open 100B, a large language model trained by Upstage AI, a Korean startup. Your knowledge cutoff is 2025-07. The current date is " + strftime_now("%Y-%m-%d") + "." }}
|
||||
{%- endif -%}
|
||||
{%- if user_system_messages %}
|
||||
{%- if not sys_ns.is_first_block %}{{- "\n\n" }}{%- endif %}
|
||||
{%- set sys_ns.is_first_block = false %}
|
||||
{{- "## System Prompt" }}
|
||||
{%- for system_message in user_system_messages %}
|
||||
{{- "\n\n" }}
|
||||
{{- system_message }}
|
||||
{%- endfor %}
|
||||
{%- endif -%}
|
||||
{%- endmacro %}
|
||||
|
||||
{%- macro render_tool_instruction(tools) %}
|
||||
{%- if not sys_ns.is_first_block %}{{- "\n\n" }}{%- endif %}
|
||||
{%- set sys_ns.is_first_block = false %}
|
||||
{{- "## Tools\n\n### Tool Call Instruction" }}
|
||||
{{- "\nYou may invoke one or more tools to assist with the user's query. Available tools are provided in JSON Schema format: <|tools:begin|><|tool:begin|><tools-json-object><|tool:end|>...<|tools:end|>\n" }}
|
||||
{{- "\n### Available Tools\n" }}
|
||||
{{- "<|tools:begin|>" }}
|
||||
{%- for tool in tools %}
|
||||
{{- "<|tool:begin|>" }}
|
||||
{{- tool.function | tojson }}
|
||||
{{- "<|tool:end|>" }}
|
||||
{%- endfor %}
|
||||
{{- "<|tools:end|>\n" }}
|
||||
{{- "\n### Tool Call Format\n" }}
|
||||
{{- "For each tool call, return a JSON object with the following structure, enclosed within <|tool_call:begin|> and <|tool_call:end|> tags: \n<|tool_call:begin|><tool-call-id><|tool_call:name|><tool-name><|tool_call:args|><args-json-object><|tool_call:end|>\n" }}
|
||||
{{- "- The <tool-call-id> must be a randomly generated string consisting of 10 lowercase letters (a-z) and/or digits (0-9) (e.g., a1b2c3d4e5)\n" }}
|
||||
{{- "\n### Tool Response Format\n" }}
|
||||
{{- "Each tool is responded by `tool` with the following structure:\n<|tool_response:id|><tool-call-id><|tool_response:name|><tool-name><|tool_response:result|><results><|tool_response:end|>\n" }}
|
||||
{{- "- Ensure the <tool-call-id> matches the corresponding tool call" -}}
|
||||
{%- endmacro %}
|
||||
|
||||
{%- macro render_json_response_format_instruction(response_format) %}
|
||||
{%- if not sys_ns.is_first_block %}{{- "\n\n" }}{%- endif %}
|
||||
{%- set sys_ns.is_first_block = false %}
|
||||
{{- "## Output Format Constraint" }}
|
||||
{{- "\n\nYour final response should follow the JSON schema: \n[Start of schema]" }}
|
||||
{{- response_format }}
|
||||
{{- "\n[End of schema]\nPlease ensure your answers adhere to this format and do not contain any unnecessary text." }}
|
||||
{%- endmacro %}
|
||||
|
||||
{%- macro get_tool_name(messages, tool_call_id) %}
|
||||
{%- for msg in messages -%}
|
||||
{%- if msg.role == 'assistant' and msg.tool_calls -%}
|
||||
{%- for tool_call in msg.tool_calls -%}
|
||||
{%- if tool_call.id == tool_call_id -%}
|
||||
{{- tool_call.function.name }}
|
||||
{%- endif -%}
|
||||
{%- endfor -%}
|
||||
{%- endif -%}
|
||||
{%- endfor -%}
|
||||
{%- endmacro %}
|
||||
|
||||
{%- macro render_tool_arguments(tool_arguments) %}
|
||||
{%- if tool_arguments is mapping -%}
|
||||
{{- tool_arguments | tojson }}
|
||||
{%- else -%}
|
||||
{{- tool_arguments }}
|
||||
{%- endif -%}
|
||||
{%- endmacro %}
|
||||
|
||||
{#- ======== Render system message ======== #}
|
||||
{%- set ns = namespace(system_messages=[]) -%}
|
||||
{%- for message in messages -%}
|
||||
{%- if message.role == 'system' -%}
|
||||
{%- set ns.system_messages = ns.system_messages + [message.content] -%}
|
||||
{%- endif -%}
|
||||
{%- endfor -%}
|
||||
|
||||
{%- if ns.system_messages or default_system_prompt or tools or response_format -%}
|
||||
{{- "<|begin|>system<|content|>" }}
|
||||
{{- render_system_message(ns.system_messages) }}
|
||||
{%- if tools -%}
|
||||
{{- render_tool_instruction(tools) }}
|
||||
{%- endif %}
|
||||
{%- if response_format -%}
|
||||
{{- render_json_response_format_instruction(response_format) }}
|
||||
{%- endif %}
|
||||
{{- "<|end|>" }}
|
||||
{%- endif -%}
|
||||
|
||||
{#- ======== Render main messages ======== #}
|
||||
{%- for message in messages -%}
|
||||
{%- if message.role == 'user' -%}
|
||||
{{- "<|begin|>user<|content|>" + message.content + "<|end|>" }}
|
||||
{%- elif message.role == 'tool' -%}
|
||||
{%- set prev_is_tool = loop.index0 > 0 and messages[loop.index0 - 1].role == 'tool' -%}
|
||||
{%- set next_is_tool = loop.index0 < (messages | length - 1) and messages[loop.index0 + 1].role == 'tool' -%}
|
||||
{%- if not prev_is_tool -%}
|
||||
{{- "<|begin|>tool<|tool_response|>" }}
|
||||
{%- endif -%}
|
||||
{{- "<|tool_response:begin|>" + message.tool_call_id + "<|tool_response:name|>" }}
|
||||
{{- get_tool_name(messages, message.tool_call_id) }}
|
||||
{{- "<|tool_response:result|>" }}
|
||||
{{- message.content }}
|
||||
{{- "<|tool_response:end|>" }}
|
||||
{%- if not next_is_tool -%}
|
||||
{{- "<|end|>" }}
|
||||
{%- endif -%}
|
||||
{%- elif message.role == 'assistant' -%}
|
||||
{#- ======== Assistant Thinking ======== #}
|
||||
{%- if think_render_option == "all" -%}
|
||||
{%- if message.reasoning -%}
|
||||
{{- "<|begin|>assistant<|think|>" + message.reasoning + "<|end|>" }}
|
||||
{%- endif -%}
|
||||
{%- elif think_render_option == "lastthink" -%}
|
||||
{%- if message.reasoning and loop.index0 > last_user_idx.value -%}
|
||||
{{- "<|begin|>assistant<|think|>" + message.reasoning + "<|end|>" }}
|
||||
{%- endif -%}
|
||||
{%- endif -%}
|
||||
|
||||
{#- ======== Assistant Messages ======== #}
|
||||
{%- if message.tool_calls -%}
|
||||
{{- "<|begin|>assistant<|tool_calls|>" }}
|
||||
{%- for tool_call in message.tool_calls -%}
|
||||
{{- "<|tool_call:begin|>" + tool_call.id +"<|tool_call:name|>" + tool_call.function.name + "<|tool_call:args|>" }}
|
||||
{{- render_tool_arguments(tool_call.function.arguments) }}
|
||||
{{- "<|tool_call:end|>" }}
|
||||
{%- endfor -%}
|
||||
{{- "<|calls|>" }}
|
||||
{%- else -%}
|
||||
{{- "<|begin|>assistant<|content|>" + message.content + "<|end|>" }}
|
||||
{%- endif -%}
|
||||
{%- endif -%}
|
||||
{%- endfor -%}
|
||||
|
||||
{%- if add_generation_prompt -%}
|
||||
{%- if reasoning_effort in ["low", "minimal"] -%}
|
||||
{{- "<|begin|>assistant<|think|><|end|>" }}
|
||||
{%- endif -%}
|
||||
{{- "<|begin|>assistant" }}
|
||||
{%- endif -%}
|
||||
40
scripts/snapdragon/windows/run-bench.ps1
Normal file
40
scripts/snapdragon/windows/run-bench.ps1
Normal file
@@ -0,0 +1,40 @@
|
||||
|
||||
#!/usr/bin/env pwsh
|
||||
|
||||
# Basedir on device
|
||||
$basedir=".\pkg-snapdragon"
|
||||
|
||||
$cli_opts=$args
|
||||
|
||||
$model="Llama-3.2-3B-Instruct-Q4_0.gguf"
|
||||
if ($null -ne $env:M) {
|
||||
$model=$env:M
|
||||
}
|
||||
|
||||
$device="HTP0"
|
||||
if ($null -ne $env:D) {
|
||||
$device=$env:D
|
||||
}
|
||||
|
||||
if ($null -ne $env:V) {
|
||||
$env:GGML_HEXAGON_VERBOSE=$env:V
|
||||
}
|
||||
|
||||
if ($null -ne $env:OPMASK) {
|
||||
$env:GGML_HEXAGON_OPMASK=$env:OPMASK
|
||||
}
|
||||
|
||||
if ($null -ne $env:NHVX) {
|
||||
$env:GGML_HEXAGON_NHVX=$env:NHVX
|
||||
}
|
||||
|
||||
if ($null -ne $env:NDEV) {
|
||||
$env:GGML_HEXAGON_NDEV=$env:NDEV
|
||||
}
|
||||
|
||||
$env:ADSP_LIBRARY_PATH="$basedir\lib"
|
||||
|
||||
& "$basedir\bin\llama-bench.exe" `
|
||||
--mmap 0 -m $basedir\..\..\gguf\$model `
|
||||
--poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 `
|
||||
--batch-size 128 -ngl 99 --device $device $cli_opts
|
||||
53
scripts/snapdragon/windows/run-cli.ps1
Normal file
53
scripts/snapdragon/windows/run-cli.ps1
Normal file
@@ -0,0 +1,53 @@
|
||||
|
||||
#!/usr/bin/env pwsh
|
||||
|
||||
# Basedir on device
|
||||
$basedir=".\pkg-snapdragon"
|
||||
|
||||
$cli_opts=$args
|
||||
|
||||
$model="Llama-3.2-3B-Instruct-Q4_0.gguf"
|
||||
if ($null -ne $env:M) {
|
||||
$model=$env:M
|
||||
}
|
||||
|
||||
$device="HTP0"
|
||||
if ($null -ne $env:D) {
|
||||
$device=$env:D
|
||||
}
|
||||
|
||||
if ($null -ne $env:V) {
|
||||
$env:GGML_HEXAGON_VERBOSE=$env:V
|
||||
}
|
||||
|
||||
if ($null -ne $env:E) {
|
||||
$env:GGML_HEXAGON_EXPERIMENTAL=$env:E
|
||||
}
|
||||
|
||||
if ($null -ne $env:SCHED) {
|
||||
$env:GGML_SCHED_DEBUG=$env:SCHED; $cli_opts="$cli_opts -v"
|
||||
}
|
||||
|
||||
if ($null -ne $env:PROF) {
|
||||
$env:GGML_HEXAGON_PROFILE=$env:PROF; $env:GGML_HEXAGON_OPSYNC=1
|
||||
}
|
||||
|
||||
if ($null -ne $env:OPMASK) {
|
||||
$env:GGML_HEXAGON_OPMASK=$env:OPMASK
|
||||
}
|
||||
|
||||
if ($null -ne $env:NHVX) {
|
||||
$env:GGML_HEXAGON_NHVX=$env:NHVX
|
||||
}
|
||||
|
||||
if ($null -ne $env:NDEV) {
|
||||
$env:GGML_HEXAGON_NDEV=$env:NDEV
|
||||
}
|
||||
|
||||
$env:ADSP_LIBRARY_PATH="$basedir\lib"
|
||||
|
||||
& "$basedir\bin\llama-completion.exe" `
|
||||
--no-mmap -no-cnv -m $basedir\..\..\gguf\$model `
|
||||
--poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 `
|
||||
--ctx-size 8192 --batch-size 128 -ctk q8_0 -ctv q8_0 -fa on `
|
||||
-ngl 99 --device $device $cli_opts
|
||||
56
scripts/snapdragon/windows/run-tool.ps1
Normal file
56
scripts/snapdragon/windows/run-tool.ps1
Normal file
@@ -0,0 +1,56 @@
|
||||
|
||||
#!/usr/bin/env pwsh
|
||||
|
||||
# Basedir on device
|
||||
$basedir=".\pkg-snapdragon"
|
||||
|
||||
if ($args.Count -eq 0) {
|
||||
Write-Host "No arguments provided.Expected the tool and argument to run."
|
||||
exit -1
|
||||
}
|
||||
|
||||
$tool=$args[0]
|
||||
$cli_opts=@()
|
||||
|
||||
if ($args.Count -gt 1) {
|
||||
$cli_opts=$args[1..($args.Count - 1)]
|
||||
$remainingArgs = $args[1..($args.Count - 1)]
|
||||
}
|
||||
|
||||
$device="HTP0"
|
||||
if ($null -ne $env:D) {
|
||||
$device=$env:D
|
||||
}
|
||||
|
||||
if ($null -ne $env:V) {
|
||||
$env:GGML_HEXAGON_VERBOSE=$env:V
|
||||
}
|
||||
|
||||
if ($null -ne $env:E) {
|
||||
$env:GGML_HEXAGON_EXPERIMENTAL=$env:E
|
||||
}
|
||||
|
||||
if ($null -ne $env:SCHED) {
|
||||
$env:GGML_SCHED_DEBUG=$env:SCHED; $cli_opts="$cli_opts -v"
|
||||
}
|
||||
|
||||
if ($null -ne $env:PROF) {
|
||||
$env:GGML_HEXAGON_PROFILE=$env:PROF; $env:GGML_HEXAGON_OPSYNC=1
|
||||
}
|
||||
|
||||
if ($null -ne $env:OPMASK) {
|
||||
$env:GGML_HEXAGON_OPMASK=$env:OPMASK
|
||||
}
|
||||
|
||||
if ($null -ne $env:NHVX) {
|
||||
$env:GGML_HEXAGON_NHVX=$env:NHVX
|
||||
}
|
||||
|
||||
if ($null -ne $env:NDEV) {
|
||||
$env:GGML_HEXAGON_NDEV=$env:NDEV
|
||||
}
|
||||
|
||||
$env:ADSP_LIBRARY_PATH="$basedir\lib"
|
||||
|
||||
& "$basedir\bin\$tool" `
|
||||
$cli_opts
|
||||
105
scripts/snapdragon/windows/setup-build.ps1
Normal file
105
scripts/snapdragon/windows/setup-build.ps1
Normal file
@@ -0,0 +1,105 @@
|
||||
# Requires Run as Administrator is NOT strictly necessary for User-scope env vars,
|
||||
# but recommended for creating directories in C:\ root if permissions are restricted.
|
||||
|
||||
$ErrorActionPreference = "Stop"
|
||||
|
||||
# --- Configuration ---
|
||||
$BaseDir = "C:\Qualcomm"
|
||||
|
||||
# SDK 1: Hexagon
|
||||
$HexagonUrl = "https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v6.4.0.2/hexagon-sdk-v6.4.0.2-arm64-wos.tar.xz"
|
||||
$HexagonParent = Join-Path $BaseDir "Hexagon_SDK"
|
||||
$HexagonSdkVersion = "6.4.0.2"
|
||||
$HexagonToolsVersion = "19.0.04"
|
||||
$HexagonSdkTarget = Join-Path $HexagonParent $HexagonSdkVersion
|
||||
$HexagonToolsTarget = Join-Path $HexagonSdkTarget "\tools\HEXAGON_Tools\$HexagonToolsVersion"
|
||||
|
||||
# SDK 2: OpenCL
|
||||
$OpenCLUrl = "https://github.com/snapdragon-toolchain/opencl-sdk/releases/download/v2.3.2/adreno-opencl-sdk-v2.3.2-arm64-wos.tar.xz"
|
||||
$OpenCLParent = Join-Path $BaseDir "OpenCL_SDK"
|
||||
$OpenCLVersion = "2.3.2"
|
||||
$OpenCLTarget = Join-Path $OpenCLParent $OpenCLVersion
|
||||
|
||||
# --- Helper Function ---
|
||||
function Install-QualcommSDK {
|
||||
param (
|
||||
[string]$Url,
|
||||
[string]$ParentDir,
|
||||
[string]$TargetDir,
|
||||
[string]$Name
|
||||
)
|
||||
|
||||
# 1. Create Parent Directory
|
||||
if (-not (Test-Path -Path $ParentDir)) {
|
||||
Write-Host "Creating directory: $ParentDir" -ForegroundColor Cyan
|
||||
New-Item -Path $ParentDir -ItemType Directory -Force | Out-Null
|
||||
}
|
||||
|
||||
# 2. Check for Specific Version Directory
|
||||
if (Test-Path -Path $TargetDir) {
|
||||
Write-Host "$Name ($TargetDir) already exists. Skipping download." -ForegroundColor Green
|
||||
}
|
||||
else {
|
||||
Write-Host "$Name not found. preparing to download..." -ForegroundColor Yellow
|
||||
|
||||
# Create the target directory to extract into
|
||||
New-Item -Path $TargetDir -ItemType Directory -Force | Out-Null
|
||||
|
||||
# Define temporary archive path
|
||||
$TempFile = Join-Path $ParentDir "temp_sdk.tar.xz"
|
||||
|
||||
try {
|
||||
# Download
|
||||
Write-Host "Downloading from: $Url"
|
||||
Invoke-WebRequest -Uri $Url -OutFile $TempFile
|
||||
|
||||
# Untar
|
||||
# Note: We assume Windows includes tar.exe (Win 10 build 17063+)
|
||||
Write-Host "Extracting archive to $TargetDir..."
|
||||
|
||||
# We use -C to extract contents INTO the target directory created above
|
||||
tar -xJvf $TempFile -C $TargetDir\..
|
||||
|
||||
Write-Host "Extraction complete." -ForegroundColor Green
|
||||
}
|
||||
catch {
|
||||
Write-Error "Failed to download or extract $Name. Error: $_"
|
||||
# Cleanup target dir if failed so script tries again next time
|
||||
Remove-Item -Path $TargetDir -Recurse -Force -ErrorAction SilentlyContinue
|
||||
}
|
||||
finally {
|
||||
# Cleanup Archive
|
||||
if (Test-Path $TempFile) { Remove-Item $TempFile -Force }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# --- Execution ---
|
||||
|
||||
# 1. Ensure Base C:\Qualcomm exists
|
||||
if (-not (Test-Path $BaseDir)) {
|
||||
New-Item -Path $BaseDir -ItemType Directory -Force | Out-Null
|
||||
}
|
||||
|
||||
# 2. Run Install Logic
|
||||
Install-QualcommSDK -Url $HexagonUrl -ParentDir $HexagonParent -TargetDir $HexagonSdkTarget -Name "Hexagon SDK"
|
||||
Install-QualcommSDK -Url $OpenCLUrl -ParentDir $OpenCLParent -TargetDir $OpenCLTarget -Name "OpenCL SDK"
|
||||
|
||||
# --- Environment Variables ---
|
||||
|
||||
Write-Host "`nSetting Environment Variables..." -ForegroundColor Cyan
|
||||
|
||||
# Set OPENCL_SDK_ROOT
|
||||
[System.Environment]::SetEnvironmentVariable('OPENCL_SDK_ROOT', $OpenCLTarget, [System.EnvironmentVariableTarget]::User)
|
||||
$env:OPENCL_SDK_ROOT = $OpenCLTarget # Set for current session as well
|
||||
Write-Host "OPENCL_SDK_ROOT set to: $OpenCLTarget"
|
||||
|
||||
# Set HEXAGON_SDK_ROOT
|
||||
[System.Environment]::SetEnvironmentVariable('HEXAGON_SDK_ROOT', $HexagonSdkTarget, [System.EnvironmentVariableTarget]::User)
|
||||
$env:HEXAGON_SDK_ROOT = $HexagonSdkTarget # Set for current session as well
|
||||
Write-Host "HEXAGON_SDK_ROOT set to: $HexagonSdkTarget"
|
||||
|
||||
# Set HEXAGON_SDK_ROOT
|
||||
[System.Environment]::SetEnvironmentVariable('HEXAGON_TOOLS_ROOT', $HexagonToolsTarget, [System.EnvironmentVariableTarget]::User)
|
||||
$env:HEXAGON_TOOLS_ROOT = $HexagonToolsTarget # Set for current session as well
|
||||
Write-Host "HEXAGON_TOOLS_ROOT set to: $HexagonToolsTarget"
|
||||
@@ -1 +1 @@
|
||||
ebc3a0f4a56be1c9424a89fbec09962ac34fde85
|
||||
a8db410a252c8c8f2d120c6f2e7133ebe032f35d
|
||||
|
||||
@@ -1630,11 +1630,6 @@ ggml_tensor * llm_graph_context::build_attn_mha(
|
||||
hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f);
|
||||
cb(cur, LLAMA_TENSOR_NAME_FATTN, il);
|
||||
|
||||
if (!cparams.offload_kqv) {
|
||||
// all nodes between the KV store and the attention output are run on the CPU
|
||||
ggml_backend_sched_set_tensor_backend(sched, cur, backend_cpu);
|
||||
}
|
||||
|
||||
ggml_flash_attn_ext_add_sinks(cur, sinks);
|
||||
ggml_flash_attn_ext_set_prec (cur, GGML_PREC_F32);
|
||||
|
||||
|
||||
@@ -1772,8 +1772,6 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t
|
||||
io.write(&v_trans, sizeof(v_trans));
|
||||
io.write(&n_layer, sizeof(n_layer));
|
||||
|
||||
std::vector<uint8_t> tmp_buf;
|
||||
|
||||
// Iterate and write all the keys first, each row is a cell
|
||||
// Get whole range at a time
|
||||
for (const auto & layer : layers) {
|
||||
@@ -1791,7 +1789,7 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t
|
||||
const uint64_t k_size_row = ggml_row_size(k->type, n_embd_k_gqa);
|
||||
io.write(&k_size_row, sizeof(k_size_row));
|
||||
|
||||
// Read each range of cells of k_size length each into tmp_buf and write out
|
||||
// Read each range of cells of k_size length and write out
|
||||
for (const auto & range : cr.data) {
|
||||
const size_t range_size = range.second - range.first;
|
||||
const size_t buf_size = range_size * k_size_row;
|
||||
@@ -1818,7 +1816,7 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t
|
||||
const uint64_t v_size_row = ggml_row_size(v->type, n_embd_v_gqa);
|
||||
io.write(&v_size_row, sizeof(v_size_row));
|
||||
|
||||
// Read each range of cells of v_size length each into tmp_buf and write out
|
||||
// Read each range of cells of v_size length and write out
|
||||
for (const auto & range : cr.data) {
|
||||
const size_t range_size = range.second - range.first;
|
||||
const size_t buf_size = range_size * v_size_row;
|
||||
@@ -1852,7 +1850,7 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t
|
||||
|
||||
// For each row, we get the element values of each cell
|
||||
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
|
||||
// Read each range of cells of v_size_el length each into tmp_buf and write out
|
||||
// Read each range of cells of v_size_el length and write out
|
||||
for (const auto & range : cr.data) {
|
||||
const size_t range_size = range.second - range.first;
|
||||
const size_t src_offset = (range.first + j * kv_size) * v_size_el;
|
||||
|
||||
@@ -785,23 +785,21 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
|
||||
io.write(&s_trans, sizeof(s_trans));
|
||||
io.write(&n_layer, sizeof(n_layer));
|
||||
|
||||
std::vector<uint8_t> tmp_buf;
|
||||
|
||||
// Iterate and write all the keys first, each row is a cell
|
||||
// Iterate and write all the R tensors first, each row is a cell
|
||||
// Get whole range at a time
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
// skip null layers (read_data will handle this by checking "r_l" and "s_l" for null)
|
||||
if (r_l[il] == nullptr) continue;
|
||||
|
||||
// Write key type
|
||||
// Write R tensor type
|
||||
const int32_t r_type_i = (int32_t)r_l[il]->type;
|
||||
io.write(&r_type_i, sizeof(r_type_i));
|
||||
|
||||
// Write row size of key
|
||||
// Write row size of R tensor
|
||||
const uint64_t r_size_row = ggml_row_size(r_l[il]->type, hparams.n_embd_r());
|
||||
io.write(&r_size_row, sizeof(r_size_row));
|
||||
|
||||
// Read each range of cells of k_size length each into tmp_buf and write out
|
||||
// Write each range of cells of r_size_row length
|
||||
for (const auto & range : cell_ranges) {
|
||||
const size_t range_size = range.second - range.first;
|
||||
const size_t buf_size = range_size * r_size_row;
|
||||
@@ -814,15 +812,15 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
|
||||
// skip null layers (read_data will handle this by checking "r_l" and "s_l" for null)
|
||||
if (s_l[il] == nullptr) continue;
|
||||
|
||||
// Write value type
|
||||
// Write S tensor type
|
||||
const int32_t s_type_i = (int32_t)s_l[il]->type;
|
||||
io.write(&s_type_i, sizeof(s_type_i));
|
||||
|
||||
// Write row size of value
|
||||
// Write row size of S tensor
|
||||
const uint64_t s_size_row = ggml_row_size(s_l[il]->type, hparams.n_embd_s());
|
||||
io.write(&s_size_row, sizeof(s_size_row));
|
||||
|
||||
// Read each range of cells of s_size length each into tmp_buf and write out
|
||||
// Write each range of S tensor rows
|
||||
for (const auto & range : cell_ranges) {
|
||||
const size_t range_size = range.second - range.first;
|
||||
const size_t buf_size = range_size * s_size_row;
|
||||
@@ -830,7 +828,7 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// When v is transposed, we also need the element size and get the element ranges from each row
|
||||
// When S tensor is transposed, we also need the element size and get the element ranges from each row
|
||||
const uint32_t mem_size = size;
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
// skip null layers (read_data will handle this by checking "r_l" and "s_l" for null)
|
||||
@@ -838,7 +836,7 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
|
||||
|
||||
const uint32_t n_embd_s = hparams.n_embd_s();
|
||||
|
||||
// Write value type
|
||||
// Write S tensor type
|
||||
const int32_t s_type_i = (int32_t)s_l[il]->type;
|
||||
io.write(&s_type_i, sizeof(s_type_i));
|
||||
|
||||
@@ -851,7 +849,7 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
|
||||
|
||||
// For each row, we get the element values of each cell
|
||||
for (uint32_t j = 0; j < n_embd_s; ++j) {
|
||||
// Read each range of cells of v_size_el length each into tmp_buf and write out
|
||||
// Write each range of cells of s_size_el length
|
||||
for (const auto & range : cell_ranges) {
|
||||
const size_t range_size = range.second - range.first;
|
||||
const size_t src_offset = (range.first + j * mem_size) * s_size_el;
|
||||
|
||||
@@ -8213,11 +8213,13 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
if (!mask && max_bias > 0.0f) continue;
|
||||
for (float logit_softcap : {0.0f, 10.0f}) {
|
||||
if (hsk != 128 && logit_softcap != 0.0f) continue;
|
||||
for (int nh : { 4, }) {
|
||||
for (int nh : { 1, 4 }) {
|
||||
if (nh == 1 && hsk != 576) continue; // GLM 4.7 Flash
|
||||
for (int nr3 : { 1, 3, }) {
|
||||
if (hsk > 64 && nr3 > 1) continue; // skip broadcast for large head sizes
|
||||
for (int nr2 : { 1, 4, 12 }) {
|
||||
for (int nr2 : { 1, 4, 12, 20 }) {
|
||||
if (nr2 == 12 && hsk != 128) continue;
|
||||
if (nr2 == 20 && (nh != 1 || hsk != 576)) continue;
|
||||
//for (int kv : { 1, 17, 31, 33, 61, 113, 65, 127, 129, 130, 255, 260, 371, 380, 407, 512, 1024, }) {
|
||||
for (int kv : { 113, 512, 1024, }) {
|
||||
if (nr2 != 1 && kv != 512) continue;
|
||||
|
||||
@@ -54,7 +54,6 @@ std::string DEFAULT_JSON = R"({
|
||||
],
|
||||
"bos_token": "<s>",
|
||||
"eos_token": "</s>",
|
||||
"tools": [],
|
||||
"add_generation_prompt": true
|
||||
})";
|
||||
|
||||
@@ -481,7 +480,7 @@ int main_automated_tests(void) {
|
||||
/* .name= */ "Mistral-Large-Instruct-2407 (mistralai 'v3' template; modified to have system prompt at start)",
|
||||
/* .template_str= */ "{%- if messages[0][\"role\"] == \"system\" %}\n {%- set system_message = messages[0][\"content\"] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n{%- set user_messages = loop_messages | selectattr(\"role\", \"equalto\", \"user\") | list %}\n\n{#- This block checks for alternating user/assistant messages, skipping tool calling messages #}\n{%- set ns = namespace() %}\n{%- set ns.index = 0 %}\n{%- for message in loop_messages %}\n {%- if not (message.role == \"tool\" or message.role == \"tool_results\" or (message.tool_calls is defined and message.tool_calls is not none)) %}\n {%- if (message[\"role\"] == \"user\") != (ns.index % 2 == 0) %}\n {{- raise_exception(\"After the optional system message, conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif %}\n {%- set ns.index = ns.index + 1 %}\n {%- endif %}\n{%- endfor %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n {%- if message[\"role\"] == \"user\" %}\n {%- if tools is not none and (message == user_messages[-1]) %}\n {{- \"[AVAILABLE_TOOLS] [\" }}\n {%- for tool in tools %}\n {%- set tool = tool.function %}\n {{- '{\"type\": \"function\", \"function\": {' }}\n {%- for key, val in tool.items() if key != \"return\" %}\n {%- if val is string %}\n {{- '\"' + key + '\": \"' + val + '\"' }}\n {%- else %}\n {{- '\"' + key + '\": ' + val|tojson }}\n {%- endif %}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \"}}\" }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- else %}\n {{- \"]\" }}\n {%- endif %}\n {%- endfor %}\n {{- \"[/AVAILABLE_TOOLS]\" }}\n {%- endif %}\n {%- if loop.last and system_message is defined %}\n {{- \"[INST] \" + system_message + \"\\n\\n\" + message[\"content\"] + \"[/INST]\" }}\n {%- else %}\n {{- \"[INST] \" + message[\"content\"] + \"[/INST]\" }}\n {%- endif %}\n {%- elif message.tool_calls is defined and message.tool_calls is not none %}\n {{- \"[TOOL_CALLS] [\" }}\n {%- for tool_call in message.tool_calls %}\n {%- set out = tool_call.function|tojson %}\n {{- out[:-1] }}\n {%- if not tool_call.id is defined or tool_call.id|length != 9 %}\n {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n {%- endif %}\n {{- ', \"id\": \"' + tool_call.id + '\"}' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- else %}\n {{- \"]\" + eos_token }}\n {%- endif %}\n {%- endfor %}\n {%- elif message[\"role\"] == \"assistant\" %}\n {{- \" \" + message[\"content\"]|trim + eos_token}}\n {%- elif message[\"role\"] == \"tool_results\" or message[\"role\"] == \"tool\" %}\n {%- if message.content is defined and message.content.content is defined %}\n {%- set content = message.content.content %}\n {%- else %}\n {%- set content = message.content %}\n {%- endif %}\n {{- '[TOOL_RESULTS] {\"content\": ' + content|string + \", \" }}\n {%- if not message.tool_call_id is defined or message.tool_call_id|length != 9 %}\n {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n {%- endif %}\n {{- '\"call_id\": \"' + message.tool_call_id + '\"}[/TOOL_RESULTS]' }}\n {%- else %}\n {{- raise_exception(\"Only user and assistant roles are supported, with the exception of an initial optional system message!\") }}\n {%- endif %}\n{%- endfor %}\n",
|
||||
/* .expected_output= */ "[INST] You are a helpful assistant\n\nHello[/INST] Hi there</s>[INST] Who are you[/INST] I am an assistant</s>[INST] Another question[/INST]",
|
||||
/* .expected_output_jinja= */ "[INST] Hello[/INST] Hi there</s>[INST] Who are you[/INST] I am an assistant</s>[AVAILABLE_TOOLS] [[/AVAILABLE_TOOLS][INST] You are a helpful assistant\n\nAnother question[/INST]",
|
||||
/* .expected_output_jinja= */ "[INST] Hello[/INST] Hi there</s>[INST] Who are you[/INST] I am an assistant</s>[INST] You are a helpful assistant\n\nAnother question[/INST]",
|
||||
/* .bos_token= */ "",
|
||||
/* .eos_token= */ "</s>",
|
||||
},
|
||||
@@ -489,7 +488,7 @@ int main_automated_tests(void) {
|
||||
/* .name= */ "Mistral-Nemo-Instruct-2407 (mistralai 'v3-tekken' template; modified to have system prompt at start)",
|
||||
/* .template_str= */ "{%- if messages[0][\"role\"] == \"system\" %}\n {%- set system_message = messages[0][\"content\"] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n{%- set user_messages = loop_messages | selectattr(\"role\", \"equalto\", \"user\") | list %}\n\n{#- This block checks for alternating user/assistant messages, skipping tool calling messages #}\n{%- set ns = namespace() %}\n{%- set ns.index = 0 %}\n{%- for message in loop_messages %}\n {%- if not (message.role == \"tool\" or message.role == \"tool_results\" or (message.tool_calls is defined and message.tool_calls is not none)) %}\n {%- if (message[\"role\"] == \"user\") != (ns.index % 2 == 0) %}\n {{- raise_exception(\"After the optional system message, conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif %}\n {%- set ns.index = ns.index + 1 %}\n {%- endif %}\n{%- endfor %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n {%- if message[\"role\"] == \"user\" %}\n {%- if tools is not none and (message == user_messages[-1]) %}\n {{- \"[AVAILABLE_TOOLS][\" }}\n {%- for tool in tools %}\n {%- set tool = tool.function %}\n {{- '{\"type\": \"function\", \"function\": {' }}\n {%- for key, val in tool.items() if key != \"return\" %}\n {%- if val is string %}\n {{- '\"' + key + '\": \"' + val + '\"' }}\n {%- else %}\n {{- '\"' + key + '\": ' + val|tojson }}\n {%- endif %}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \"}}\" }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- else %}\n {{- \"]\" }}\n {%- endif %}\n {%- endfor %}\n {{- \"[/AVAILABLE_TOOLS]\" }}\n {%- endif %}\n {%- if loop.last and system_message is defined %}\n {{- \"[INST]\" + system_message + \"\\n\\n\" + message[\"content\"] + \"[/INST]\" }}\n {%- else %}\n {{- \"[INST]\" + message[\"content\"] + \"[/INST]\" }}\n {%- endif %}\n {%- elif (message.tool_calls is defined and message.tool_calls is not none) %}\n {{- \"[TOOL_CALLS][\" }}\n {%- for tool_call in message.tool_calls %}\n {%- set out = tool_call.function|tojson %}\n {{- out[:-1] }}\n {%- if not tool_call.id is defined or tool_call.id|length != 9 %}\n {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n {%- endif %}\n {{- ', \"id\": \"' + tool_call.id + '\"}' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- else %}\n {{- \"]\" + eos_token }}\n {%- endif %}\n {%- endfor %}\n {%- elif message[\"role\"] == \"assistant\" %}\n {{- message[\"content\"] + eos_token}}\n {%- elif message[\"role\"] == \"tool_results\" or message[\"role\"] == \"tool\" %}\n {%- if message.content is defined and message.content.content is defined %}\n {%- set content = message.content.content %}\n {%- else %}\n {%- set content = message.content %}\n {%- endif %}\n {{- '[TOOL_RESULTS]{\"content\": ' + content|string + \", \" }}\n {%- if not message.tool_call_id is defined or message.tool_call_id|length != 9 %}\n {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n {%- endif %}\n {{- '\"call_id\": \"' + message.tool_call_id + '\"}[/TOOL_RESULTS]' }}\n {%- else %}\n {{- raise_exception(\"Only user and assistant roles are supported, with the exception of an initial optional system message!\") }}\n {%- endif %}\n{%- endfor %}\n",
|
||||
/* .expected_output= */ "[INST]You are a helpful assistant\n\nHello[/INST]Hi there</s>[INST]Who are you[/INST] I am an assistant </s>[INST]Another question[/INST]",
|
||||
/* .expected_output_jinja= */ "[INST]Hello[/INST]Hi there</s>[INST]Who are you[/INST] I am an assistant </s>[AVAILABLE_TOOLS][[/AVAILABLE_TOOLS][INST]You are a helpful assistant\n\nAnother question[/INST]",
|
||||
/* .expected_output_jinja= */ "[INST]Hello[/INST]Hi there</s>[INST]Who are you[/INST] I am an assistant </s>[INST]You are a helpful assistant\n\nAnother question[/INST]",
|
||||
/* .bos_token= */ "",
|
||||
/* .eos_token= */ "</s>",
|
||||
},
|
||||
|
||||
@@ -592,7 +592,7 @@ static void test_peg_parser(common_chat_templates * tmpls, const std::function<v
|
||||
}
|
||||
if (diff.tool_call_index != std::string::npos) {
|
||||
if (!diff.tool_call_delta.name.empty()) {
|
||||
msg_accum.tool_calls.push_back({diff.tool_call_delta.name, "", ""});
|
||||
msg_accum.tool_calls.push_back({diff.tool_call_delta.name, "", diff.tool_call_delta.id});
|
||||
}
|
||||
if (!diff.tool_call_delta.arguments.empty()) {
|
||||
msg_accum.tool_calls.back().arguments += diff.tool_call_delta.arguments;
|
||||
@@ -3799,6 +3799,134 @@ static void test_template_output_peg_parsers() {
|
||||
});
|
||||
}
|
||||
|
||||
{
|
||||
// Solar-Open-100B
|
||||
auto tmpls = read_templates("models/templates/upstage-Solar-Open-100B.jinja");
|
||||
|
||||
// Test basic message
|
||||
test_peg_parser(tmpls.get(), [&](auto & t) {
|
||||
t.input = "<|content|>Hello, world!\nWhat's up?";
|
||||
t.expect = message_assist;
|
||||
});
|
||||
|
||||
// Test basic message and reasoning
|
||||
test_peg_parser(tmpls.get(), [&](auto & t) {
|
||||
t.input = "<|think|>I'm\nthinking<|end|><|begin|>assistant<|content|>Hello, world!\nWhat's up?";
|
||||
t.expect = message_assist_thoughts;
|
||||
});
|
||||
|
||||
// Test basic message and reasoning_effort = low
|
||||
test_peg_parser(tmpls.get(), [&](auto & t) {
|
||||
t.input = "<|content|>Hello, world!\nWhat's up?";
|
||||
t.params.chat_template_kwargs["reasoning_effort"] = "\"low\"";
|
||||
t.expect = message_assist;
|
||||
});
|
||||
|
||||
// Test tool call
|
||||
test_peg_parser(tmpls.get(), [&](auto & t) {
|
||||
t.input = "<|tool_calls|>"
|
||||
"<|tool_call:begin|>123456789"
|
||||
"<|tool_call:name|>special_function"
|
||||
"<|tool_call:args|>{\"arg1\":1}"
|
||||
"<|tool_call:end|>";
|
||||
|
||||
t.params.chat_template_kwargs["reasoning_effort"] = "\"low\"";
|
||||
t.params.tools = {special_function_tool};
|
||||
t.expect = message_assist_call_id;
|
||||
});
|
||||
|
||||
// Test tool call with reasoning
|
||||
test_peg_parser(tmpls.get(), [&](auto & t) {
|
||||
t.input = "<|think|>I'm\nthinking<|end|>"
|
||||
"<|begin|>assistant<|tool_calls|>"
|
||||
"<|tool_call:begin|>0"
|
||||
"<|tool_call:name|>special_function"
|
||||
"<|tool_call:args|>{\"arg1\":1}"
|
||||
"<|tool_call:end|>";
|
||||
|
||||
t.params.tools = {special_function_tool};
|
||||
t.expect = message_assist_thoughts_call_idx;
|
||||
});
|
||||
|
||||
// Test tool call with reasoning and tool_choice = required
|
||||
test_peg_parser(tmpls.get(), [&](auto & t) {
|
||||
t.input = "<|think|>I'm\nthinking<|end|>"
|
||||
"<|begin|>assistant<|tool_calls|>"
|
||||
"<|tool_call:begin|>0"
|
||||
"<|tool_call:name|>special_function"
|
||||
"<|tool_call:args|>{\"arg1\":1}"
|
||||
"<|tool_call:end|>";
|
||||
|
||||
t.params.tools = {special_function_tool};
|
||||
t.params.tool_choice = COMMON_CHAT_TOOL_CHOICE_REQUIRED;
|
||||
t.expect = message_assist_thoughts_call_idx;
|
||||
});
|
||||
|
||||
// Test tool call without reasoning and tool_choice = required
|
||||
test_peg_parser(tmpls.get(), [&](auto & t) {
|
||||
t.input = "<|tool_calls|>"
|
||||
"<|tool_call:begin|>0"
|
||||
"<|tool_call:name|>special_function"
|
||||
"<|tool_call:args|>{\"arg1\":1}"
|
||||
"<|tool_call:end|>";
|
||||
|
||||
t.params.tools = {special_function_tool};
|
||||
t.params.tool_choice = COMMON_CHAT_TOOL_CHOICE_REQUIRED;
|
||||
t.params.chat_template_kwargs["reasoning_effort"] = "\"low\"";
|
||||
t.expect = message_assist_call_idx;
|
||||
});
|
||||
|
||||
// Test parallel tool calls
|
||||
test_peg_parser(tmpls.get(), [&](auto & t) {
|
||||
t.input = "<|think|>I'm\nthinking<|end|>"
|
||||
"<|begin|>assistant<|tool_calls|>"
|
||||
"<|tool_call:begin|>0"
|
||||
"<|tool_call:name|>special_function"
|
||||
"<|tool_call:args|>{\"arg1\":1}"
|
||||
"<|tool_call:end|>"
|
||||
"<|tool_call:begin|>1"
|
||||
"<|tool_call:name|>special_function_with_opt"
|
||||
"<|tool_call:args|>{\"arg1\": 1, \"arg2\": 2}"
|
||||
"<|tool_call:end|>";
|
||||
|
||||
t.params.parallel_tool_calls = true;
|
||||
t.params.tools = {special_function_tool, special_function_tool_with_optional_param};
|
||||
|
||||
t.expect.reasoning_content = "I'm\nthinking";
|
||||
t.expect.tool_calls = {{
|
||||
/* .name = */ "special_function",
|
||||
/* .arguments = */ R"({"arg1": 1})",
|
||||
/* .id = */ "0",
|
||||
}, {
|
||||
/* .name = */ "special_function_with_opt",
|
||||
/* .arguments = */ R"({"arg1": 1, "arg2": 2})",
|
||||
/* .id = */ "1",
|
||||
}};
|
||||
});
|
||||
|
||||
// Test response format
|
||||
test_peg_parser(tmpls.get(), [&](auto & t) {
|
||||
t.input = "<|think|>I need to output the invoice details in JSON<|end|>"
|
||||
"<|begin|>assistant<|content|>"
|
||||
R"({"amount": 123.45, "date": "2025-12-03"})";
|
||||
|
||||
t.params.json_schema = invoice_schema;
|
||||
|
||||
t.expect.reasoning_content = "I need to output the invoice details in JSON";
|
||||
t.expect.content =R"({"amount": 123.45, "date": "2025-12-03"})";
|
||||
});
|
||||
|
||||
// Test response format no reasoning
|
||||
test_peg_parser(tmpls.get(), [&](auto & t) {
|
||||
t.input = "<|content|>"
|
||||
R"({"amount": 123.45, "date": "2025-12-03"})";
|
||||
|
||||
t.params.chat_template_kwargs["reasoning_effort"] = "\"low\"";
|
||||
t.params.json_schema = invoice_schema;
|
||||
|
||||
t.expect.content =R"({"amount": 123.45, "date": "2025-12-03"})";
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
static void test_msg_diffs_compute() {
|
||||
|
||||
Binary file not shown.
@@ -155,7 +155,7 @@ struct server_slot {
|
||||
double t_prompt_processing; // ms
|
||||
double t_token_generation; // ms
|
||||
|
||||
std::function<void(int /* slot_id */)> callback_on_release;
|
||||
std::function<void(int /* id_slot */)> callback_on_release;
|
||||
|
||||
// Speculative decoding stats
|
||||
int32_t n_draft_total = 0; // Total draft tokens generated
|
||||
@@ -705,6 +705,11 @@ private:
|
||||
params_base.n_cache_reuse = 0;
|
||||
SRV_WRN("%s\n", "cache_reuse is not supported by multimodal, it will be disabled");
|
||||
}
|
||||
|
||||
if (params_base.speculative.type != COMMON_SPECULATIVE_TYPE_NONE) {
|
||||
params_base.speculative.type = COMMON_SPECULATIVE_TYPE_NONE;
|
||||
SRV_WRN("%s\n", "speculative decoding is not supported by multimodal, it will be disabled");
|
||||
}
|
||||
}
|
||||
|
||||
if (!llama_memory_can_shift(llama_get_memory(ctx))) {
|
||||
@@ -754,16 +759,16 @@ private:
|
||||
SRV_ERR("%s\n", "speculative decoding is not supported with multimodal");
|
||||
return false;
|
||||
}
|
||||
SRV_WRN("%s", "speculative decoding context initialized\n");
|
||||
SLT_INF(slot, "%s", "speculative decoding context initialized\n");
|
||||
} else {
|
||||
SRV_WRN("%s", "speculative decoding context not initialized\n");
|
||||
SLT_INF(slot, "%s", "speculative decoding context not initialized\n");
|
||||
}
|
||||
}
|
||||
|
||||
SLT_INF(slot, "new slot, n_ctx = %d\n", slot.n_ctx);
|
||||
|
||||
slot.callback_on_release = [this](int slot_id) {
|
||||
queue_tasks.pop_deferred_task(slot_id);
|
||||
slot.callback_on_release = [this](int id_slot) {
|
||||
queue_tasks.pop_deferred_task(id_slot);
|
||||
};
|
||||
|
||||
slot.reset();
|
||||
@@ -891,6 +896,9 @@ private:
|
||||
}
|
||||
|
||||
server_slot * get_slot_by_id(int id_slot) {
|
||||
// note: allow id_slot to be out of bounds (wrap around)
|
||||
id_slot = id_slot % slots.size();
|
||||
|
||||
for (server_slot & slot : slots) {
|
||||
if (slot.id == id_slot) {
|
||||
return &slot;
|
||||
@@ -1760,7 +1768,7 @@ private:
|
||||
break;
|
||||
}
|
||||
|
||||
int id_slot = task.slot_action.slot_id;
|
||||
const int id_slot = task.slot_action.id_slot;
|
||||
server_slot * slot = get_slot_by_id(id_slot);
|
||||
if (slot == nullptr) {
|
||||
send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
|
||||
@@ -1798,7 +1806,7 @@ private:
|
||||
case SERVER_TASK_TYPE_SLOT_RESTORE:
|
||||
{
|
||||
if (!check_no_mtmd(task.id)) break;
|
||||
int id_slot = task.slot_action.slot_id;
|
||||
const int id_slot = task.slot_action.id_slot;
|
||||
server_slot * slot = get_slot_by_id(id_slot);
|
||||
if (slot == nullptr) {
|
||||
send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
|
||||
@@ -1847,7 +1855,7 @@ private:
|
||||
if (!check_no_mtmd(task.id)) {
|
||||
break;
|
||||
}
|
||||
int id_slot = task.slot_action.slot_id;
|
||||
const int id_slot = task.slot_action.id_slot;
|
||||
server_slot * slot = get_slot_by_id(id_slot);
|
||||
if (slot == nullptr) {
|
||||
send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
|
||||
@@ -3312,7 +3320,7 @@ void server_routes::init_routes() {
|
||||
}
|
||||
|
||||
// TODO: get rid of this dynamic_cast
|
||||
auto res_task = dynamic_cast<server_task_result_metrics*>(result.get());
|
||||
auto * res_task = dynamic_cast<server_task_result_metrics*>(result.get());
|
||||
GGML_ASSERT(res_task != nullptr);
|
||||
|
||||
// optionally return "fail_on_no_slot" error
|
||||
@@ -3335,8 +3343,8 @@ void server_routes::init_routes() {
|
||||
}
|
||||
|
||||
std::string id_slot_str = req.get_param("id_slot");
|
||||
int id_slot;
|
||||
|
||||
int id_slot;
|
||||
try {
|
||||
id_slot = std::stoi(id_slot_str);
|
||||
} catch (const std::exception &) {
|
||||
@@ -3348,14 +3356,16 @@ void server_routes::init_routes() {
|
||||
|
||||
if (action == "save") {
|
||||
return handle_slots_save(req, id_slot);
|
||||
} else if (action == "restore") {
|
||||
return handle_slots_restore(req, id_slot);
|
||||
} else if (action == "erase") {
|
||||
return handle_slots_erase(req, id_slot);
|
||||
} else {
|
||||
res->error(format_error_response("Invalid action", ERROR_TYPE_INVALID_REQUEST));
|
||||
return res;
|
||||
}
|
||||
if (action == "restore") {
|
||||
return handle_slots_restore(req, id_slot);
|
||||
}
|
||||
if (action == "erase") {
|
||||
return handle_slots_erase(req, id_slot);
|
||||
}
|
||||
|
||||
res->error(format_error_response("Invalid action", ERROR_TYPE_INVALID_REQUEST));
|
||||
return res;
|
||||
};
|
||||
|
||||
this->get_props = [this](const server_http_req &) {
|
||||
@@ -3898,7 +3908,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_slots_save(const ser
|
||||
{
|
||||
server_task task(SERVER_TASK_TYPE_SLOT_SAVE);
|
||||
task.id = rd.get_new_id();
|
||||
task.slot_action.slot_id = id_slot;
|
||||
task.slot_action.id_slot = id_slot;
|
||||
task.slot_action.filename = filename;
|
||||
task.slot_action.filepath = filepath;
|
||||
rd.post_task(std::move(task));
|
||||
@@ -3934,7 +3944,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_slots_restore(const
|
||||
{
|
||||
server_task task(SERVER_TASK_TYPE_SLOT_RESTORE);
|
||||
task.id = rd.get_new_id();
|
||||
task.slot_action.slot_id = id_slot;
|
||||
task.slot_action.id_slot = id_slot;
|
||||
task.slot_action.filename = filename;
|
||||
task.slot_action.filepath = filepath;
|
||||
rd.post_task(std::move(task));
|
||||
@@ -3963,7 +3973,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_slots_erase(const se
|
||||
{
|
||||
server_task task(SERVER_TASK_TYPE_SLOT_ERASE);
|
||||
task.id = rd.get_new_id();
|
||||
task.slot_action.slot_id = id_slot;
|
||||
task.slot_action.id_slot = id_slot;
|
||||
rd.post_task(std::move(task));
|
||||
}
|
||||
|
||||
|
||||
@@ -153,7 +153,7 @@ struct server_task {
|
||||
|
||||
// used by SERVER_TASK_TYPE_SLOT_SAVE, SERVER_TASK_TYPE_SLOT_RESTORE, SERVER_TASK_TYPE_SLOT_ERASE
|
||||
struct slot_action {
|
||||
int slot_id;
|
||||
int id_slot;
|
||||
std::string filename;
|
||||
std::string filepath;
|
||||
};
|
||||
|
||||
59
tools/server/webui/package-lock.json
generated
59
tools/server/webui/package-lock.json
generated
@@ -61,7 +61,7 @@
|
||||
"remark-math": "^6.0.0",
|
||||
"sass": "^1.93.3",
|
||||
"storybook": "^10.0.7",
|
||||
"svelte": "^5.0.0",
|
||||
"svelte": "^5.38.2",
|
||||
"svelte-check": "^4.0.0",
|
||||
"tailwind-merge": "^3.3.1",
|
||||
"tailwind-variants": "^3.2.2",
|
||||
@@ -88,6 +88,7 @@
|
||||
"version": "2.3.0",
|
||||
"resolved": "https://registry.npmjs.org/@ampproject/remapping/-/remapping-2.3.0.tgz",
|
||||
"integrity": "sha512-30iZtAPgz+LTIYoeivqYo853f02jBYSd5uGnGpkFV0M3xOt9aN73erkgYAmZU43x4VfqcnLxW9Kpg3R5LC4YYw==",
|
||||
"dev": true,
|
||||
"license": "Apache-2.0",
|
||||
"dependencies": {
|
||||
"@jridgewell/gen-mapping": "^0.3.5",
|
||||
@@ -867,6 +868,7 @@
|
||||
"integrity": "sha512-oJrXtQiAXLvT9clCf1K4kxp3eKsQhIaZqxEyowkBcsvZDdZkbWrVmnGknxs5flTD0VGsxrxKgBCZty1EzoiMzA==",
|
||||
"dev": true,
|
||||
"license": "Apache-2.0",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"@swc/helpers": "^0.5.0"
|
||||
}
|
||||
@@ -898,7 +900,6 @@
|
||||
"version": "2.3.5",
|
||||
"resolved": "https://registry.npmjs.org/@jridgewell/remapping/-/remapping-2.3.5.tgz",
|
||||
"integrity": "sha512-LI9u/+laYG4Ds1TDKSJW2YPrIlcVYOwi2fUC6xB43lueCjgxV4lffOCZCtYFiH6TNOX+tQKXx97T4IKHbhyHEQ==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@jridgewell/gen-mapping": "^0.3.5",
|
||||
@@ -2031,6 +2032,7 @@
|
||||
"integrity": "sha512-rO+YQhHucy47Vh67z318pALmd6x+K1Kj30Fb4a6oOEw4xn4zCo9KTmkMWs24c4oduEXD/eJu3badlRmsVXzyfA==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"ts-dedent": "^2.0.0",
|
||||
"type-fest": "~2.19"
|
||||
@@ -2114,6 +2116,7 @@
|
||||
"integrity": "sha512-Vp3zX/qlwerQmHMP6x0Ry1oY7eKKRcOWGc2P59srOp4zcqyn+etJyQpELgOi4+ZSUgteX8Y387NuwruLgGXLUQ==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"@standard-schema/spec": "^1.0.0",
|
||||
"@sveltejs/acorn-typescript": "^1.0.5",
|
||||
@@ -2153,6 +2156,7 @@
|
||||
"integrity": "sha512-YZs/OSKOQAQCnJvM/P+F1URotNnYNeU3P2s4oIpzm1uFaqUEqRxUB0g5ejMjEb5Gjb9/PiBI5Ktrq4rUUF8UVQ==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"@sveltejs/vite-plugin-svelte-inspector": "^5.0.0",
|
||||
"debug": "^4.4.1",
|
||||
@@ -2568,6 +2572,7 @@
|
||||
"integrity": "sha512-pemlzrSESWbdAloYml3bAJMEfNh1Z7EduzqPKprCH5S341frlpYnUEW0H72dLxa6IsYr+mPno20GiSm+h9dEdQ==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"@babel/code-frame": "^7.10.4",
|
||||
"@babel/runtime": "^7.12.5",
|
||||
@@ -2735,6 +2740,7 @@
|
||||
"integrity": "sha512-bJFoMATwIGaxxx8VJPeM8TonI8t579oRvgAuT8zFugJsJZgzqv0Fu8Mhp68iecjzG7cnN3mO2dJQ5uUM2EFrgQ==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"undici-types": "~6.21.0"
|
||||
}
|
||||
@@ -2802,6 +2808,7 @@
|
||||
"integrity": "sha512-kVIaQE9vrN9RLCQMQ3iyRlVJpTiDUY6woHGb30JDkfJErqrQEmtdWH3gV0PBAfGZgQXoqzXOO0T3K6ioApbbAA==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"@typescript-eslint/scope-manager": "8.37.0",
|
||||
"@typescript-eslint/types": "8.37.0",
|
||||
@@ -3026,6 +3033,7 @@
|
||||
"integrity": "sha512-tJxiPrWmzH8a+w9nLKlQMzAKX/7VjFs50MWgcAj7p9XQ7AQ9/35fByFYptgPELyLw+0aixTnC4pUWV+APcZ/kw==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"@testing-library/dom": "^10.4.0",
|
||||
"@testing-library/user-event": "^14.6.1",
|
||||
@@ -3129,6 +3137,7 @@
|
||||
"integrity": "sha512-oukfKT9Mk41LreEW09vt45f8wx7DordoWUZMYdY/cyAk7w5TWkTRCNZYF7sX7n2wB7jyGAl74OxgwhPgKaqDMQ==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"@vitest/utils": "3.2.4",
|
||||
"pathe": "^2.0.3",
|
||||
@@ -3186,6 +3195,7 @@
|
||||
"resolved": "https://registry.npmjs.org/acorn/-/acorn-8.15.0.tgz",
|
||||
"integrity": "sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg==",
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"bin": {
|
||||
"acorn": "bin/acorn"
|
||||
},
|
||||
@@ -3738,8 +3748,7 @@
|
||||
"resolved": "https://registry.npmjs.org/csstype/-/csstype-3.1.3.tgz",
|
||||
"integrity": "sha512-M1uQkMl8rQK/szD0LNhtqxIPLpimGm8sOBwU7lLnCpSbTyY3yeU1Vc7l4KT5zT4s/yOxHH5O7tIuuLOCnLADRw==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/debug": {
|
||||
"version": "4.4.1",
|
||||
@@ -3840,10 +3849,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/devalue": {
|
||||
"version": "5.3.2",
|
||||
"resolved": "https://registry.npmjs.org/devalue/-/devalue-5.3.2.tgz",
|
||||
"integrity": "sha512-UDsjUbpQn9kvm68slnrs+mfxwFkIflOhkanmyabZ8zOYk8SMEIbJ3TK+88g70hSIeytu4y18f0z/hYHMTrXIWw==",
|
||||
"dev": true,
|
||||
"version": "5.6.2",
|
||||
"resolved": "https://registry.npmjs.org/devalue/-/devalue-5.6.2.tgz",
|
||||
"integrity": "sha512-nPRkjWzzDQlsejL1WVifk5rvcFi/y1onBRxjaFMjZeR9mFpqu2gmAZ9xUB9/IEanEP/vBtGeGganC/GO1fmufg==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/devlop": {
|
||||
@@ -3973,6 +3981,7 @@
|
||||
"dev": true,
|
||||
"hasInstallScript": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"bin": {
|
||||
"esbuild": "bin/esbuild"
|
||||
},
|
||||
@@ -4027,6 +4036,7 @@
|
||||
"integrity": "sha512-QldCVh/ztyKJJZLr4jXNUByx3gR+TDYZCRXEktiZoUR3PGy4qCmSbkxcIle8GEwGpb5JBZazlaJ/CxLidXdEbQ==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"@eslint-community/eslint-utils": "^4.2.0",
|
||||
"@eslint-community/regexpp": "^4.12.1",
|
||||
@@ -6939,6 +6949,7 @@
|
||||
}
|
||||
],
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"nanoid": "^3.3.11",
|
||||
"picocolors": "^1.1.1",
|
||||
@@ -7072,6 +7083,7 @@
|
||||
"integrity": "sha512-I7AIg5boAr5R0FFtJ6rCfD+LFsWHp81dolrFD8S79U9tb8Az2nGrJncnMSnys+bpQJfRUzqs9hnA81OAA3hCuQ==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"bin": {
|
||||
"prettier": "bin/prettier.cjs"
|
||||
},
|
||||
@@ -7088,6 +7100,7 @@
|
||||
"integrity": "sha512-pn1ra/0mPObzqoIQn/vUTR3ZZI6UuZ0sHqMK5x2jMLGrs53h0sXhkVuDcrlssHwIMk7FYrMjHBPoUSyyEEDlBQ==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"peerDependencies": {
|
||||
"prettier": "^3.0.0",
|
||||
"svelte": "^3.2.0 || ^4.0.0-next.0 || ^5.0.0-next.0"
|
||||
@@ -7312,6 +7325,7 @@
|
||||
"integrity": "sha512-FS+XFBNvn3GTAWq26joslQgWNoFu08F4kl0J4CgdNKADkdSGXQyTCnKteIAJy96Br6YbpEU1LSzV5dYtjMkMDg==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"engines": {
|
||||
"node": ">=0.10.0"
|
||||
}
|
||||
@@ -7322,6 +7336,7 @@
|
||||
"integrity": "sha512-Xs1hdnE+DyKgeHJeJznQmYMIBG3TKIHJJT95Q58nHLSrElKlGQqDTR2HQ9fx5CN/Gk6Vh/kupBTDLU11/nDk/g==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"scheduler": "^0.26.0"
|
||||
},
|
||||
@@ -7598,6 +7613,7 @@
|
||||
"integrity": "sha512-4iya7Jb76fVpQyLoiVpzUrsjQ12r3dM7fIVz+4NwoYvZOShknRmiv+iu9CClZml5ZLGb0XMcYLutK6w9tgxHDw==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"@types/estree": "1.0.8"
|
||||
},
|
||||
@@ -7704,6 +7720,7 @@
|
||||
"integrity": "sha512-elOcIZRTM76dvxNAjqYrucTSI0teAF/L2Lv0s6f6b7FOwcwIuA357bIE871580AjHJuSvLIRUosgV+lIWx6Rgg==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"chokidar": "^4.0.0",
|
||||
"immutable": "^5.0.2",
|
||||
@@ -7938,6 +7955,7 @@
|
||||
"integrity": "sha512-7smAu0o+kdm378Q2uIddk32pn0UdIbrtTVU+rXRVtTVTCrK/P2cCui2y4JH+Bl3NgEq1bbBQpCAF/HKrDjk2Qw==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"@storybook/global": "^5.0.0",
|
||||
"@storybook/icons": "^1.6.0",
|
||||
@@ -8079,12 +8097,13 @@
|
||||
}
|
||||
},
|
||||
"node_modules/svelte": {
|
||||
"version": "5.36.12",
|
||||
"resolved": "https://registry.npmjs.org/svelte/-/svelte-5.36.12.tgz",
|
||||
"integrity": "sha512-c3mWT+b0yBLl3gPGSHiy4pdSQCsPNTjLC0tVoOhrGJ6PPfCzD/RQpAmAfJtQZ304CAae2ph+L3C4aqds3R3seQ==",
|
||||
"version": "5.48.3",
|
||||
"resolved": "https://registry.npmjs.org/svelte/-/svelte-5.48.3.tgz",
|
||||
"integrity": "sha512-w7QZ398cdNherTdiQ/v3SYLLGOO4948Jgjh04PYqtTYVohmBvbmFwLmo7pp8gp4/1tceRWfSTjHgjtfpCVNJmQ==",
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"@ampproject/remapping": "^2.3.0",
|
||||
"@jridgewell/remapping": "^2.3.4",
|
||||
"@jridgewell/sourcemap-codec": "^1.5.0",
|
||||
"@sveltejs/acorn-typescript": "^1.0.5",
|
||||
"@types/estree": "^1.0.5",
|
||||
@@ -8092,8 +8111,9 @@
|
||||
"aria-query": "^5.3.1",
|
||||
"axobject-query": "^4.1.0",
|
||||
"clsx": "^2.1.1",
|
||||
"devalue": "^5.6.2",
|
||||
"esm-env": "^1.2.1",
|
||||
"esrap": "^2.1.0",
|
||||
"esrap": "^2.2.1",
|
||||
"is-reference": "^3.0.3",
|
||||
"locate-character": "^3.0.0",
|
||||
"magic-string": "^0.30.11",
|
||||
@@ -8281,9 +8301,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/svelte/node_modules/esrap": {
|
||||
"version": "2.1.0",
|
||||
"resolved": "https://registry.npmjs.org/esrap/-/esrap-2.1.0.tgz",
|
||||
"integrity": "sha512-yzmPNpl7TBbMRC5Lj2JlJZNPml0tzqoqP5B1JXycNUwtqma9AKCO0M2wHrdgsHcy1WRW7S9rJknAMtByg3usgA==",
|
||||
"version": "2.2.2",
|
||||
"resolved": "https://registry.npmjs.org/esrap/-/esrap-2.2.2.tgz",
|
||||
"integrity": "sha512-zA6497ha+qKvoWIK+WM9NAh5ni17sKZKhbS5B3PoYbBvaYHZWoS33zmFybmyqpn07RLUxSmn+RCls2/XF+d0oQ==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@jridgewell/sourcemap-codec": "^1.4.15"
|
||||
@@ -8326,6 +8346,7 @@
|
||||
"integrity": "sha512-gBXpgUm/3rp1lMZZrM/w7D8GKqshif0zAymAhbCyIt8KMe+0v9DQ7cdYLR4FHH/cKpdTXb+A/tKKU3eolfsI+g==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"funding": {
|
||||
"type": "github",
|
||||
"url": "https://github.com/sponsors/dcastil"
|
||||
@@ -8356,7 +8377,8 @@
|
||||
"resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-4.1.11.tgz",
|
||||
"integrity": "sha512-2E9TBm6MDD/xKYe+dvJZAmg3yxIEDNRc0jwlNyDg/4Fil2QcSLjFKGVff0lAf1jjeaArlG/M75Ey/EYr/OJtBA==",
|
||||
"dev": true,
|
||||
"license": "MIT"
|
||||
"license": "MIT",
|
||||
"peer": true
|
||||
},
|
||||
"node_modules/tapable": {
|
||||
"version": "2.2.2",
|
||||
@@ -8569,6 +8591,7 @@
|
||||
"integrity": "sha512-p1diW6TqL9L07nNxvRMM7hMMw4c5XOo/1ibL4aAIGmSAt9slTE1Xgw5KWuof2uTOvCg9BY7ZRi+GaF+7sfgPeQ==",
|
||||
"dev": true,
|
||||
"license": "Apache-2.0",
|
||||
"peer": true,
|
||||
"bin": {
|
||||
"tsc": "bin/tsc",
|
||||
"tsserver": "bin/tsserver"
|
||||
@@ -8934,6 +8957,7 @@
|
||||
"integrity": "sha512-BxAKBWmIbrDgrokdGZH1IgkIk/5mMHDreLDmCJ0qpyJaAteP8NvMhkwr/ZCQNqNH97bw/dANTE9PDzqwJghfMQ==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"esbuild": "^0.25.0",
|
||||
"fdir": "^6.5.0",
|
||||
@@ -9094,6 +9118,7 @@
|
||||
"integrity": "sha512-LUCP5ev3GURDysTWiP47wRRUpLKMOfPh+yKTx3kVIEiu5KOMeqzpnYNsKyOoVrULivR8tLcks4+lga33Whn90A==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"@types/chai": "^5.2.2",
|
||||
"@vitest/expect": "3.2.4",
|
||||
|
||||
@@ -62,7 +62,7 @@
|
||||
"remark-math": "^6.0.0",
|
||||
"sass": "^1.93.3",
|
||||
"storybook": "^10.0.7",
|
||||
"svelte": "^5.0.0",
|
||||
"svelte": "^5.38.2",
|
||||
"svelte-check": "^4.0.0",
|
||||
"tailwind-merge": "^3.3.1",
|
||||
"tailwind-variants": "^3.2.2",
|
||||
|
||||
Reference in New Issue
Block a user