Compare commits

..

9 Commits

Author SHA1 Message Date
0cc4m
05429433a1 examples: add model-backend-compare tool to compare intermediate device tensors with CPU reference 2025-11-25 18:05:56 +01:00
TianHao324
064c90d843 CANN: supports out_prod operator for F32 and F16 (#17406)
Co-authored-by: tianhao <tianhao42@huawei.com>
2025-11-25 17:39:06 +08:00
Pascal
b1846f1c8e webui: add rehype plugin to restore HTML in Markdown table cells (#17477)
* webui: add rehype plugin to restore HTML in Markdown table cells

The remark/rehype pipeline neutralizes inline HTML as literal text
(remarkLiteralHtml) so that XML/HTML snippets in LLM responses display
as-is instead of being rendered. This causes <br> and <ul> markup in
table cells to show as plain text.

This plugin traverses the HAST post-conversion, parses whitelisted HTML
patterns (<br>, <ul><li>) from text nodes, and replaces them with actual
HAST element nodes. For lists, adjacent siblings must be combined first
as the AST fragmentation breaks pattern matching.

Strict validation rejects malformed markup, keeping it as raw text.

* chore: update webui build output
2025-11-25 08:01:02 +01:00
Jeff Bolz
d414db02d3 vulkan: Use fewer rows for scalar FA when HS is not a multiple of 16 (#17455) 2025-11-25 07:11:27 +01:00
Aaron Teo
877566d512 llama: introduce support for model-embedded sampling parameters (#17120) 2025-11-25 09:56:07 +08:00
Jeff Bolz
3d07caa99b vulkan: more FA details in vk_perf_logger (#17443) 2025-11-24 22:25:24 +01:00
Daniel Bevenius
134e6940ca llama : skip output reordering for single token batches (#17466)
This commit adds a check to skip the output reordering logic when
n_outputs == 1. With a single output token, the data is trivially
sorted and the reordering code is currently doing unnecessary work
(resetting and rebuilding output_ids to the same values).

The motivation for this change is improved code clarity and avoiding
confusion when debugging. While the performance impact is probably
negligible, this unnecessary work happens on every decode call in
llama-server when processing batches with single-token outputs.
2025-11-24 21:06:17 +01:00
Jiacheng (Jason) Chen
0543f928a3 HIP: WMMA-MMQ kernels for RDNA 4 (#17156)
* first commit naive test to enable mmq for RDNA4

* adding appropriate WMMA instructions

* git rebase on top of master: fixing the correctness of the mat mul operations, updating layout mappings for RDNA4

* clean up merge conflicts

* add comments and code clean up

* PR clean up, addressed comments

* enable MMQ fallback on RDNA4

* addressed comments: add guards in load generic, separate wmma branch for use_mmq function

* Revert build-xcframework.sh

* Formatting: remove trailing whitespace

* revert CMake files

* clean up after rebase: remove duplicated change, revert cmake files

* clean up after rebase: revert changes from build-xcframework.sh

* clean up: remove extra space line in mma.cuh

* Revert "clean up: remove extra space line in mma.cuh"

This reverts commit b39ed57c45.
2025-11-24 20:00:10 +01:00
Sigbjørn Skjæret
b61de2b2df convert : allow quantizing lora again (#17453) 2025-11-24 15:50:55 +01:00
28 changed files with 1428 additions and 189 deletions

View File

@@ -1232,6 +1232,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
const auto sampler_names = string_split<std::string>(value, ';');
params.sampling.samplers = common_sampler_types_from_names(sampler_names, true);
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS;
}
).set_sparam());
add_opt(common_arg(
@@ -1261,6 +1262,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.sampling.temp = std::stof(value);
params.sampling.temp = std::max(params.sampling.temp, 0.0f);
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TEMP;
}
).set_sparam());
add_opt(common_arg(
@@ -1268,6 +1270,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
string_format("top-k sampling (default: %d, 0 = disabled)", params.sampling.top_k),
[](common_params & params, int value) {
params.sampling.top_k = value;
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_K;
}
).set_sparam());
add_opt(common_arg(
@@ -1275,6 +1278,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sampling.top_p),
[](common_params & params, const std::string & value) {
params.sampling.top_p = std::stof(value);
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_P;
}
).set_sparam());
add_opt(common_arg(
@@ -1282,6 +1286,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sampling.min_p),
[](common_params & params, const std::string & value) {
params.sampling.min_p = std::stof(value);
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIN_P;
}
).set_sparam());
add_opt(common_arg(
@@ -1296,6 +1301,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability),
[](common_params & params, const std::string & value) {
params.sampling.xtc_probability = std::stof(value);
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY;
}
).set_sparam());
add_opt(common_arg(
@@ -1303,6 +1309,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sampling.xtc_threshold),
[](common_params & params, const std::string & value) {
params.sampling.xtc_threshold = std::stof(value);
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD;
}
).set_sparam());
add_opt(common_arg(
@@ -1321,6 +1328,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
params.sampling.penalty_last_n = value;
params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n);
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N;
}
).set_sparam());
add_opt(common_arg(
@@ -1328,6 +1336,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sampling.penalty_repeat),
[](common_params & params, const std::string & value) {
params.sampling.penalty_repeat = std::stof(value);
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT;
}
).set_sparam());
add_opt(common_arg(
@@ -1425,6 +1434,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
"(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sampling.mirostat),
[](common_params & params, int value) {
params.sampling.mirostat = value;
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT;
}
).set_sparam());
add_opt(common_arg(
@@ -1432,6 +1442,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sampling.mirostat_eta),
[](common_params & params, const std::string & value) {
params.sampling.mirostat_eta = std::stof(value);
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA;
}
).set_sparam());
add_opt(common_arg(
@@ -1439,6 +1450,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sampling.mirostat_tau),
[](common_params & params, const std::string & value) {
params.sampling.mirostat_tau = std::stof(value);
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU;
}
).set_sparam());
add_opt(common_arg(

View File

@@ -8,6 +8,7 @@
#include "common.h"
#include "log.h"
#include "llama.h"
#include "sampling.h"
#include <algorithm>
#include <cinttypes>
@@ -949,6 +950,58 @@ std::vector<common_file_info> fs_list_files(const std::string & path) {
// Model utils
//
static inline void common_init_sampler_from_model(
const llama_model * model,
common_params_sampling & sparams) {
const uint64_t config = sparams.user_sampling_config;
auto get_int32 = [&](const char * key, int32_t & dst, uint64_t user_config) {
if (config & user_config) return;
char buf[64] = {0};
if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
char * end = nullptr;
int32_t v = strtol(buf, &end, 10);
if (end && end != buf) dst = v;
}
};
auto get_float = [&](const char * key, float & dst, uint64_t user_config) {
if (config & user_config) return;
char buf[128] = {0};
if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
char * end = nullptr;
float v = strtof(buf, &end);
if (end && end != buf) dst = v;
}
};
// Sampling sequence
if (!(config & common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS)) {
char buf[512] = {0};
if (llama_model_meta_val_str(model, llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE), buf, sizeof(buf)) > 0) {
const std::vector<std::string> sampler_names = string_split<std::string>(std::string(buf), ';');
if (!sampler_names.empty()) {
sparams.samplers = common_sampler_types_from_names(sampler_names, true);
}
}
}
get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TOP_K), sparams.top_k, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_K);
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TOP_P), sparams.top_p, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_P);
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIN_P), sparams.min_p, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIN_P);
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY), sparams.xtc_probability, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY);
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD), sparams.xtc_threshold, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD);
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TEMP), sparams.temp, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TEMP);
get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N), sparams.penalty_last_n, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N);
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT), sparams.penalty_repeat, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT);
get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT), sparams.mirostat, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT);
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU), sparams.mirostat_tau, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU);
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA), sparams.mirostat_eta, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA);
}
struct common_init_result common_init_from_params(common_params & params) {
common_init_result iparams;
auto mparams = common_model_params_to_llama(params);
@@ -960,6 +1013,8 @@ struct common_init_result common_init_from_params(common_params & params) {
return iparams;
}
common_init_sampler_from_model(model, params.sampling);
const llama_vocab * vocab = llama_model_get_vocab(model);
auto cparams = common_context_params_to_llama(params);

View File

@@ -140,6 +140,22 @@ struct common_grammar_trigger {
llama_token token = LLAMA_TOKEN_NULL;
};
enum common_params_sampling_config : uint64_t {
COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS = 1 << 0,
COMMON_PARAMS_SAMPLING_CONFIG_TOP_K = 1 << 1,
COMMON_PARAMS_SAMPLING_CONFIG_TOP_P = 1 << 2,
COMMON_PARAMS_SAMPLING_CONFIG_MIN_P = 1 << 3,
COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY = 1 << 4,
COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD = 1 << 5,
COMMON_PARAMS_SAMPLING_CONFIG_TEMP = 1 << 6,
COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N = 1 << 7,
COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT = 1 << 8,
COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT = 1 << 9,
COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU = 1 << 10,
COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA = 1 << 11,
};
// sampling parameters
struct common_params_sampling {
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
@@ -172,6 +188,8 @@ struct common_params_sampling {
bool no_perf = false; // disable performance metrics
bool timing_per_token = false;
uint64_t user_sampling_config = 0; // bitfield tracking which sampling parameters were set explicitly by the user
std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
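Taken together, these hunks give command-line flags precedence over model-embedded defaults: each parser lambda flips a bit in user_sampling_config, and common_init_sampler_from_model (shown above) only consults a GGUF key while its bit is still clear. A minimal sketch of the pattern — apply_model_temp is a hypothetical helper; the field and enum value are from the hunks above:

    // user-specified values win over model metadata, which wins over built-in defaults
    void apply_model_temp(common_params_sampling & sp, float model_temp) {
        if (sp.user_sampling_config & COMMON_PARAMS_SAMPLING_CONFIG_TEMP) {
            return; // --temp was given on the command line; keep it
        }
        sp.temp = model_temp; // otherwise adopt the model-recommended value
    }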

View File

@@ -565,7 +565,7 @@ class ModelBase:
gguf.MODEL_TENSOR.ALTUP_PREDICT_COEF,
)
)
- or not new_name.endswith(".weight")
+ or new_name[-7:] not in (".weight", ".lora_a", ".lora_b")
):
data_qtype = gguf.GGMLQuantizationType.F32

View File

@@ -242,7 +242,7 @@ def parse_args() -> argparse.Namespace:
help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
)
parser.add_argument(
"--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
"--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f32",
help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
)
parser.add_argument(

View File

@@ -17,6 +17,7 @@ else()
add_subdirectory(batched)
add_subdirectory(embedding)
add_subdirectory(eval-callback)
add_subdirectory(model-backend-compare)
add_subdirectory(gguf-hash)
add_subdirectory(gguf)

View File

@@ -0,0 +1,15 @@
set(TARGET llama-model-backend-compare)
add_executable(${TARGET} model-backend-compare.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
set(TEST_TARGET test-model-backend-compare)
if(NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
add_test(NAME ${TEST_TARGET}
COMMAND llama-model-backend-compare --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
else()
add_test(NAME ${TEST_TARGET}
COMMAND llama-model-backend-compare --hf-repo ggml-org/models --hf-file tinyllamas/stories260K-be.gguf --model stories260K-be.gguf --prompt hello --seed 42 -ngl 0)
endif()
set_property(TEST ${TEST_TARGET} PROPERTY LABELS model-backend-compare curl)

View File

@@ -0,0 +1,384 @@
#include "arg.h"
#include "common.h"
#include "log.h"
#include "common.h"
#include "llama.h"
#include "ggml.h"
#include <vector>
#include <cstdint>
#include <unordered_map>
#include <string>
#include <sstream>
#include <cstring>
#include <cmath>
namespace {
constexpr double nmse_threshold = 1e-2;
struct callback_data {
std::vector<uint8_t> data;
std::vector<float> device_results;
std::unordered_map<std::string, std::vector<float>> cpu_results;
};
bool gather = true;
// normalized mean squared error = mse(a, b) / mse(a, 0)
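// e.g. a = (1, 2), b = (1.1, 2.1): nmse = (0.01 + 0.01) / (1 + 4) = 0.004, well below nmse_threshold
// note: undefined when the reference is all zeros, since mse(a, 0) is then 0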
double nmse(const float * a, const float * b, size_t n) {
double mse_a_b = 0.0;
double mse_a_0 = 0.0;
for (size_t i = 0; i < n; i++) {
float a_i = a[i];
float b_i = b[i];
mse_a_b += (a_i - b_i) * (a_i - b_i);
mse_a_0 += a_i * a_i;
}
return mse_a_b / mse_a_0;
}
void ggml_print_tensor(const ggml_tensor * t, const std::vector<float> & data, int64_t n) {
GGML_ASSERT(n > 0);
float sum = 0;
for (int64_t i3 = 0; i3 < t->ne[3]; i3++) {
for (int64_t i2 = 0; i2 < t->ne[2]; i2++) {
for (int64_t i1 = 0; i1 < t->ne[1]; i1++) {
for (int64_t i0 = 0; i0 < t->ne[0]; i0++) {
const float v = data[i3 * t->ne[2] * t->ne[1] * t->ne[0] + i2 * t->ne[1] * t->ne[0] + i1 * t->ne[0] + i0];
sum += v;
}
}
}
}
for (int64_t i3 = 0; i3 < t->ne[3]; i3++) {
LOG(" [\n");
for (int64_t i2 = 0; i2 < t->ne[2]; i2++) {
if (i2 == n && t->ne[2] > 2*n) {
LOG(" ..., \n");
i2 = t->ne[2] - n;
}
LOG(" [\n");
for (int64_t i1 = 0; i1 < t->ne[1]; i1++) {
if (i1 == n && t->ne[1] > 2*n) {
LOG(" ..., \n");
i1 = t->ne[1] - n;
}
LOG(" [");
for (int64_t i0 = 0; i0 < t->ne[0]; i0++) {
if (i0 == n && t->ne[0] > 2*n) {
LOG("..., ");
i0 = t->ne[0] - n;
}
const float v = data[i3 * t->ne[2] * t->ne[1] * t->ne[0] + i2 * t->ne[1] * t->ne[0] + i1 * t->ne[0] + i0];
LOG("%12.4f", v);
if (i0 < t->ne[0] - 1) LOG(", ");
}
LOG("],\n");
}
LOG(" ],\n");
}
LOG(" ]\n");
LOG(" sum = %f\n", sum);
}
if (std::isnan(sum)) {
LOG_ERR("encountered NaN - aborting\n");
exit(1);
}
}
inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
union {
float f;
uint32_t i;
} u;
u.i = (uint32_t)h.bits << 16;
return u.f;
}
float to_float(const uint8_t * ptr, ggml_type type) {
switch (type) {
case GGML_TYPE_F32:
return *(const float *)ptr;
case GGML_TYPE_F16:
return ggml_fp16_to_fp32(*(const ggml_fp16_t *)ptr);
case GGML_TYPE_BF16:
return ggml_compute_bf16_to_fp32(*(const ggml_bf16_t *)ptr);
case GGML_TYPE_I8:
return static_cast<float>(*(const int8_t *)ptr);
case GGML_TYPE_I16:
return static_cast<float>(*(const int16_t *)ptr);
case GGML_TYPE_I32:
return static_cast<float>(*(const int32_t *)ptr);
case GGML_TYPE_I64:
return static_cast<float>(*(const int64_t *)ptr);
default:
GGML_ABORT("unsupported ggml_type %d in to_float", type);
}
return 0.0f;
}
void tensor_to_float_array(const ggml_tensor * t, const void * data, std::vector<float> & out) {
const size_t n_elements = ggml_nelements(t);
out.resize(n_elements);
// convert to float
size_t idx = 0;
for (int64_t i3 = 0; i3 < t->ne[3]; i3++) {
for (int64_t i2 = 0; i2 < t->ne[2]; i2++) {
for (int64_t i1 = 0; i1 < t->ne[1]; i1++) {
if (!ggml_is_quantized(t->type)) {
for (int64_t i0 = 0; i0 < t->ne[0]; i0++) {
const uint8_t * ptr = ((const uint8_t *)data) + i3 * t->nb[3] + i2 * t->nb[2] + i1 * t->nb[1] + i0 * t->nb[0];
out[idx] = to_float(ptr, t->type);
idx++;
}
} else {
GGML_ABORT("quantized types are not supported in tensor_to_float_array");
}
}
}
}
}
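// layout-only ops (reshape/view/permute/transpose) produce no new data, so they are skipped along with truly empty nodes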
bool tensor_is_empty(ggml_tensor * node) {
return ggml_is_empty(node) || node->op == GGML_OP_NONE || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE;
}
std::string remove_device_from_name(const std::string & name) {
// Remove prefix and suffix
// Example: Vulkan0#inp_embd#0 -> inp_embd
size_t start = name.find_first_of('#');
size_t end = name.find_last_of('#');
if (start != std::string::npos && end != std::string::npos &&
end > start) {
return name.substr(start + 1, end - start - 1);
}
return name;
}
std::string tensor_name(ggml_tensor * t) {
const std::string tname(t->name, strnlen(t->name, GGML_MAX_NAME));
std::stringstream ss;
ss << tname << "[";
// Get last source
size_t last_src = 0;
for (size_t i = 0; i < GGML_MAX_SRC; i++) {
if (t->src[i]) {
last_src = i;
}
}
for (size_t i = 0; i < GGML_MAX_SRC; i++) {
if (t->src[i]) {
const std::string src_name(t->src[i]->name, strnlen(t->src[i]->name, GGML_MAX_NAME));
ss << remove_device_from_name(src_name);
if (i < last_src) {
ss << ", ";
}
}
}
ss << "]";
return ss.str();
}
bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
auto * cb_data = (callback_data *) user_data;
if (ask || tensor_is_empty(t)) {
return true; // Always retrieve data
}
const std::string name = tensor_name(t);
if (gather) {
// CPU data should be host-visible
GGML_ASSERT(ggml_backend_buffer_is_host(t->buffer));
// Make sure this tensor does not exist yet
if (cb_data->cpu_results.find(name) != cb_data->cpu_results.end()) {
LOG_ERR("%s : tensor '%s' already exists in CPU reference data\n", __func__, name.c_str());
GGML_ABORT("fatal error");
}
std::vector<float>& result = cb_data->cpu_results[name];
// LOG("gathering CPU reference data for tensor '%s'\n", name.c_str());
// for (size_t i = 0; i < GGML_MAX_DIMS; i++) {
// LOG(" ne[%zu] = %lld\n", i, t->ne[i]);
// }
// for (size_t i = 0; i < GGML_MAX_SRC; i++) {
// if (t->src[i]) {
// const std::string src_name(t->src[i]->name, strnlen(t->src[i]->name, GGML_MAX_NAME));
// LOG(" src[%zu] = %s\n", i, src_name.c_str());
// }
// }
tensor_to_float_array(t, t->data, result);
return true;
}
// Compare with CPU data if available
auto it = cb_data->cpu_results.find(name);
if (it == cb_data->cpu_results.end()) {
LOG_ERR("no CPU reference data for tensor '%s'\n", name.c_str());
return true;
}
const bool is_host = ggml_backend_buffer_is_host(t->buffer);
const size_t n_bytes = ggml_nbytes(t);
const uint8_t * data;
if (!is_host) {
if (cb_data->data.size() < n_bytes) {
cb_data->data.resize(n_bytes);
}
ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
data = cb_data->data.data();
} else {
data = (const uint8_t *) t->data;
}
tensor_to_float_array(t, data, cb_data->device_results);
const std::vector<float>& ref_data = it->second;
double error = nmse(ref_data.data(), cb_data->device_results.data(), ref_data.size());
if (error > nmse_threshold) {
LOG_ERR("nmse = %.12f tensor '%s' op=%s\n", error, name.c_str(), ggml_op_name(t->op));
LOG_ERR(" ne: ");
for (int i = 0; i < GGML_MAX_DIMS; i++) {
LOG_ERR("%ld ", t->ne[i]);
}
LOG_ERR("\n nb: ");
for (int i = 0; i < GGML_MAX_DIMS; i++) {
LOG_ERR("%zu ", t->nb[i]);
}
LOG_ERR("\n\n");
for (int i = 0; i < GGML_MAX_SRC; i++) {
if (t->src[i]) {
const std::string src_name(t->src[i]->name, strnlen(t->src[i]->name, GGML_MAX_NAME));
LOG_ERR(" src%d: %s\n", i, src_name.c_str());
}
}
LOG_ERR("CPU reference data for tensor '%s':\n", name.c_str());
ggml_print_tensor(t, ref_data, 2);
LOG_ERR("Device data for tensor '%s':\n", name.c_str());
ggml_print_tensor(t, cb_data->device_results, 2);
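// returning false from the scheduler eval callback aborts the remaining graph computation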
return false;
} else {
LOG("nmse = %.12f tensor '%s' op = %s\n", error, name.c_str(), ggml_op_name(t->op));
}
return true;
}
bool run(llama_context * ctx, const common_params & params) {
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
const bool add_bos = llama_vocab_get_add_bos(vocab);
std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos);
if (tokens.empty()) {
LOG_ERR("%s : there are not input tokens to process - (try to provide a prompt with '-p')\n", __func__);
return false;
}
if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
LOG_ERR("%s : failed to eval\n", __func__);
return false;
}
return true;
}
} // namespace
int main(int argc, char ** argv) {
callback_data cb_data;
common_params params;
params.prompt = "The quick brown fox";
params.sampling.seed = 1234;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
return 1;
}
common_init();
llama_backend_init();
// pass the callback to the backend scheduler
// it will be executed for each node during the graph computation
params.cb_eval = ggml_debug;
params.cb_eval_user_data = &cb_data;
params.warmup = false;
params.split_mode = LLAMA_SPLIT_MODE_NONE;
const size_t n_dev = ggml_backend_dev_count();
for (size_t i = 0; i < n_dev * 2; i++) {
ggml_backend_dev_t device = ggml_backend_dev_get(i % ggml_backend_dev_count());
// Run CPU-only first to gather reference results
if ((i < n_dev && ggml_backend_dev_type(device) != GGML_BACKEND_DEVICE_TYPE_CPU) ||
(i >= n_dev && ggml_backend_dev_type(device) == GGML_BACKEND_DEVICE_TYPE_CPU)) {
continue;
}
params.devices.clear();
params.devices.push_back(device);
if (i < n_dev) {
LOG_INF("=== Running on device %zu (gathering reference results) ===\n", i);
gather = true;
} else {
LOG_INF("=== Running on device %zu ===\n", i - n_dev);
gather = false;
}
// init
common_init_result llama_init = common_init_from_params(params);
llama_model * model = llama_init.model.get();
llama_context * ctx = llama_init.context.get();
if (model == nullptr || ctx == nullptr) {
LOG_ERR("%s : failed to init\n", __func__);
return 1;
}
// print system information
{
LOG_INF("\n");
LOG_INF("%s\n", common_params_get_system_info(params).c_str());
LOG_INF("\n");
}
bool OK = run(ctx, params);
if (!OK) {
return 1;
}
LOG("\n");
llama_perf_context_print(ctx);
llama_backend_free();
}
return 0;
}

View File

@@ -42,6 +42,7 @@
#include <aclnnop/aclnn_exp.h>
#include <aclnnop/aclnn_fill_scalar.h>
#include <aclnnop/aclnn_fused_infer_attention_score_v2.h>
#include <aclnnop/aclnn_ger.h>
#include <aclnnop/aclnn_group_norm.h>
#include <aclnnop/aclnn_grouped_matmul_v3.h>
#include <aclnnop/aclnn_gt_scalar.h>
@@ -3236,3 +3237,64 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context & ctx, ggml_tensor * dst
GGML_ABORT("Function is not implemented.");
}
}
static void ggml_cann_out_prod_fp(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
ggml_tensor * src0 = dst->src[0]; // weight
ggml_tensor * src1 = dst->src[1]; // input
GGML_TENSOR_BINARY_OP_LOCALS
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceZero, acl_dst.get());
const int64_t dps2 = ne2 / ne02;
const int64_t dps3 = ne3 / ne03;
for (int64_t i3 = 0; i3 < ne3; i3++) {
for (int64_t i2 = 0; i2 < ne2; i2++) {
const int64_t i02 = i2 / dps2;
const int64_t i03 = i3 / dps3;
const int64_t i12 = i2;
const int64_t i13 = i3;
acl_tensor_ptr accumulator =
ggml_cann_create_tensor((char *) dst->data + i2 * nb2 + i3 * nb3, ggml_cann_type_mapping(dst->type),
ggml_type_size(dst->type), dst->ne, dst->nb, 2);
// The outer product needs to be accumulated in this dimension.
for (int64_t i1 = 0; i1 < ne11; i1++) {
acl_tensor_ptr acl_input = ggml_cann_create_tensor(
(char *) src1->data + i1 * nb11 + i12 * nb12 + i13 * nb13, ggml_cann_type_mapping(src0->type),
ggml_type_size(src0->type), src1->ne, src1->nb, 1);
acl_tensor_ptr acl_weight = ggml_cann_create_tensor(
(char *) src0->data + i1 * nb01 + i02 * nb02 + i03 * nb03, ggml_cann_type_mapping(src0->type),
ggml_type_size(src0->type), src0->ne, src0->nb, 1);
ggml_cann_pool_alloc output_allocator(ctx.pool());
void * output_buffer = output_allocator.alloc(ggml_nbytes(dst));
acl_tensor_ptr acl_out = ggml_cann_create_tensor(output_buffer, ggml_cann_type_mapping(dst->type),
ggml_type_size(dst->type), dst->ne, dst->nb, 2);
GGML_CANN_CALL_ACLNN_OP(ctx, Ger, acl_input.get(), acl_weight.get(), acl_out.get());
float alpha_value = 1.0f;
aclScalar * alpha = aclCreateScalar(&alpha_value, ACL_FLOAT);
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdd, accumulator.get(), acl_out.get(), alpha);
}
}
}
}
void ggml_cann_out_prod(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
ggml_tensor * src0 = dst->src[0];
const enum ggml_type type = src0->type;
switch (type) {
case GGML_TYPE_F32:
case GGML_TYPE_F16:
ggml_cann_out_prod_fp(ctx, dst);
break;
default:
GGML_ABORT("Unsupport type for GGML_OP_OUT_PROD");
break;
}
}
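For reference, what the per-row Ger calls above accumulate matches the formula in the header comment (dst[i,j,k,l] = sum_m src0[i,m,k,l] * src1[j,m,k,l]). A hedged CPU sketch of the contiguous 2-D float case, editor-written and not part of the patch:

    // dst[i, j] = sum_m src0[i, m] * src1[j, m]   (index 0 is the fastest-varying, as in ggml)
    static void out_prod_ref_2d(const float * src0, const float * src1, float * dst,
                                int64_t ne0, int64_t ne1, int64_t nem) {
        for (int64_t j = 0; j < ne1; j++) {
            for (int64_t i = 0; i < ne0; i++) {
                float acc = 0.0f;
                for (int64_t m = 0; m < nem; m++) {
                    acc += src0[i + m * ne0] * src1[j + m * ne1];
                }
                dst[i + j * ne0] = acc;
            }
        }
    }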

View File

@@ -1125,3 +1125,23 @@ void ggml_cann_op_unary_gated(std::function<void(ggml_backend_cann_context &, ac
} while (0)
#endif // CANN_ACLNN_OPS
/**
* @brief Performs outer product operation on two ggml tensors using the CANN backend.
*
* @details This function computes the outer product of two input tensors (src0 and src1)
* and stores the result in the destination tensor. The outer product operation is defined as:
* dst[i,j,k,l] = sum_m (src0[i,m,k,l] * src1[j,m,k,l])
*
* The function supports the floating-point types F32 and F16. The result is
* built by accumulating per-row outer products through the Ger operator.
*
* The implementation handles 4D tensor broadcasting and batch processing automatically.
*
* @param ctx The CANN backend context for operation execution and memory management.
* @param dst The destination ggml_tensor where the outer product result will be stored.
* The input tensors are assumed to be `dst->src[0]` and `dst->src[1]`.
*
* @see GGML_CANN_CALL_ACLNN_OP for CANN operator invocation
*/
void ggml_cann_out_prod(ggml_backend_cann_context & ctx, ggml_tensor * dst);

View File

@@ -1886,6 +1886,9 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context & ctx, struct gg
case GGML_OP_FLASH_ATTN_EXT:
ggml_cann_flash_attn_ext(ctx, dst);
break;
case GGML_OP_OUT_PROD:
ggml_cann_out_prod(ctx, dst);
break;
default:
return false;
}
@@ -2563,6 +2566,16 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
case GGML_OP_PAD_REFLECT_1D:
case GGML_OP_COUNT_EQUAL:
return true;
case GGML_OP_OUT_PROD:
{
switch (op->src[0]->type) {
case GGML_TYPE_F16:
case GGML_TYPE_F32:
return true;
default:
return false;
}
}
case GGML_OP_CONV_TRANSPOSE_1D:
// TODO: ((weightL - 1) * dilationW - padLeft)=1336 should not be larger than 255.
return (op->src[0]->ne[0] - 1) <= 255;

View File

@@ -73,34 +73,7 @@ namespace ggml_cuda_mma {
static constexpr int I = I_;
static constexpr int J = J_;
#if defined(GGML_USE_HIP)
- #if defined(RDNA4)
- static constexpr int ne = I * J / 32;
- T x[ne] = {0};
- static constexpr __device__ bool supported() {
- if (I == 16 && J == 16) return true;
- return false;
- }
- static __device__ __forceinline__ int get_i(const int l) {
- if constexpr (I == 16 && J == 16) {
- return 8 * (threadIdx.x / 16) + l;
- } else {
- NO_DEVICE_CODE;
- return -1;
- }
- }
- static __device__ __forceinline__ int get_j(const int l) {
- if constexpr (I == 16 && J == 16) {
- return threadIdx.x % 16;
- } else {
- NO_DEVICE_CODE;
- return -1;
- }
- }
- #else
#if defined(AMD_MFMA_AVAILABLE)
static constexpr int ne = I * J / 64;
T x[ne] = {0};
@@ -146,7 +119,6 @@ namespace ggml_cuda_mma {
return -1;
}
}
- #endif // defined(RDNA4)
#elif __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
static constexpr int ne = I * J / 32;
T x[ne] = {0};
@@ -177,6 +149,34 @@ namespace ggml_cuda_mma {
return -1;
}
}
+ #elif defined(AMD_WMMA_AVAILABLE)
+ #if defined(RDNA4)
+ static constexpr int ne = I * J / 32;
+ T x[ne] = {0};
+ static constexpr __device__ bool supported() {
+ if (I == 16 && J == 16) return true;
+ return false;
+ }
+ static __device__ __forceinline__ int get_i(const int l) {
+ if constexpr (I == 16 && J == 16) {
+ return 8 * (threadIdx.x / 16) + l;
+ } else {
+ NO_DEVICE_CODE;
+ return -1;
+ }
+ }
+ static __device__ __forceinline__ int get_j(const int l) {
+ if constexpr (I == 16 && J == 16) {
+ return threadIdx.x % 16;
+ } else {
+ NO_DEVICE_CODE;
+ return -1;
+ }
+ }
+ #endif
#else
static constexpr int ne = I * J / 32;
T x[ne] = {0};
@@ -437,7 +437,20 @@ namespace ggml_cuda_mma {
xi[0] = xs[0];
}
#elif defined(AMD_WMMA_AVAILABLE)
- ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
+ if constexpr (I == 16 && J == 4) {
+ int64_t * xi = (int64_t *) t.x;
+ const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 2 * (threadIdx.x / t.I));
+ xi[0] = xs[0];
+ } else if constexpr (I == 16 && J == 8) {
+ int64_t * xi = (int64_t *) t.x;
+ const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 4 * (threadIdx.x / t.I));
+ xi[0] = xs[0];
+ const int64_t * xs1 = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 4 * (threadIdx.x / t.I) + 2);
+ xi[1] = xs1[0];
+ } else {
+ NO_DEVICE_CODE;
+ }
#else
#pragma unroll
for (int l = 0; l < t.ne; ++l) {
@@ -772,6 +785,36 @@ namespace ggml_cuda_mma {
acc[0],
0, 0, 0);
#endif // defined(CDNA3)
#elif defined(AMD_WMMA_AVAILABLE)
using int32x2_t = __attribute__((__vector_size__(2 * sizeof(int)))) int;
int32x2_t * a_vec = (int32x2_t *) A.x;
int32x2_t * b_vec = (int32x2_t *) B.x;
using int32x8_t = __attribute__((__vector_size__(8 * sizeof(int)))) int;
int32x8_t * acc = (int32x8_t *) D.x;
#if defined(RDNA4)
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(
true,
a_vec[0],
true,
b_vec[0],
acc[0],
true
);
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(
true,
a_vec[1],
true,
b_vec[1],
acc[0],
true
);
#endif // defined(RDNA4)
#else
GGML_UNUSED_VARS(D, A, B);
NO_DEVICE_CODE;
@@ -798,6 +841,7 @@ namespace ggml_cuda_mma {
acc[0],
0, 0, 0);
#endif // defined(CDNA3)
#else
GGML_UNUSED_VARS(D, A, B);
NO_DEVICE_CODE;
@@ -842,4 +886,31 @@ namespace ggml_cuda_mma {
mma(D16[1], A16[1], B);
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
}
static __device__ __forceinline__ void mma(
tile<16, 16, int> & D, const tile<16, 4, int> & A, const tile<16, 4, int> & B) {
#if defined(AMD_WMMA_AVAILABLE)
using int32x2_t = __attribute__((__vector_size__(2 * sizeof(int)))) int;
int32x2_t * a_vec = (int32x2_t *) A.x;
int32x2_t * b_vec = (int32x2_t *) B.x;
using int32x8_t = __attribute__((__vector_size__(8 * sizeof(int)))) int;
int32x8_t * acc = (int32x8_t *) D.x;
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(
true,
a_vec[0],
true,
b_vec[0],
acc[0],
false
);
#else
GGML_UNUSED(D);
GGML_UNUSED(A);
GGML_UNUSED(B);
NO_DEVICE_CODE;
#endif
}
}

View File

@@ -306,5 +306,11 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
return false;
}
- return (!GGML_CUDA_CC_IS_RDNA4(cc) && !GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
+ if (amd_wmma_available(cc)) {
+ if (GGML_CUDA_CC_IS_RDNA4(cc)) {
+ return true;
+ }
+ }
+ return (!GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
}

File diff suppressed because it is too large

View File

@@ -1629,6 +1629,22 @@ class vk_perf_logger {
timings[name].push_back(time);
return;
}
if (node->op == GGML_OP_FLASH_ATTN_EXT) {
const ggml_tensor * dst = node;
const ggml_tensor * q = node->src[0];
const ggml_tensor * k = node->src[1];
const ggml_tensor * v = node->src[2];
const ggml_tensor * m = node->src[3];
std::stringstream name;
name << ggml_op_name(node->op) <<
" dst(" << dst->ne[0] << "," << dst->ne[1] << "," << dst->ne[2] << "," << dst->ne[3] << "), " <<
" q(" << q->ne[0] << "," << q->ne[1] << "," << q->ne[2] << "," << q->ne[3] << "), " <<
" k(" << k->ne[0] << "," << k->ne[1] << "," << k->ne[2] << "," << k->ne[3] << "), " <<
" v(" << v->ne[0] << "," << v->ne[1] << "," << v->ne[2] << "," << v->ne[3] << "), " <<
" m(" << (m?m->ne[0]:0) << "," << (m?m->ne[1]:0) << "," << (m?m->ne[2]:0) << "," << (m?m->ne[3]:0) << ")";
timings[name.str()].push_back(time);
return;
}
timings[ggml_op_name(node->op)].push_back(time);
}
private:
@@ -2485,9 +2501,11 @@ static void ggml_vk_wait_events(vk_context& ctx, std::vector<vk::Event>&& events
static constexpr uint32_t flash_attention_num_small_rows = 32;
static constexpr uint32_t scalar_flash_attention_num_small_rows = 1;
- static uint32_t get_fa_scalar_num_large_rows(uint32_t hsv) {
+ static uint32_t get_fa_scalar_num_large_rows(uint32_t hsk, uint32_t hsv) {
if (hsv >= 192) {
return 2;
+ } else if ((hsv | hsk) & 8) {
+ return 4;
} else {
return 8;
}
@@ -2519,9 +2537,9 @@ static std::array<uint32_t, 2> fa_rows_cols(FaCodePath path, uint32_t hsk, uint3
if ((hsv | hsk) & 8) {
// HSV/HSK not being a multiple of 16 makes D_split smaller, which makes cols_per_iter
// larger, and Bc needs to be >= cols_per_thread. 64 is large enough, 32 is not.
- return {get_fa_scalar_num_large_rows(hsv), 64};
+ return {get_fa_scalar_num_large_rows(hsk, hsv), 64};
} else {
- return {get_fa_scalar_num_large_rows(hsv), 32};
+ return {get_fa_scalar_num_large_rows(hsk, hsv), 32};
}
}
}
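The new (hsv | hsk) & 8 test is true exactly when HSK or HSV has bit 3 set, i.e. is an odd multiple of 8 and hence not a multiple of 16 — the case named in the commit title (e.g. the 72-wide heads exercised in test-backend-ops below). A standalone sanity check, editor-written:

    #include <cstdio>
    #include <cstdint>

    int main() {
        for (uint32_t hs : {64u, 72u, 80u, 128u}) {
            // bit 3 set <=> hs % 16 == 8 for the multiple-of-8 head sizes used by FA
            std::printf("hs = %3u -> reduced rows: %s\n", hs, (hs & 8) ? "yes" : "no");
        }
        return 0;
    }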
@@ -7724,7 +7742,7 @@ static bool ggml_vk_flash_attn_scalar_shmem_support(const vk_device& device, con
// Needs to be kept up to date on shader changes
GGML_UNUSED(hsv);
const uint32_t wg_size = scalar_flash_attention_workgroup_size;
- const uint32_t Br = get_fa_scalar_num_large_rows(hsv);
+ const uint32_t Br = get_fa_scalar_num_large_rows(hsk, hsv);
const uint32_t Bc = scalar_flash_attention_Bc;
const uint32_t tmpsh = wg_size * sizeof(float);
@@ -7855,7 +7873,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
case FA_SCALAR:
case FA_COOPMAT1:
// We may switch from coopmat1 to scalar, so use the scalar limit for both
- max_gqa = get_fa_scalar_num_large_rows(HSV);
+ max_gqa = get_fa_scalar_num_large_rows(HSK, HSV);
break;
case FA_COOPMAT2:
max_gqa = get_fa_num_small_rows(FA_COOPMAT2);

View File

@@ -25,6 +25,20 @@ class Keys:
ALIGNMENT = "general.alignment"
FILE_TYPE = "general.file_type"
# Recommended Sampler Parameters
SAMPLING_SEQUENCE = "general.sampling.sequence"
SAMPLING_TOP_K = "general.sampling.top_k"
SAMPLING_TOP_P = "general.sampling.top_p"
SAMPLING_MIN_P = "general.sampling.min_p"
SAMPLING_XTC_PROBABILITY = "general.sampling.xtc_probability"
SAMPLING_XTC_THRESHOLD = "general.sampling.xtc_threshold"
SAMPLING_TEMP = "general.sampling.temp"
SAMPLING_PENALTY_LAST_N = "general.sampling.penalty_last_n"
SAMPLING_PENALTY_REPEAT = "general.sampling.penalty_repeat"
SAMPLING_MIROSTAT = "general.sampling.mirostat"
SAMPLING_MIROSTAT_TAU = "general.sampling.mirostat_tau"
SAMPLING_MIROSTAT_ETA = "general.sampling.mirostat_eta"
# Authorship Metadata
NAME = "general.name"
AUTHOR = "general.author"

View File

@@ -496,6 +496,42 @@ class GGUFWriter:
def add_file_type(self, ftype: int) -> None:
self.add_uint32(Keys.General.FILE_TYPE, ftype)
def add_sampling_sequence(self, sequence: str) -> None:
self.add_string(Keys.General.SAMPLING_SEQUENCE, sequence)
def add_sampling_top_k(self, top_k: int) -> None:
self.add_int32(Keys.General.SAMPLING_TOP_K, top_k)
def add_sampling_top_p(self, top_p: float) -> None:
self.add_float32(Keys.General.SAMPLING_TOP_P, top_p)
def add_sampling_min_p(self, min_p: float) -> None:
self.add_float32(Keys.General.SAMPLING_MIN_P, min_p)
def add_sampling_xtc_probability(self, xtc_probability: float) -> None:
self.add_float32(Keys.General.SAMPLING_XTC_PROBABILITY, xtc_probability)
def add_sampling_xtc_threshold(self, xtc_threshold: float) -> None:
self.add_float32(Keys.General.SAMPLING_XTC_THRESHOLD, xtc_threshold)
def add_sampling_temp(self, temp: float) -> None:
self.add_float32(Keys.General.SAMPLING_TEMP, temp)
def add_sampling_penalty_last_n(self, penalty_last_n: int) -> None:
self.add_int32(Keys.General.SAMPLING_PENALTY_LAST_N, penalty_last_n)
def add_sampling_penalty_repeat(self, penalty_repeat: float) -> None:
self.add_float32(Keys.General.SAMPLING_PENALTY_REPEAT, penalty_repeat)
def add_sampling_mirostat(self, mirostat: int) -> None:
self.add_int32(Keys.General.SAMPLING_MIROSTAT, mirostat)
def add_sampling_mirostat_tau(self, mirostat_tau: float) -> None:
self.add_float32(Keys.General.SAMPLING_MIROSTAT_TAU, mirostat_tau)
def add_sampling_mirostat_eta(self, mirostat_eta: float) -> None:
self.add_float32(Keys.General.SAMPLING_MIROSTAT_ETA, mirostat_eta)
def add_name(self, name: str) -> None:
self.add_string(Keys.General.NAME, name)

View File

@@ -17,6 +17,20 @@ logger = logging.getLogger("metadata")
@dataclass
class Metadata:
# Recommended Sampler Parameters to be written to GGUF KV Store
sampling_sequence: Optional[str] = None
sampling_top_k: Optional[int] = None
sampling_top_p: Optional[float] = None
sampling_min_p: Optional[float] = None
sampling_xtc_probability: Optional[float] = None
sampling_xtc_threshold: Optional[float] = None
sampling_temp: Optional[float] = None
sampling_penalty_last_n: Optional[int] = None
sampling_penalty_repeat: Optional[float] = None
sampling_mirostat: Optional[int] = None
sampling_mirostat_tau: Optional[float] = None
sampling_mirostat_eta: Optional[float] = None
# Authorship Metadata to be written to GGUF KV Store
name: Optional[str] = None
author: Optional[str] = None
@@ -54,15 +68,43 @@ class Metadata:
model_card = Metadata.load_model_card(model_path)
hf_params = Metadata.load_hf_parameters(model_path)
gen_config = Metadata.load_generation_config(model_path)
# TODO: load adapter_config.json when possible, it usually contains the base model of the LoRA adapter
# heuristics
metadata = Metadata.apply_metadata_heuristic(metadata, model_card, hf_params, model_path, total_params)
if gen_config:
metadata.sampling_sequence = gen_config.get("sequence", metadata.sampling_sequence)
metadata.sampling_top_k = gen_config.get("top_k", metadata.sampling_top_k)
metadata.sampling_top_p = gen_config.get("top_p", metadata.sampling_top_p)
metadata.sampling_min_p = gen_config.get("min_p", metadata.sampling_min_p)
metadata.sampling_xtc_probability = gen_config.get("xtc_probability", metadata.sampling_xtc_probability)
metadata.sampling_xtc_threshold = gen_config.get("xtc_threshold", metadata.sampling_xtc_threshold)
metadata.sampling_temp = gen_config.get("temperature", metadata.sampling_temp)
metadata.sampling_penalty_last_n = gen_config.get("penalty_last_n", metadata.sampling_penalty_last_n)
metadata.sampling_penalty_repeat = gen_config.get("penalty_repeat", metadata.sampling_penalty_repeat)
metadata.sampling_mirostat = gen_config.get("mirostat", metadata.sampling_mirostat)
metadata.sampling_mirostat_tau = gen_config.get("mirostat_tau", metadata.sampling_mirostat_tau)
metadata.sampling_mirostat_eta = gen_config.get("mirostat_eta", metadata.sampling_mirostat_eta)
# Metadata Override File Provided
# This is based on LLM_KV_NAMES mapping in llama.cpp
metadata_override = Metadata.load_metadata_override(metadata_override_path)
metadata.sampling_sequence = metadata_override.get(Keys.General.SAMPLING_SEQUENCE, metadata.sampling_sequence)
metadata.sampling_top_k = metadata_override.get(Keys.General.SAMPLING_TOP_K, metadata.sampling_top_k)
metadata.sampling_top_p = metadata_override.get(Keys.General.SAMPLING_TOP_P, metadata.sampling_top_p)
metadata.sampling_min_p = metadata_override.get(Keys.General.SAMPLING_MIN_P, metadata.sampling_min_p)
metadata.sampling_xtc_probability = metadata_override.get(Keys.General.SAMPLING_XTC_PROBABILITY, metadata.sampling_xtc_probability)
metadata.sampling_xtc_threshold = metadata_override.get(Keys.General.SAMPLING_XTC_THRESHOLD, metadata.sampling_xtc_threshold)
metadata.sampling_temp = metadata_override.get(Keys.General.SAMPLING_TEMP, metadata.sampling_temp)
metadata.sampling_penalty_last_n = metadata_override.get(Keys.General.SAMPLING_PENALTY_LAST_N, metadata.sampling_penalty_last_n)
metadata.sampling_penalty_repeat = metadata_override.get(Keys.General.SAMPLING_PENALTY_REPEAT, metadata.sampling_penalty_repeat)
metadata.sampling_mirostat = metadata_override.get(Keys.General.SAMPLING_MIROSTAT, metadata.sampling_mirostat)
metadata.sampling_mirostat_tau = metadata_override.get(Keys.General.SAMPLING_MIROSTAT_TAU, metadata.sampling_mirostat_tau)
metadata.sampling_mirostat_eta = metadata_override.get(Keys.General.SAMPLING_MIROSTAT_ETA, metadata.sampling_mirostat_eta)
metadata.name = metadata_override.get(Keys.General.NAME, metadata.name)
metadata.author = metadata_override.get(Keys.General.AUTHOR, metadata.author)
metadata.version = metadata_override.get(Keys.General.VERSION, metadata.version)
@@ -172,6 +214,23 @@ class Metadata:
with open(config_path, "r", encoding="utf-8") as f:
return json.load(f)
@staticmethod
def load_generation_config(model_path: Optional[Path] = None) -> dict[str, Any]:
if model_path is None or not model_path.is_dir():
return {}
generation_config_path = model_path / "generation_config.json"
if not generation_config_path.is_file():
return {}
try:
with open(generation_config_path, "r", encoding="utf-8") as f:
return json.load(f)
except (json.JSONDecodeError, IOError):
# not all models have valid generation_config.json
return {}
@staticmethod
def id_to_title(string):
# Convert capitalization into title form unless acronym or version number
@@ -546,6 +605,32 @@ class Metadata:
def set_gguf_meta_model(self, gguf_writer: gguf.GGUFWriter):
assert self.name is not None
if self.sampling_sequence is not None:
gguf_writer.add_sampling_sequence(self.sampling_sequence)
if self.sampling_top_k is not None:
gguf_writer.add_sampling_top_k(self.sampling_top_k)
if self.sampling_top_p is not None:
gguf_writer.add_sampling_top_p(self.sampling_top_p)
if self.sampling_min_p is not None:
gguf_writer.add_sampling_min_p(self.sampling_min_p)
if self.sampling_xtc_probability is not None:
gguf_writer.add_sampling_xtc_probability(self.sampling_xtc_probability)
if self.sampling_xtc_threshold is not None:
gguf_writer.add_sampling_xtc_threshold(self.sampling_xtc_threshold)
if self.sampling_temp is not None:
gguf_writer.add_sampling_temp(self.sampling_temp)
if self.sampling_penalty_last_n is not None:
gguf_writer.add_sampling_penalty_last_n(self.sampling_penalty_last_n)
if self.sampling_penalty_repeat is not None:
gguf_writer.add_sampling_penalty_repeat(self.sampling_penalty_repeat)
if self.sampling_mirostat is not None:
gguf_writer.add_sampling_mirostat(self.sampling_mirostat)
if self.sampling_mirostat_tau is not None:
gguf_writer.add_sampling_mirostat_tau(self.sampling_mirostat_tau)
if self.sampling_mirostat_eta is not None:
gguf_writer.add_sampling_mirostat_eta(self.sampling_mirostat_eta)
gguf_writer.add_name(self.name)
if self.author is not None:

View File

@@ -246,6 +246,21 @@ extern "C" {
LLAMA_KV_OVERRIDE_TYPE_STR,
};
enum llama_model_meta_key {
LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE,
LLAMA_MODEL_META_KEY_SAMPLING_TOP_K,
LLAMA_MODEL_META_KEY_SAMPLING_TOP_P,
LLAMA_MODEL_META_KEY_SAMPLING_MIN_P,
LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY,
LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD,
LLAMA_MODEL_META_KEY_SAMPLING_TEMP,
LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N,
LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT,
LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT,
LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU,
LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA,
};
struct llama_model_kv_override {
enum llama_model_kv_override_type tag;
@@ -518,6 +533,9 @@ extern "C" {
// Get the number of metadata key/value pairs
LLAMA_API int32_t llama_model_meta_count(const struct llama_model * model);
// Get sampling metadata key name. Returns nullptr if the key is invalid
LLAMA_API const char * llama_model_meta_key_str(enum llama_model_meta_key key);
// Get metadata key name by index
LLAMA_API int32_t llama_model_meta_key_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size);

View File

@@ -114,19 +114,31 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
};
static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
- { LLM_KV_GENERAL_TYPE, "general.type" },
- { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
- { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
- { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
- { LLM_KV_GENERAL_FILE_TYPE, "general.file_type" },
- { LLM_KV_GENERAL_NAME, "general.name" },
- { LLM_KV_GENERAL_AUTHOR, "general.author" },
- { LLM_KV_GENERAL_VERSION, "general.version" },
- { LLM_KV_GENERAL_URL, "general.url" },
- { LLM_KV_GENERAL_DESCRIPTION, "general.description" },
- { LLM_KV_GENERAL_LICENSE, "general.license" },
- { LLM_KV_GENERAL_SOURCE_URL, "general.source.url" },
- { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" },
+ { LLM_KV_GENERAL_TYPE, "general.type" },
+ { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
+ { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
+ { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
+ { LLM_KV_GENERAL_FILE_TYPE, "general.file_type" },
+ { LLM_KV_GENERAL_SAMPLING_SEQUENCE, "general.sampling.sequence" },
+ { LLM_KV_GENERAL_SAMPLING_TOP_K, "general.sampling.top_k" },
+ { LLM_KV_GENERAL_SAMPLING_TOP_P, "general.sampling.top_p" },
+ { LLM_KV_GENERAL_SAMPLING_MIN_P, "general.sampling.min_p" },
+ { LLM_KV_GENERAL_SAMPLING_XTC_PROBABILITY, "general.sampling.xtc_probability" },
+ { LLM_KV_GENERAL_SAMPLING_XTC_THRESHOLD, "general.sampling.xtc_threshold" },
+ { LLM_KV_GENERAL_SAMPLING_TEMP, "general.sampling.temp" },
+ { LLM_KV_GENERAL_SAMPLING_PENALTY_LAST_N, "general.sampling.penalty_last_n" },
+ { LLM_KV_GENERAL_SAMPLING_PENALTY_REPEAT, "general.sampling.penalty_repeat" },
+ { LLM_KV_GENERAL_SAMPLING_MIROSTAT, "general.sampling.mirostat" },
+ { LLM_KV_GENERAL_SAMPLING_MIROSTAT_TAU, "general.sampling.mirostat_tau" },
+ { LLM_KV_GENERAL_SAMPLING_MIROSTAT_ETA, "general.sampling.mirostat_eta" },
+ { LLM_KV_GENERAL_NAME, "general.name" },
+ { LLM_KV_GENERAL_AUTHOR, "general.author" },
+ { LLM_KV_GENERAL_VERSION, "general.version" },
+ { LLM_KV_GENERAL_URL, "general.url" },
+ { LLM_KV_GENERAL_DESCRIPTION, "general.description" },
+ { LLM_KV_GENERAL_LICENSE, "general.license" },
+ { LLM_KV_GENERAL_SOURCE_URL, "general.source.url" },
+ { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" },
{ LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
{ LLM_KV_CONTEXT_LENGTH, "%s.context_length" },

View File

@@ -123,6 +123,18 @@ enum llm_kv {
LLM_KV_GENERAL_QUANTIZATION_VERSION,
LLM_KV_GENERAL_ALIGNMENT,
LLM_KV_GENERAL_FILE_TYPE,
LLM_KV_GENERAL_SAMPLING_SEQUENCE,
LLM_KV_GENERAL_SAMPLING_TOP_K,
LLM_KV_GENERAL_SAMPLING_TOP_P,
LLM_KV_GENERAL_SAMPLING_MIN_P,
LLM_KV_GENERAL_SAMPLING_XTC_PROBABILITY,
LLM_KV_GENERAL_SAMPLING_XTC_THRESHOLD,
LLM_KV_GENERAL_SAMPLING_TEMP,
LLM_KV_GENERAL_SAMPLING_PENALTY_LAST_N,
LLM_KV_GENERAL_SAMPLING_PENALTY_REPEAT,
LLM_KV_GENERAL_SAMPLING_MIROSTAT,
LLM_KV_GENERAL_SAMPLING_MIROSTAT_TAU,
LLM_KV_GENERAL_SAMPLING_MIROSTAT_ETA,
LLM_KV_GENERAL_NAME,
LLM_KV_GENERAL_AUTHOR,
LLM_KV_GENERAL_VERSION,

View File

@@ -1248,7 +1248,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
// make the outputs have the same order they had in the user-provided batch
// note: this is mostly relevant for recurrent models atm
- if (!sorted_output) {
+ if (!sorted_output && n_outputs > 1) {
GGML_ASSERT((size_t) n_outputs == out_ids.size());
// TODO: is there something more efficient which also minimizes swaps?

View File

@@ -7687,6 +7687,24 @@ int32_t llama_model_meta_count(const llama_model * model) {
return (int)model->gguf_kv.size();
}
const char * llama_model_meta_key_str(llama_model_meta_key key) {
switch (key) {
case LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE: return "general.sampling.sequence";
case LLAMA_MODEL_META_KEY_SAMPLING_TOP_K: return "general.sampling.top_k";
case LLAMA_MODEL_META_KEY_SAMPLING_TOP_P: return "general.sampling.top_p";
case LLAMA_MODEL_META_KEY_SAMPLING_MIN_P: return "general.sampling.min_p";
case LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY: return "general.sampling.xtc_probability";
case LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD: return "general.sampling.xtc_threshold";
case LLAMA_MODEL_META_KEY_SAMPLING_TEMP: return "general.sampling.temp";
case LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N: return "general.sampling.penalty_last_n";
case LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT: return "general.sampling.penalty_repeat";
case LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT: return "general.sampling.mirostat";
case LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU: return "general.sampling.mirostat_tau";
case LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA: return "general.sampling.mirostat_eta";
default: return nullptr;
}
}
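A minimal usage sketch pairing the new accessor with the existing llama_model_meta_val_str, mirroring how common_init_sampler_from_model reads these keys (buffer size arbitrary):

    char buf[128] = {0};
    const char * key = llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TEMP);
    if (key != nullptr && llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
        // buf now holds the model-recommended temperature as a string, e.g. "0.8"
    }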
int32_t llama_model_meta_key_by_index(const llama_model * model, int i, char * buf, size_t buf_size) {
if (i < 0 || i >= (int)model->gguf_kv.size()) {
if (buf_size > 0) {

View File

@@ -7859,6 +7859,9 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
}
}
// Qwen3-VL-8B https://github.com/ggml-org/llama.cpp/issues/17012
test_cases.emplace_back(new test_flash_attn_ext(72, 72, 16, {1, 1}, 5776, 5776, false, false, 0, 0, GGML_PREC_F32, GGML_TYPE_F16));
for (int kv : { 4096, 8192, 16384, }) {
for (int hs : { 64, 128, }) {
for (int nr : { 1, 4, }) {

Binary file not shown.

View File

@@ -8,6 +8,7 @@
import rehypeKatex from 'rehype-katex';
import rehypeStringify from 'rehype-stringify';
import { copyCodeToClipboard } from '$lib/utils/copy';
import { rehypeRestoreTableHtml } from '$lib/markdown/table-html-restorer';
import { preprocessLaTeX } from '$lib/utils/latex-protection';
import { browser } from '$app/environment';
import '$styles/katex-custom.scss';
@@ -60,6 +61,7 @@
.use(remarkRehype) // Convert Markdown AST to rehype
.use(rehypeKatex) // Render math using KaTeX
.use(rehypeHighlight) // Add syntax highlighting
.use(rehypeRestoreTableHtml) // Restore limited HTML (e.g., <br>, <ul>) inside Markdown tables
.use(rehypeStringify); // Convert to HTML string
});

View File

@@ -0,0 +1,20 @@
/**
* Matches <br>, <br/>, <br /> tags (case-insensitive).
* Used to detect line breaks in table cell text content.
*/
export const BR_PATTERN = /<br\s*\/?\s*>/gi;
/**
* Matches a complete <ul>...</ul> block.
* Captures the inner content (group 1) for further <li> extraction.
* Case-insensitive, allows multiline content.
*/
export const LIST_PATTERN = /^<ul>([\s\S]*)<\/ul>$/i;
/**
* Matches individual <li>...</li> elements within a list.
* Captures the inner content (group 1) of each list item.
* Non-greedy to handle multiple consecutive items.
* Case-insensitive, allows multiline content.
*/
export const LI_PATTERN = /<li>([\s\S]*?)<\/li>/gi;

View File

@@ -0,0 +1,181 @@
/**
* Rehype plugin to restore limited HTML elements inside Markdown table cells.
*
* ## Problem
* The remark/rehype pipeline neutralizes inline HTML as literal text
* (remarkLiteralHtml) so that XML/HTML snippets in LLM responses display
* as-is instead of being rendered. This causes <br> and <ul> markup in
* table cells to show as plain text.
*
* ## Solution
* This plugin traverses the HAST post-conversion, parses whitelisted HTML
* patterns from text nodes, and replaces them with actual HAST element nodes
* that will be rendered as real HTML.
*
* ## Supported HTML
* - `<br>` / `<br/>` / `<br />` - Line breaks (inline)
* - `<ul><li>...</li></ul>` - Unordered lists (block)
*
* ## Key Implementation Details
*
* ### 1. Sibling Combination (Critical)
* The Markdown pipeline may fragment content across multiple text nodes and `<br>`
* elements. For example, `<ul><li>a</li></ul>` might arrive as:
* - Text: `"<ul>"`
* - Element: `<br>`
* - Text: `"<li>a</li></ul>"`
*
* We must combine consecutive text nodes and `<br>` elements into a single string
* before attempting to parse list markup. Without this, list detection fails.
*
* ### 2. visitParents for Deep Traversal
* Table cell content may be wrapped in intermediate elements (e.g., `<p>` tags).
* Using `visitParents` instead of direct child iteration ensures we find text
* nodes at any depth within the cell.
*
* ### 3. Reference Comparison for No-Op Detection
* When checking if `<br>` expansion changed anything, we compare:
* `expanded.length !== 1 || expanded[0] !== textNode`
*
* This catches both cases:
* - Multiple nodes created (text was split)
* - Single NEW node created (original had only `<br>`, now it's an element)
*
* A simple `length > 1` check would miss the single `<br>` case.
*
* ### 4. Strict List Validation
* `parseList()` rejects malformed markup by checking for garbage text between
* `<li>` elements. This prevents creating broken DOM from partial matches like
* `<ul>garbage<li>a</li></ul>`.
*
* ### 5. Newline Substitution for `<br>` in Combined String
* When combining siblings, existing `<br>` elements become `\n` in the combined
* string. This allows list content to span visual lines while still being parsed
* as a single unit.
*
* @example
* // Input Markdown:
* // | Feature | Notes |
* // |---------|-------|
* // | Multi-line | First<br>Second |
* // | List | <ul><li>A</li><li>B</li></ul> |
* //
* // Without this plugin: <br> and <ul> render as literal text
* // With this plugin: <br> becomes line break, <ul> becomes actual list
*/
import type { Plugin } from 'unified';
import type { Element, ElementContent, Root, Text } from 'hast';
import { visit } from 'unist-util-visit';
import { visitParents } from 'unist-util-visit-parents';
import { BR_PATTERN, LIST_PATTERN, LI_PATTERN } from '$lib/constants/table-html-restorer';
/**
* Expands text containing `<br>` tags into an array of text nodes and br elements.
*/
function expandBrTags(value: string): ElementContent[] {
const matches = [...value.matchAll(BR_PATTERN)];
if (!matches.length) return [{ type: 'text', value } as Text];
const result: ElementContent[] = [];
let cursor = 0;
for (const m of matches) {
if (m.index! > cursor) {
result.push({ type: 'text', value: value.slice(cursor, m.index) } as Text);
}
result.push({ type: 'element', tagName: 'br', properties: {}, children: [] } as Element);
cursor = m.index! + m[0].length;
}
if (cursor < value.length) {
result.push({ type: 'text', value: value.slice(cursor) } as Text);
}
return result;
}
/**
* Parses a `<ul><li>...</li></ul>` string into a HAST element.
* Returns null if the markup is malformed or contains unexpected content.
*/
function parseList(value: string): Element | null {
const match = value.trim().match(LIST_PATTERN);
if (!match) return null;
const body = match[1];
const items: ElementContent[] = [];
let cursor = 0;
for (const liMatch of body.matchAll(LI_PATTERN)) {
// Reject if there's non-whitespace between list items
if (body.slice(cursor, liMatch.index!).trim()) return null;
items.push({
type: 'element',
tagName: 'li',
properties: {},
children: expandBrTags(liMatch[1] ?? '')
} as Element);
cursor = liMatch.index! + liMatch[0].length;
}
// Reject if no items found or trailing garbage exists
if (!items.length || body.slice(cursor).trim()) return null;
return { type: 'element', tagName: 'ul', properties: {}, children: items } as Element;
}
/**
* Processes a single table cell, restoring HTML elements from text content.
*/
function processCell(cell: Element) {
visitParents(cell, 'text', (textNode: Text, ancestors) => {
const parent = ancestors[ancestors.length - 1];
if (!parent || parent.type !== 'element') return;
const parentEl = parent as Element;
const siblings = parentEl.children as ElementContent[];
const startIndex = siblings.indexOf(textNode as ElementContent);
if (startIndex === -1) return;
// Combine consecutive text nodes and <br> elements into one string
let combined = '';
let endIndex = startIndex;
for (let i = startIndex; i < siblings.length; i++) {
const sib = siblings[i];
if (sib.type === 'text') {
combined += (sib as Text).value;
endIndex = i;
} else if (sib.type === 'element' && (sib as Element).tagName === 'br') {
combined += '\n';
endIndex = i;
} else {
break;
}
}
// Try parsing as list first (replaces entire combined range)
const list = parseList(combined);
if (list) {
siblings.splice(startIndex, endIndex - startIndex + 1, list);
return;
}
// Otherwise, just expand <br> tags in this text node
const expanded = expandBrTags(textNode.value);
if (expanded.length !== 1 || expanded[0] !== textNode) {
siblings.splice(startIndex, 1, ...expanded);
}
});
}
export const rehypeRestoreTableHtml: Plugin<[], Root> = () => (tree) => {
visit(tree, 'element', (node: Element) => {
if (node.tagName === 'td' || node.tagName === 'th') {
processCell(node);
}
});
};