mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-04-16 16:27:32 +03:00
Compare commits
9 Commits
b7146
...
0cc4m/mode
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
05429433a1 | ||
|
|
064c90d843 | ||
|
|
b1846f1c8e | ||
|
|
d414db02d3 | ||
|
|
877566d512 | ||
|
|
3d07caa99b | ||
|
|
134e6940ca | ||
|
|
0543f928a3 | ||
|
|
b61de2b2df |
@@ -1232,6 +1232,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params, const std::string & value) {
|
||||
const auto sampler_names = string_split<std::string>(value, ';');
|
||||
params.sampling.samplers = common_sampler_types_from_names(sampler_names, true);
|
||||
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS;
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
@@ -1261,6 +1262,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.sampling.temp = std::stof(value);
|
||||
params.sampling.temp = std::max(params.sampling.temp, 0.0f);
|
||||
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TEMP;
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
@@ -1268,6 +1270,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
string_format("top-k sampling (default: %d, 0 = disabled)", params.sampling.top_k),
|
||||
[](common_params & params, int value) {
|
||||
params.sampling.top_k = value;
|
||||
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_K;
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
@@ -1275,6 +1278,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sampling.top_p),
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.sampling.top_p = std::stof(value);
|
||||
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_P;
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
@@ -1282,6 +1286,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sampling.min_p),
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.sampling.min_p = std::stof(value);
|
||||
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIN_P;
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
@@ -1296,6 +1301,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability),
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.sampling.xtc_probability = std::stof(value);
|
||||
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY;
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
@@ -1303,6 +1309,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sampling.xtc_threshold),
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.sampling.xtc_threshold = std::stof(value);
|
||||
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD;
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
@@ -1321,6 +1328,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
}
|
||||
params.sampling.penalty_last_n = value;
|
||||
params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n);
|
||||
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N;
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
@@ -1328,6 +1336,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sampling.penalty_repeat),
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.sampling.penalty_repeat = std::stof(value);
|
||||
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT;
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
@@ -1425,6 +1434,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
"(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sampling.mirostat),
|
||||
[](common_params & params, int value) {
|
||||
params.sampling.mirostat = value;
|
||||
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT;
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
@@ -1432,6 +1442,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sampling.mirostat_eta),
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.sampling.mirostat_eta = std::stof(value);
|
||||
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA;
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
@@ -1439,6 +1450,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sampling.mirostat_tau),
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.sampling.mirostat_tau = std::stof(value);
|
||||
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU;
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
#include "llama.h"
|
||||
#include "sampling.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cinttypes>
|
||||
@@ -949,6 +950,58 @@ std::vector<common_file_info> fs_list_files(const std::string & path) {
|
||||
// Model utils
|
||||
//
|
||||
|
||||
// Fill sampling parameters from metadata embedded in the model file,
// but only for parameters the user did NOT explicitly set on the command
// line (explicit settings are tracked as bits in sparams.user_sampling_config).
static inline void common_init_sampler_from_model(
        const llama_model * model,
        common_params_sampling & sparams) {

    // snapshot of the user-override bitfield; checked before every read
    const uint64_t config = sparams.user_sampling_config;

    // Read an integer metadata value into dst unless the matching user bit is set.
    // NOTE(review): strtol returns long, narrowed to int32_t here — fine for the
    // expected sampling values, but out-of-range metadata would wrap; confirm acceptable.
    auto get_int32 = [&](const char * key, int32_t & dst, uint64_t user_config) {
        if (config & user_config) return;

        char buf[64] = {0};
        if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
            char * end = nullptr;
            int32_t v = strtol(buf, &end, 10);
            // only accept the value if at least one character was consumed
            if (end && end != buf) dst = v;
        }
    };

    // Read a float metadata value into dst unless the matching user bit is set.
    auto get_float = [&](const char * key, float & dst, uint64_t user_config) {
        if (config & user_config) return;

        char buf[128] = {0};
        if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
            char * end = nullptr;
            float v = strtof(buf, &end);
            // only accept the value if at least one character was consumed
            if (end && end != buf) dst = v;
        }
    };

    // Sampling sequence: a ';'-separated list of sampler names stored in the model
    if (!(config & common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS)) {
        char buf[512] = {0};
        if (llama_model_meta_val_str(model, llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE), buf, sizeof(buf)) > 0) {
            const std::vector<std::string> sampler_names = string_split<std::string>(std::string(buf), ';');
            if (!sampler_names.empty()) {
                sparams.samplers = common_sampler_types_from_names(sampler_names, true);
            }
        }
    }

    // Scalar sampling parameters: each falls back to the model-embedded value
    // only when the corresponding user-override bit is clear.
    get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TOP_K), sparams.top_k, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_K);
    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TOP_P), sparams.top_p, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_P);
    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIN_P), sparams.min_p, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIN_P);
    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY), sparams.xtc_probability, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY);
    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD), sparams.xtc_threshold, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD);
    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TEMP), sparams.temp, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TEMP);
    get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N), sparams.penalty_last_n, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N);
    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT), sparams.penalty_repeat, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT);
    get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT), sparams.mirostat, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT);
    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU), sparams.mirostat_tau, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU);
    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA), sparams.mirostat_eta, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA);
}
|
||||
|
||||
struct common_init_result common_init_from_params(common_params & params) {
|
||||
common_init_result iparams;
|
||||
auto mparams = common_model_params_to_llama(params);
|
||||
@@ -960,6 +1013,8 @@ struct common_init_result common_init_from_params(common_params & params) {
|
||||
return iparams;
|
||||
}
|
||||
|
||||
common_init_sampler_from_model(model, params.sampling);
|
||||
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
|
||||
auto cparams = common_context_params_to_llama(params);
|
||||
|
||||
@@ -140,6 +140,22 @@ struct common_grammar_trigger {
|
||||
llama_token token = LLAMA_TOKEN_NULL;
|
||||
};
|
||||
|
||||
// Bitfield flags recording which sampling parameters the user explicitly set
// on the command line, so that model-embedded defaults do not override them.
// Shifts use 1ULL (not int 1): the underlying type is uint64_t, and an int
// shift would be UB once a flag needs bit 31 or higher.
enum common_params_sampling_config : uint64_t {
    COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS        = 1ULL << 0,
    COMMON_PARAMS_SAMPLING_CONFIG_TOP_K           = 1ULL << 1,
    COMMON_PARAMS_SAMPLING_CONFIG_TOP_P           = 1ULL << 2,
    COMMON_PARAMS_SAMPLING_CONFIG_MIN_P           = 1ULL << 3,
    COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY = 1ULL << 4,
    COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD   = 1ULL << 5,
    COMMON_PARAMS_SAMPLING_CONFIG_TEMP            = 1ULL << 6,
    COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N  = 1ULL << 7,
    COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT  = 1ULL << 8,
    COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT        = 1ULL << 9,
    COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU    = 1ULL << 10,
    COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA    = 1ULL << 11,
};
|
||||
|
||||
|
||||
// sampling parameters
|
||||
struct common_params_sampling {
|
||||
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
|
||||
@@ -172,6 +188,8 @@ struct common_params_sampling {
|
||||
bool no_perf = false; // disable performance metrics
|
||||
bool timing_per_token = false;
|
||||
|
||||
uint64_t user_sampling_config = 0; // bitfield to track user-specified samplers
|
||||
|
||||
std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
|
||||
|
||||
|
||||
|
||||
@@ -565,7 +565,7 @@ class ModelBase:
|
||||
gguf.MODEL_TENSOR.ALTUP_PREDICT_COEF,
|
||||
)
|
||||
)
|
||||
or not new_name.endswith(".weight")
|
||||
or new_name[-7:] not in (".weight", ".lora_a", ".lora_b")
|
||||
):
|
||||
data_qtype = gguf.GGMLQuantizationType.F32
|
||||
|
||||
|
||||
@@ -242,7 +242,7 @@ def parse_args() -> argparse.Namespace:
|
||||
help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
|
||||
"--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f32",
|
||||
help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
|
||||
)
|
||||
parser.add_argument(
|
||||
|
||||
@@ -17,6 +17,7 @@ else()
|
||||
add_subdirectory(batched)
|
||||
add_subdirectory(embedding)
|
||||
add_subdirectory(eval-callback)
|
||||
add_subdirectory(model-backend-compare)
|
||||
|
||||
add_subdirectory(gguf-hash)
|
||||
add_subdirectory(gguf)
|
||||
|
||||
15
examples/model-backend-compare/CMakeLists.txt
Normal file
15
examples/model-backend-compare/CMakeLists.txt
Normal file
@@ -0,0 +1,15 @@
|
||||
# Build the model-backend-compare example: runs a model on the CPU backend to
# gather reference tensor outputs, then re-runs on each other backend and
# compares per-node results.
set(TARGET llama-model-backend-compare)
add_executable(${TARGET} model-backend-compare.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

# Register a smoke test that downloads a tiny model from HF (hence the "curl"
# label). s390x is big-endian, so it uses the byte-swapped model file.
set(TEST_TARGET test-model-backend-compare)
if(NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
    add_test(NAME ${TEST_TARGET}
        COMMAND llama-model-backend-compare --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
else()
    add_test(NAME ${TEST_TARGET}
        COMMAND llama-model-backend-compare --hf-repo ggml-org/models --hf-file tinyllamas/stories260K-be.gguf --model stories260K-be.gguf --prompt hello --seed 42 -ngl 0)
endif()
set_property(TEST ${TEST_TARGET} PROPERTY LABELS model-backend-compare curl)
|
||||
384
examples/model-backend-compare/model-backend-compare.cpp
Normal file
384
examples/model-backend-compare/model-backend-compare.cpp
Normal file
@@ -0,0 +1,384 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
#include "ggml.h"
|
||||
|
||||
#include <vector>
|
||||
#include <cstdint>
|
||||
#include <unordered_map>
|
||||
#include <string>
|
||||
#include <sstream>
|
||||
#include <cstring>
|
||||
#include <cmath>
|
||||
|
||||
namespace {
|
||||
constexpr double nmse_threshold = 1e-2;
|
||||
|
||||
// State shared with the eval callback across both the gather pass and the
// comparison passes.
struct callback_data {
    std::vector<uint8_t> data;           // scratch buffer for copying non-host tensor bytes
    std::vector<float> device_results;   // current tensor's device output converted to float
    std::unordered_map<std::string, std::vector<float>> cpu_results; // CPU reference outputs keyed by tensor name
};
|
||||
|
||||
bool gather = true;
|
||||
|
||||
// normalized mean squared error = mse(a, b) / mse(a, 0)
//
// a is the reference array, b the array under test, n the element count.
// Guards the all-zero-reference case: the original divided by zero and
// returned NaN, which defeated the `error > threshold` comparison downstream.
double nmse(const float * a, const float * b, size_t n) {
    double mse_a_b = 0.0; // sum of squared differences
    double mse_a_0 = 0.0; // sum of squared reference values (normalizer)

    for (size_t i = 0; i < n; i++) {
        float a_i = a[i];
        float b_i = b[i];

        mse_a_b += (a_i - b_i) * (a_i - b_i);
        mse_a_0 += a_i * a_i;
    }

    // all-zero reference: identical arrays -> 0 error, otherwise infinite error
    if (mse_a_0 == 0.0) {
        return mse_a_b == 0.0 ? 0.0 : INFINITY;
    }

    return mse_a_b / mse_a_0;
}
|
||||
|
||||
void ggml_print_tensor(const ggml_tensor * t, const std::vector<float> data, int64_t n) {
|
||||
GGML_ASSERT(n > 0);
|
||||
float sum = 0;
|
||||
for (int64_t i3 = 0; i3 < t->ne[3]; i3++) {
|
||||
for (int64_t i2 = 0; i2 < t->ne[2]; i2++) {
|
||||
for (int64_t i1 = 0; i1 < t->ne[1]; i1++) {
|
||||
for (int64_t i0 = 0; i0 < t->ne[0]; i0++) {
|
||||
const float v = data[i3 * t->ne[2] * t->ne[1] * t->ne[0] + i2 * t->ne[1] * t->ne[0] + i1 * t->ne[0] + i0];
|
||||
sum += v;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int64_t i3 = 0; i3 < t->ne[3]; i3++) {
|
||||
LOG(" [\n");
|
||||
for (int64_t i2 = 0; i2 < t->ne[2]; i2++) {
|
||||
if (i2 == n && t->ne[2] > 2*n) {
|
||||
LOG(" ..., \n");
|
||||
i2 = t->ne[2] - n;
|
||||
}
|
||||
LOG(" [\n");
|
||||
for (int64_t i1 = 0; i1 < t->ne[1]; i1++) {
|
||||
if (i1 == n && t->ne[1] > 2*n) {
|
||||
LOG(" ..., \n");
|
||||
i1 = t->ne[1] - n;
|
||||
}
|
||||
LOG(" [");
|
||||
for (int64_t i0 = 0; i0 < t->ne[0]; i0++) {
|
||||
if (i0 == n && t->ne[0] > 2*n) {
|
||||
LOG("..., ");
|
||||
i0 = t->ne[0] - n;
|
||||
}
|
||||
const float v = data[i3 * t->ne[2] * t->ne[1] * t->ne[0] + i2 * t->ne[1] * t->ne[0] + i1 * t->ne[0] + i0];
|
||||
LOG("%12.4f", v);
|
||||
if (i0 < t->ne[0] - 1) LOG(", ");
|
||||
}
|
||||
LOG("],\n");
|
||||
}
|
||||
LOG(" ],\n");
|
||||
}
|
||||
LOG(" ]\n");
|
||||
LOG(" sum = %f\n", sum);
|
||||
}
|
||||
|
||||
if (std::isnan(sum)) {
|
||||
LOG_ERR("encountered NaN - aborting\n");
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
|
||||
// Convert a bfloat16 value to float: bf16 is the upper 16 bits of an IEEE-754
// binary32, so shifting the bits into the high half reconstructs the float.
// Uses memcpy instead of a union for well-defined type punning in C++
// (union member-switching is UB in C++, unlike C).
inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
    const uint32_t bits = (uint32_t) h.bits << 16;
    float f;
    memcpy(&f, &bits, sizeof(f));
    return f;
}
|
||||
|
||||
float to_float(const uint8_t * ptr, ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_F32:
|
||||
return *(const float *)ptr;
|
||||
case GGML_TYPE_F16:
|
||||
return ggml_fp16_to_fp32(*(const ggml_fp16_t *)ptr);
|
||||
case GGML_TYPE_BF16:
|
||||
return ggml_compute_bf16_to_fp32(*(const ggml_bf16_t *)ptr);
|
||||
case GGML_TYPE_I8:
|
||||
return static_cast<float>(*(const int8_t *)ptr);
|
||||
case GGML_TYPE_I16:
|
||||
return static_cast<float>(*(const int16_t *)ptr);
|
||||
case GGML_TYPE_I32:
|
||||
return static_cast<float>(*(const int32_t *)ptr);
|
||||
case GGML_TYPE_I64:
|
||||
return static_cast<float>(*(const int64_t *)ptr);
|
||||
default:
|
||||
GGML_ABORT("unsupported ggml_type %d in to_float", type);
|
||||
}
|
||||
return 0.0f;
|
||||
}
|
||||
|
||||
void tensor_to_float_array(const ggml_tensor * t, const void * data, std::vector<float> & out) {
|
||||
const size_t n_elements = ggml_nelements(t);
|
||||
out.resize(n_elements);
|
||||
|
||||
// convert to float
|
||||
size_t idx = 0;
|
||||
for (int64_t i3 = 0; i3 < t->ne[3]; i3++) {
|
||||
for (int64_t i2 = 0; i2 < t->ne[2]; i2++) {
|
||||
for (int64_t i1 = 0; i1 < t->ne[1]; i1++) {
|
||||
if (!ggml_is_quantized(t->type)) {
|
||||
for (int64_t i0 = 0; i0 < t->ne[0]; i0++) {
|
||||
const uint8_t * ptr = ((const uint8_t *)data) + i3 * t->nb[3] + i2 * t->nb[2] + i1 * t->nb[1] + i0 * t->nb[0];
|
||||
|
||||
out[idx] = to_float(ptr, t->type);
|
||||
idx++;
|
||||
}
|
||||
} else {
|
||||
GGML_ABORT("quantized types are not supported in tensor_to_float_array");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// A node is "empty" for comparison purposes when it produces no computed
// data: it has no elements, or it is a no-op / pure layout operation.
bool tensor_is_empty(ggml_tensor * node) {
    if (ggml_is_empty(node)) {
        return true;
    }
    switch (node->op) {
        case GGML_OP_NONE:
        case GGML_OP_RESHAPE:
        case GGML_OP_TRANSPOSE:
        case GGML_OP_VIEW:
        case GGML_OP_PERMUTE:
            return true;
        default:
            return false;
    }
}
|
||||
|
||||
// Strip the backend prefix and trailing index from an instrumented tensor
// name, e.g. "Vulkan0#inp_embd#0" -> "inp_embd".
// Names without two distinct '#' delimiters are returned unchanged.
std::string remove_device_from_name(const std::string & name) {
    const size_t first = name.find('#');
    const size_t last  = name.rfind('#');

    if (first == std::string::npos || last <= first) {
        return name; // no '#'-delimited infix to extract
    }

    return name.substr(first + 1, last - first - 1);
}
|
||||
|
||||
// Build an identifier for a tensor of the form "name[src_a, src_b, ...]",
// where each source name has its device prefix/suffix stripped. The tensor's
// own name is used verbatim.
std::string tensor_name(ggml_tensor * t) {
    std::stringstream out;
    out << std::string(t->name, strnlen(t->name, GGML_MAX_NAME)) << "[";

    // position of the final non-null source, so separators go only between entries
    size_t last_src = 0;
    for (size_t k = 0; k < GGML_MAX_SRC; k++) {
        if (t->src[k]) {
            last_src = k;
        }
    }

    for (size_t k = 0; k < GGML_MAX_SRC; k++) {
        if (!t->src[k]) {
            continue;
        }
        const std::string src_name(t->src[k]->name, strnlen(t->src[k]->name, GGML_MAX_NAME));
        out << remove_device_from_name(src_name);
        if (k < last_src) {
            out << ", ";
        }
    }

    out << "]";
    return out.str();
}
|
||||
|
||||
// Scheduler eval callback, invoked per graph node. When the global `gather`
// flag is set it records the node's output as CPU reference data keyed by
// tensor name; otherwise it fetches the device output and compares it against
// the stored reference via NMSE, dumping both tensors on mismatch.
// Returning false stops the graph computation.
bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
    auto * cb_data = (callback_data *) user_data;

    // `ask` phase: answer "yes, retrieve data"; view/empty nodes carry no data
    if (ask || tensor_is_empty(t)) {
        return true; // Always retrieve data
    }

    const std::string name = tensor_name(t);

    if (gather) {
        // CPU data should be host-visible
        GGML_ASSERT(ggml_backend_buffer_is_host(t->buffer));

        // Make sure this tensor does not exist yet
        if (cb_data->cpu_results.find(name) != cb_data->cpu_results.end())
        {
            LOG_ERR("%s : tensor '%s' already exists in CPU reference data\n", __func__, name.c_str());
            GGML_ABORT("fatal error");
        }

        // operator[] inserts the (empty) entry; filled in place below
        std::vector<float>& result = cb_data->cpu_results[name];

        // LOG("gathering CPU reference data for tensor '%s'\n", name.c_str());
        // for (size_t i = 0; i < GGML_MAX_DIMS; i++) {
        //     LOG("  ne[%zu] = %lld\n", i, t->ne[i]);
        // }
        // for (size_t i = 0; i < GGML_MAX_SRC; i++) {
        //     if (t->src[i]) {
        //         const std::string src_name(t->src[i]->name, strnlen(t->src[i]->name, GGML_MAX_NAME));
        //         LOG("  src[%zu] = %s\n", i, src_name.c_str());
        //     }
        // }

        tensor_to_float_array(t, t->data, result);

        return true;
    }

    // Compare with CPU data if available
    auto it = cb_data->cpu_results.find(name);
    if (it == cb_data->cpu_results.end()) {
        LOG_ERR("no CPU reference data for tensor '%s'\n", name.c_str());
        return true;
    }

    const bool is_host = ggml_backend_buffer_is_host(t->buffer);
    const size_t n_bytes = ggml_nbytes(t);

    const uint8_t * data;

    if (!is_host) {
        // device memory: copy into the reusable scratch buffer first
        if (cb_data->data.size() < n_bytes) {
            cb_data->data.resize(n_bytes);
        }

        ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
        data = cb_data->data.data();
    } else {
        data = (const uint8_t *) t->data;
    }

    tensor_to_float_array(t, data, cb_data->device_results);

    const std::vector<float>& ref_data = it->second;

    double error = nmse(ref_data.data(), cb_data->device_results.data(), ref_data.size());

    if (error > nmse_threshold) {
        // mismatch: dump shapes, strides, sources and both tensors, then abort the graph
        LOG_ERR("nmse = %.12f tensor '%s' op=%s\n", error, name.c_str(), ggml_op_name(t->op));
        LOG_ERR(" ne: ");
        for (int i = 0; i < GGML_MAX_DIMS; i++) {
            // NOTE(review): ne[i] is int64_t; "%ld" is only correct where long
            // is 64-bit — consider "%" PRId64 for portability (e.g. Windows).
            LOG_ERR("%ld ", t->ne[i]);
        }
        LOG_ERR("\n nb: ");
        for (int i = 0; i < GGML_MAX_DIMS; i++) {
            LOG_ERR("%zu ", t->nb[i]);
        }
        LOG_ERR("\n\n");
        for (int i = 0; i < GGML_MAX_SRC; i++) {
            if (t->src[i]) {
                const std::string src_name(t->src[i]->name, strnlen(t->src[i]->name, GGML_MAX_NAME));
                LOG_ERR(" src%d: %s\n", i, src_name.c_str());
            }
        }

        LOG_ERR("CPU reference data for tensor '%s':\n", name.c_str());
        ggml_print_tensor(t, ref_data, 2);

        LOG_ERR("Device data for tensor '%s':\n", name.c_str());
        ggml_print_tensor(t, cb_data->device_results, 2);
        return false;
    } else {
        LOG("nmse = %.12f tensor '%s' op = %s\n", error, name.c_str(), ggml_op_name(t->op));
    }

    return true;
}
|
||||
|
||||
// Tokenize params.prompt and run a single decode pass over the whole prompt.
// Returns false when tokenization yields no tokens or decoding fails; the
// per-node comparison happens inside the eval callback during llama_decode.
bool run(llama_context * ctx, const common_params & params) {
    const llama_model * model = llama_get_model(ctx);
    const llama_vocab * vocab = llama_model_get_vocab(model);

    const bool add_bos = llama_vocab_get_add_bos(vocab);

    std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos);

    if (tokens.empty()) {
        // message wording fixed: "are not" -> "are no"
        LOG_ERR("%s : there are no input tokens to process - (try to provide a prompt with '-p')\n", __func__);
        return false;
    }

    // explicit cast: tokens.size() is size_t, the batch token count is int32_t
    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), (int32_t) tokens.size()))) {
        LOG_ERR("%s : failed to eval\n", __func__);
        return false;
    }

    return true;
}
|
||||
} // namespace
|
||||
|
||||
// Entry point: iterate over all backend devices twice — first pass runs only
// the CPU device(s) to gather reference tensor data, second pass runs only
// non-CPU devices and compares their per-node outputs to the reference.
int main(int argc, char ** argv) {
    callback_data cb_data;

    common_params params;
    params.prompt = "The quick brown fox";
    params.sampling.seed = 1234;

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
        return 1;
    }

    common_init();

    llama_backend_init();

    // pass the callback to the backend scheduler
    // it will be executed for each node during the graph computation
    params.cb_eval = ggml_debug;
    params.cb_eval_user_data = &cb_data;
    params.warmup = false;

    // keep each run on a single device so results are attributable
    params.split_mode = LLAMA_SPLIT_MODE_NONE;

    const size_t n_dev = ggml_backend_dev_count();

    // i in [0, n_dev): gather pass (CPU only); i in [n_dev, 2*n_dev): compare pass
    for (size_t i = 0; i < n_dev * 2; i++) {
        ggml_backend_dev_t device = ggml_backend_dev_get(i % ggml_backend_dev_count());

        // Run CPU-only first to gather reference results
        if ((i < n_dev && ggml_backend_dev_type(device) != GGML_BACKEND_DEVICE_TYPE_CPU) ||
            (i >= n_dev && ggml_backend_dev_type(device) == GGML_BACKEND_DEVICE_TYPE_CPU)) {
            continue;
        }

        params.devices.clear();
        params.devices.push_back(device);

        if (i < n_dev) {
            LOG_INF("=== Running on device %zu (gathering reference results) ===\n", i);
            gather = true;
        } else {
            LOG_INF("=== Running on device %zu ===\n", i - n_dev);
            gather = false;
        }

        // init
        common_init_result llama_init = common_init_from_params(params);

        llama_model * model = llama_init.model.get();
        llama_context * ctx = llama_init.context.get();

        if (model == nullptr || ctx == nullptr) {
            LOG_ERR("%s : failed to init\n", __func__);
            return 1;
        }

        // print system information
        {
            LOG_INF("\n");
            LOG_INF("%s\n", common_params_get_system_info(params).c_str());
            LOG_INF("\n");
        }

        bool OK = run(ctx, params);
        if (!OK) {
            return 1;
        }

        LOG("\n");
        llama_perf_context_print(ctx);

        // NOTE(review): llama_backend_free() appears to execute inside the
        // device loop, yet llama_backend_init() is only called once above —
        // verify whether this should run once after the loop instead.
        llama_backend_free();
    }

    return 0;
}
|
||||
@@ -42,6 +42,7 @@
|
||||
#include <aclnnop/aclnn_exp.h>
|
||||
#include <aclnnop/aclnn_fill_scalar.h>
|
||||
#include <aclnnop/aclnn_fused_infer_attention_score_v2.h>
|
||||
#include <aclnnop/aclnn_ger.h>
|
||||
#include <aclnnop/aclnn_group_norm.h>
|
||||
#include <aclnnop/aclnn_grouped_matmul_v3.h>
|
||||
#include <aclnnop/aclnn_gt_scalar.h>
|
||||
@@ -3236,3 +3237,64 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context & ctx, ggml_tensor * dst
|
||||
GGML_ABORT("Function is not implemented.");
|
||||
}
|
||||
}
|
||||
|
||||
// Floating-point (F32/F16) implementation of GGML_OP_OUT_PROD for the CANN
// backend: zero-initialize dst, then for every (i2, i3) batch slice accumulate
// ne11 rank-1 outer products (aclnn Ger) of an input row with a weight row.
static void ggml_cann_out_prod_fp(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
    ggml_tensor * src0 = dst->src[0]; // weight
    ggml_tensor * src1 = dst->src[1]; // input
    GGML_TENSOR_BINARY_OP_LOCALS

    // dst must start at zero since results are accumulated into it
    acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
    GGML_CANN_CALL_ACLNN_OP(ctx, InplaceZero, acl_dst.get());

    // broadcast ratios for src0's batch dims (ne2/ne3 are multiples of ne02/ne03)
    const int64_t dps2 = ne2 / ne02;
    const int64_t dps3 = ne3 / ne03;
    for (int64_t i3 = 0; i3 < ne3; i3++) {
        for (int64_t i2 = 0; i2 < ne2; i2++) {
            const int64_t i02 = i2 / dps2;
            const int64_t i03 = i3 / dps3;

            const int64_t i12 = i2;
            const int64_t i13 = i3;
            // 2D view of the current dst batch slice, accumulated in place
            acl_tensor_ptr accumulator =
                ggml_cann_create_tensor((char *) dst->data + i2 * nb2 + i3 * nb3, ggml_cann_type_mapping(dst->type),
                                        ggml_type_size(dst->type), dst->ne, dst->nb, 2);

            // The outer product needs to be accumulated in this dimension.
            for (int64_t i1 = 0; i1 < ne11; i1++) {
                // 1D views of the i1-th input row and weight column
                acl_tensor_ptr acl_input = ggml_cann_create_tensor(
                    (char *) src1->data + i1 * nb11 + i12 * nb12 + i13 * nb13, ggml_cann_type_mapping(src0->type),
                    ggml_type_size(src0->type), src1->ne, src1->nb, 1);

                acl_tensor_ptr acl_weight = ggml_cann_create_tensor(
                    (char *) src0->data + i1 * nb01 + i02 * nb02 + i03 * nb03, ggml_cann_type_mapping(src0->type),
                    ggml_type_size(src0->type), src0->ne, src0->nb, 1);

                // NOTE(review): this allocates a full dst-sized scratch buffer on
                // every i1 iteration — consider hoisting the allocation out of the
                // loops and reusing it; confirm pool semantics allow that.
                ggml_cann_pool_alloc output_allocator(ctx.pool());
                void * output_buffer = output_allocator.alloc(ggml_nbytes(dst));
                acl_tensor_ptr acl_out = ggml_cann_create_tensor(output_buffer, ggml_cann_type_mapping(dst->type),
                                                                 ggml_type_size(dst->type), dst->ne, dst->nb, 2);

                // out = input (x) weight, then accumulator += 1.0 * out
                GGML_CANN_CALL_ACLNN_OP(ctx, Ger, acl_input.get(), acl_weight.get(), acl_out.get());
                float alpha_value = 1.0f;
                // NOTE(review): alpha is created per iteration and never destroyed
                // here — verify whether aclDestroyScalar is required to avoid a leak.
                aclScalar * alpha = aclCreateScalar(&alpha_value, ACL_FLOAT);
                GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdd, accumulator.get(), acl_out.get(), alpha);
            }
        }
    }
}
|
||||
|
||||
// Dispatch GGML_OP_OUT_PROD to a type-specific CANN implementation.
// Only floating-point (F32/F16) source tensors are currently supported;
// any other type is a hard error (ggml_backend_cann_supports_op should have
// filtered it out earlier).
void ggml_cann_out_prod(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
    ggml_tensor * src0 = dst->src[0];

    const enum ggml_type type = src0->type;

    switch (type) {
        case GGML_TYPE_F32:
        case GGML_TYPE_F16:
            ggml_cann_out_prod_fp(ctx, dst);
            break;
        default:
            // abort message typo fixed: "Unsupport" -> "Unsupported"
            GGML_ABORT("Unsupported type for GGML_OP_OUT_PROD");
            break;
    }
}
|
||||
|
||||
@@ -1125,3 +1125,23 @@ void ggml_cann_op_unary_gated(std::function<void(ggml_backend_cann_context &, ac
|
||||
} while (0)
|
||||
|
||||
#endif // CANN_ACLNN_OPS
|
||||
|
||||
/**
|
||||
* @brief Performs outer product operation on two ggml tensors using the CANN backend.
|
||||
*
|
||||
* @details This function computes the outer product of two input tensors (src0 and src1)
|
||||
* and stores the result in the destination tensor. The outer product operation is defined as:
|
||||
* dst[i,j,k,l] = sum_m (src0[i,m,k,l] * src1[j,m,k,l])
|
||||
*
|
||||
* The function supports multiple data types including F32, F16. For floating-point
|
||||
* types, it uses batch matrix multiplication for efficient computation.
|
||||
*
|
||||
* The implementation handles 4D tensor broadcasting and batch processing automatically.
|
||||
*
|
||||
* @param ctx The CANN backend context for operation execution and memory management.
|
||||
* @param dst The destination ggml_tensor where the outer product result will be stored.
|
||||
* The input tensors are assumed to be `dst->src[0]` and `dst->src[1]`.
|
||||
*
|
||||
* @see GGML_CANN_CALL_ACLNN_OP for CANN operator invocation
|
||||
*/
|
||||
void ggml_cann_out_prod(ggml_backend_cann_context & ctx, ggml_tensor * dst);
|
||||
|
||||
@@ -1886,6 +1886,9 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context & ctx, struct gg
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
ggml_cann_flash_attn_ext(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_OUT_PROD:
|
||||
ggml_cann_out_prod(ctx, dst);
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
@@ -2563,6 +2566,16 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
|
||||
case GGML_OP_PAD_REFLECT_1D:
|
||||
case GGML_OP_COUNT_EQUAL:
|
||||
return true;
|
||||
case GGML_OP_OUT_PROD:
|
||||
{
|
||||
switch (op->src[0]->type) {
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_F32:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
// TODO: ((weightL - 1) * dilationW - padLeft)=1336 should not be larger than 255.
|
||||
return (op->src[0]->ne[0] - 1) <= 255;
|
||||
|
||||
@@ -73,34 +73,7 @@ namespace ggml_cuda_mma {
|
||||
static constexpr int I = I_;
|
||||
static constexpr int J = J_;
|
||||
|
||||
#if defined(GGML_USE_HIP)
|
||||
#if defined(RDNA4)
|
||||
static constexpr int ne = I * J / 32;
|
||||
T x[ne] = {0};
|
||||
|
||||
static constexpr __device__ bool supported() {
|
||||
if (I == 16 && J == 16) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ int get_i(const int l) {
|
||||
if constexpr (I == 16 && J == 16) {
|
||||
return 8 * (threadIdx.x / 16) + l;
|
||||
} else {
|
||||
NO_DEVICE_CODE;
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ int get_j(const int l) {
|
||||
if constexpr (I == 16 && J == 16) {
|
||||
return threadIdx.x % 16;
|
||||
} else {
|
||||
NO_DEVICE_CODE;
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
#else
|
||||
#if defined(AMD_MFMA_AVAILABLE)
|
||||
static constexpr int ne = I * J / 64;
|
||||
T x[ne] = {0};
|
||||
|
||||
@@ -146,7 +119,6 @@ namespace ggml_cuda_mma {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
#endif // defined(RDNA4)
|
||||
#elif __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
||||
static constexpr int ne = I * J / 32;
|
||||
T x[ne] = {0};
|
||||
@@ -177,6 +149,34 @@ namespace ggml_cuda_mma {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
#elif defined(AMD_WMMA_AVAILABLE)
|
||||
#if defined(RDNA4)
|
||||
static constexpr int ne = I * J / 32;
|
||||
T x[ne] = {0};
|
||||
|
||||
static constexpr __device__ bool supported() {
|
||||
if (I == 16 && J == 16) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ int get_i(const int l) {
|
||||
if constexpr (I == 16 && J == 16) {
|
||||
return 8 * (threadIdx.x / 16) + l;
|
||||
} else {
|
||||
NO_DEVICE_CODE;
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ int get_j(const int l) {
|
||||
if constexpr (I == 16 && J == 16) {
|
||||
return threadIdx.x % 16;
|
||||
} else {
|
||||
NO_DEVICE_CODE;
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#else
|
||||
static constexpr int ne = I * J / 32;
|
||||
T x[ne] = {0};
|
||||
@@ -437,7 +437,20 @@ namespace ggml_cuda_mma {
|
||||
xi[0] = xs[0];
|
||||
}
|
||||
#elif defined(AMD_WMMA_AVAILABLE)
|
||||
ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
|
||||
if constexpr (I == 16 && J == 4) {
|
||||
int64_t * xi = (int64_t *) t.x;
|
||||
const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 2 * (threadIdx.x / t.I));
|
||||
xi[0] = xs[0];
|
||||
}else if constexpr (I == 16 && J == 8) {
|
||||
int64_t * xi = (int64_t *) t.x;
|
||||
const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 4 * (threadIdx.x / t.I));
|
||||
xi[0] = xs[0];
|
||||
|
||||
const int64_t * xs1 = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 4 * (threadIdx.x / t.I) + 2);
|
||||
xi[1] = xs1[0];
|
||||
}else{
|
||||
NO_DEVICE_CODE;
|
||||
}
|
||||
#else
|
||||
#pragma unroll
|
||||
for (int l = 0; l < t.ne; ++l) {
|
||||
@@ -772,6 +785,36 @@ namespace ggml_cuda_mma {
|
||||
acc[0],
|
||||
0, 0, 0);
|
||||
#endif // defined(CDNA3)
|
||||
|
||||
#elif defined(AMD_WMMA_AVAILABLE)
|
||||
using int32x2_t = __attribute__((__vector_size__(2 * sizeof(int)))) int;
|
||||
int32x2_t * a_vec = (int32x2_t *) A.x;
|
||||
int32x2_t * b_vec = (int32x2_t *) B.x;
|
||||
|
||||
using int32x8_t = __attribute__((__vector_size__(8 * sizeof(int)))) int;
|
||||
int32x8_t * acc = (int32x8_t *) D.x;
|
||||
|
||||
#if defined(RDNA4)
|
||||
|
||||
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(
|
||||
true,
|
||||
a_vec[0],
|
||||
true,
|
||||
b_vec[0],
|
||||
acc[0],
|
||||
true
|
||||
);
|
||||
|
||||
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(
|
||||
true,
|
||||
a_vec[1],
|
||||
true,
|
||||
b_vec[1],
|
||||
acc[0],
|
||||
true
|
||||
);
|
||||
#endif // defined(RDNA4)
|
||||
|
||||
#else
|
||||
GGML_UNUSED_VARS(D, A, B);
|
||||
NO_DEVICE_CODE;
|
||||
@@ -798,6 +841,7 @@ namespace ggml_cuda_mma {
|
||||
acc[0],
|
||||
0, 0, 0);
|
||||
#endif // defined(CDNA3)
|
||||
|
||||
#else
|
||||
GGML_UNUSED_VARS(D, A, B);
|
||||
NO_DEVICE_CODE;
|
||||
@@ -842,4 +886,31 @@ namespace ggml_cuda_mma {
|
||||
mma(D16[1], A16[1], B);
|
||||
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ void mma(
|
||||
tile<16, 16, int> & D, const tile<16, 4, int> & A, const tile<16, 4, int> & B) {
|
||||
#if defined(AMD_WMMA_AVAILABLE)
|
||||
using int32x2_t = __attribute__((__vector_size__(2 * sizeof(int)))) int;
|
||||
int32x2_t * a_vec = (int32x2_t *) A.x;
|
||||
int32x2_t * b_vec = (int32x2_t *) B.x;
|
||||
|
||||
using int32x8_t = __attribute__((__vector_size__(8 * sizeof(int)))) int;
|
||||
int32x8_t * acc = (int32x8_t *) D.x;
|
||||
|
||||
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(
|
||||
true,
|
||||
a_vec[0],
|
||||
true,
|
||||
b_vec[0],
|
||||
acc[0],
|
||||
false
|
||||
);
|
||||
#else
|
||||
GGML_UNUSED(D);
|
||||
GGML_UNUSED(A);
|
||||
GGML_UNUSED(B);
|
||||
NO_DEVICE_CODE;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -306,5 +306,11 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return (!GGML_CUDA_CC_IS_RDNA4(cc) && !GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
|
||||
if (amd_wmma_available(cc)) {
|
||||
if (GGML_CUDA_CC_IS_RDNA4(cc)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return (!GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1629,6 +1629,22 @@ class vk_perf_logger {
|
||||
timings[name].push_back(time);
|
||||
return;
|
||||
}
|
||||
if (node->op == GGML_OP_FLASH_ATTN_EXT) {
|
||||
const ggml_tensor * dst = node;
|
||||
const ggml_tensor * q = node->src[0];
|
||||
const ggml_tensor * k = node->src[1];
|
||||
const ggml_tensor * v = node->src[2];
|
||||
const ggml_tensor * m = node->src[3];
|
||||
std::stringstream name;
|
||||
name << ggml_op_name(node->op) <<
|
||||
" dst(" << dst->ne[0] << "," << dst->ne[1] << "," << dst->ne[2] << "," << dst->ne[3] << "), " <<
|
||||
" q(" << q->ne[0] << "," << q->ne[1] << "," << q->ne[2] << "," << q->ne[3] << "), " <<
|
||||
" k(" << k->ne[0] << "," << k->ne[1] << "," << k->ne[2] << "," << k->ne[3] << "), " <<
|
||||
" v(" << v->ne[0] << "," << v->ne[1] << "," << v->ne[2] << "," << v->ne[3] << "), " <<
|
||||
" m(" << (m?m->ne[0]:0) << "," << (m?m->ne[1]:0) << "," << (m?m->ne[2]:0) << "," << (m?m->ne[3]:0) << ")";
|
||||
timings[name.str()].push_back(time);
|
||||
return;
|
||||
}
|
||||
timings[ggml_op_name(node->op)].push_back(time);
|
||||
}
|
||||
private:
|
||||
@@ -2485,9 +2501,11 @@ static void ggml_vk_wait_events(vk_context& ctx, std::vector<vk::Event>&& events
|
||||
static constexpr uint32_t flash_attention_num_small_rows = 32;
|
||||
static constexpr uint32_t scalar_flash_attention_num_small_rows = 1;
|
||||
|
||||
static uint32_t get_fa_scalar_num_large_rows(uint32_t hsv) {
|
||||
static uint32_t get_fa_scalar_num_large_rows(uint32_t hsk, uint32_t hsv) {
|
||||
if (hsv >= 192) {
|
||||
return 2;
|
||||
} else if ((hsv | hsk) & 8) {
|
||||
return 4;
|
||||
} else {
|
||||
return 8;
|
||||
}
|
||||
@@ -2519,9 +2537,9 @@ static std::array<uint32_t, 2> fa_rows_cols(FaCodePath path, uint32_t hsk, uint3
|
||||
if ((hsv | hsk) & 8) {
|
||||
// HSV/HSK not being a multiple of 16 makes D_split smaller, which makes cols_per_iter
|
||||
// larger, and Bc needs to be >= cols_per_thread. 64 is large enough, 32 is not.
|
||||
return {get_fa_scalar_num_large_rows(hsv), 64};
|
||||
return {get_fa_scalar_num_large_rows(hsk, hsv), 64};
|
||||
} else {
|
||||
return {get_fa_scalar_num_large_rows(hsv), 32};
|
||||
return {get_fa_scalar_num_large_rows(hsk, hsv), 32};
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -7724,7 +7742,7 @@ static bool ggml_vk_flash_attn_scalar_shmem_support(const vk_device& device, con
|
||||
// Needs to be kept up to date on shader changes
|
||||
GGML_UNUSED(hsv);
|
||||
const uint32_t wg_size = scalar_flash_attention_workgroup_size;
|
||||
const uint32_t Br = get_fa_scalar_num_large_rows(hsv);
|
||||
const uint32_t Br = get_fa_scalar_num_large_rows(hsk, hsv);
|
||||
const uint32_t Bc = scalar_flash_attention_Bc;
|
||||
|
||||
const uint32_t tmpsh = wg_size * sizeof(float);
|
||||
@@ -7855,7 +7873,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
|
||||
case FA_SCALAR:
|
||||
case FA_COOPMAT1:
|
||||
// We may switch from coopmat1 to scalar, so use the scalar limit for both
|
||||
max_gqa = get_fa_scalar_num_large_rows(HSV);
|
||||
max_gqa = get_fa_scalar_num_large_rows(HSK, HSV);
|
||||
break;
|
||||
case FA_COOPMAT2:
|
||||
max_gqa = get_fa_num_small_rows(FA_COOPMAT2);
|
||||
|
||||
@@ -25,6 +25,20 @@ class Keys:
|
||||
ALIGNMENT = "general.alignment"
|
||||
FILE_TYPE = "general.file_type"
|
||||
|
||||
# Recommended Sampler Parameters
|
||||
SAMPLING_SEQUENCE = "general.sampling.sequence"
|
||||
SAMPLING_TOP_K = "general.sampling.top_k"
|
||||
SAMPLING_TOP_P = "general.sampling.top_p"
|
||||
SAMPLING_MIN_P = "general.sampling.min_p"
|
||||
SAMPLING_XTC_PROBABILITY = "general.sampling.xtc_probability"
|
||||
SAMPLING_XTC_THRESHOLD = "general.sampling.xtc_threshold"
|
||||
SAMPLING_TEMP = "general.sampling.temp"
|
||||
SAMPLING_PENALTY_LAST_N = "general.sampling.penalty_last_n"
|
||||
SAMPLING_PENALTY_REPEAT = "general.sampling.penalty_repeat"
|
||||
SAMPLING_MIROSTAT = "general.sampling.mirostat"
|
||||
SAMPLING_MIROSTAT_TAU = "general.sampling.mirostat_tau"
|
||||
SAMPLING_MIROSTAT_ETA = "general.sampling.mirostat_eta"
|
||||
|
||||
# Authorship Metadata
|
||||
NAME = "general.name"
|
||||
AUTHOR = "general.author"
|
||||
|
||||
@@ -496,6 +496,42 @@ class GGUFWriter:
|
||||
def add_file_type(self, ftype: int) -> None:
|
||||
self.add_uint32(Keys.General.FILE_TYPE, ftype)
|
||||
|
||||
def add_sampling_sequence(self, sequence: str) -> None:
|
||||
self.add_string(Keys.General.SAMPLING_SEQUENCE, sequence)
|
||||
|
||||
def add_sampling_top_k(self, top_k: int) -> None:
|
||||
self.add_int32(Keys.General.SAMPLING_TOP_K, top_k)
|
||||
|
||||
def add_sampling_top_p(self, top_p: float) -> None:
|
||||
self.add_float32(Keys.General.SAMPLING_TOP_P, top_p)
|
||||
|
||||
def add_sampling_min_p(self, min_p: float) -> None:
|
||||
self.add_float32(Keys.General.SAMPLING_MIN_P, min_p)
|
||||
|
||||
def add_sampling_xtc_probability(self, xtc_probability: float) -> None:
|
||||
self.add_float32(Keys.General.SAMPLING_XTC_PROBABILITY, xtc_probability)
|
||||
|
||||
def add_sampling_xtc_threshold(self, xtc_threshold: float) -> None:
|
||||
self.add_float32(Keys.General.SAMPLING_XTC_THRESHOLD, xtc_threshold)
|
||||
|
||||
def add_sampling_temp(self, temp: float) -> None:
|
||||
self.add_float32(Keys.General.SAMPLING_TEMP, temp)
|
||||
|
||||
def add_sampling_penalty_last_n(self, penalty_last_n: int) -> None:
|
||||
self.add_int32(Keys.General.SAMPLING_PENALTY_LAST_N, penalty_last_n)
|
||||
|
||||
def add_sampling_penalty_repeat(self, penalty_repeat: float) -> None:
|
||||
self.add_float32(Keys.General.SAMPLING_PENALTY_REPEAT, penalty_repeat)
|
||||
|
||||
def add_sampling_mirostat(self, mirostat: int) -> None:
|
||||
self.add_int32(Keys.General.SAMPLING_MIROSTAT, mirostat)
|
||||
|
||||
def add_sampling_mirostat_tau(self, mirostat_tau: float) -> None:
|
||||
self.add_float32(Keys.General.SAMPLING_MIROSTAT_TAU, mirostat_tau)
|
||||
|
||||
def add_sampling_mirostat_eta(self, mirostat_eta: float) -> None:
|
||||
self.add_float32(Keys.General.SAMPLING_MIROSTAT_ETA, mirostat_eta)
|
||||
|
||||
def add_name(self, name: str) -> None:
|
||||
self.add_string(Keys.General.NAME, name)
|
||||
|
||||
|
||||
@@ -17,6 +17,20 @@ logger = logging.getLogger("metadata")
|
||||
|
||||
@dataclass
|
||||
class Metadata:
|
||||
# Recommended Sampler Parameters to be written to GGUF KV Store
|
||||
sampling_sequence: Optional[str] = None
|
||||
sampling_top_k: Optional[int] = None
|
||||
sampling_top_p: Optional[float] = None
|
||||
sampling_min_p: Optional[float] = None
|
||||
sampling_xtc_probability: Optional[float] = None
|
||||
sampling_xtc_threshold: Optional[float] = None
|
||||
sampling_temp: Optional[float] = None
|
||||
sampling_penalty_last_n: Optional[int] = None
|
||||
sampling_penalty_repeat: Optional[float] = None
|
||||
sampling_mirostat: Optional[int] = None
|
||||
sampling_mirostat_tau: Optional[float] = None
|
||||
sampling_mirostat_eta: Optional[float] = None
|
||||
|
||||
# Authorship Metadata to be written to GGUF KV Store
|
||||
name: Optional[str] = None
|
||||
author: Optional[str] = None
|
||||
@@ -54,15 +68,43 @@ class Metadata:
|
||||
|
||||
model_card = Metadata.load_model_card(model_path)
|
||||
hf_params = Metadata.load_hf_parameters(model_path)
|
||||
gen_config = Metadata.load_generation_config(model_path)
|
||||
# TODO: load adapter_config.json when possible, it usually contains the base model of the LoRA adapter
|
||||
|
||||
# heuristics
|
||||
metadata = Metadata.apply_metadata_heuristic(metadata, model_card, hf_params, model_path, total_params)
|
||||
|
||||
if gen_config:
|
||||
metadata.sampling_sequence = gen_config.get("sequence", metadata.sampling_sequence)
|
||||
metadata.sampling_top_k = gen_config.get("top_k", metadata.sampling_top_k)
|
||||
metadata.sampling_top_p = gen_config.get("top_p", metadata.sampling_top_p)
|
||||
metadata.sampling_min_p = gen_config.get("min_p", metadata.sampling_min_p)
|
||||
metadata.sampling_xtc_probability = gen_config.get("xtc_probability", metadata.sampling_xtc_probability)
|
||||
metadata.sampling_xtc_threshold = gen_config.get("xtc_threshold", metadata.sampling_xtc_threshold)
|
||||
metadata.sampling_temp = gen_config.get("temperature", metadata.sampling_temp)
|
||||
metadata.sampling_penalty_last_n = gen_config.get("penalty_last_n", metadata.sampling_penalty_last_n)
|
||||
metadata.sampling_penalty_repeat = gen_config.get("penalty_repeat", metadata.sampling_penalty_repeat)
|
||||
metadata.sampling_mirostat = gen_config.get("mirostat", metadata.sampling_mirostat)
|
||||
metadata.sampling_mirostat_tau = gen_config.get("mirostat_tau", metadata.sampling_mirostat_tau)
|
||||
metadata.sampling_mirostat_eta = gen_config.get("mirostat_eta", metadata.sampling_mirostat_eta)
|
||||
|
||||
# Metadata Override File Provided
|
||||
# This is based on LLM_KV_NAMES mapping in llama.cpp
|
||||
metadata_override = Metadata.load_metadata_override(metadata_override_path)
|
||||
|
||||
metadata.sampling_sequence = metadata_override.get(Keys.General.SAMPLING_SEQUENCE, metadata.sampling_sequence)
|
||||
metadata.sampling_top_k = metadata_override.get(Keys.General.SAMPLING_TOP_K, metadata.sampling_top_k)
|
||||
metadata.sampling_top_p = metadata_override.get(Keys.General.SAMPLING_TOP_P, metadata.sampling_top_p)
|
||||
metadata.sampling_min_p = metadata_override.get(Keys.General.SAMPLING_MIN_P, metadata.sampling_min_p)
|
||||
metadata.sampling_xtc_probability = metadata_override.get(Keys.General.SAMPLING_XTC_PROBABILITY, metadata.sampling_xtc_probability)
|
||||
metadata.sampling_xtc_threshold = metadata_override.get(Keys.General.SAMPLING_XTC_THRESHOLD, metadata.sampling_xtc_threshold)
|
||||
metadata.sampling_temp = metadata_override.get(Keys.General.SAMPLING_TEMP, metadata.sampling_temp)
|
||||
metadata.sampling_penalty_last_n = metadata_override.get(Keys.General.SAMPLING_PENALTY_LAST_N, metadata.sampling_penalty_last_n)
|
||||
metadata.sampling_penalty_repeat = metadata_override.get(Keys.General.SAMPLING_PENALTY_REPEAT, metadata.sampling_penalty_repeat)
|
||||
metadata.sampling_mirostat = metadata_override.get(Keys.General.SAMPLING_MIROSTAT, metadata.sampling_mirostat)
|
||||
metadata.sampling_mirostat_tau = metadata_override.get(Keys.General.SAMPLING_MIROSTAT_TAU, metadata.sampling_mirostat_tau)
|
||||
metadata.sampling_mirostat_eta = metadata_override.get(Keys.General.SAMPLING_MIROSTAT_ETA, metadata.sampling_mirostat_eta)
|
||||
|
||||
metadata.name = metadata_override.get(Keys.General.NAME, metadata.name)
|
||||
metadata.author = metadata_override.get(Keys.General.AUTHOR, metadata.author)
|
||||
metadata.version = metadata_override.get(Keys.General.VERSION, metadata.version)
|
||||
@@ -172,6 +214,23 @@ class Metadata:
|
||||
with open(config_path, "r", encoding="utf-8") as f:
|
||||
return json.load(f)
|
||||
|
||||
@staticmethod
|
||||
def load_generation_config(model_path: Optional[Path] = None) -> dict[str, Any]:
|
||||
if model_path is None or not model_path.is_dir():
|
||||
return {}
|
||||
|
||||
generation_config_path = model_path / "generation_config.json"
|
||||
|
||||
if not generation_config_path.is_file():
|
||||
return {}
|
||||
|
||||
try:
|
||||
with open(generation_config_path, "r", encoding="utf-8") as f:
|
||||
return json.load(f)
|
||||
except (json.JSONDecodeError, IOError):
|
||||
# not all models have valid generation_config.json
|
||||
return {}
|
||||
|
||||
@staticmethod
|
||||
def id_to_title(string):
|
||||
# Convert capitalization into title form unless acronym or version number
|
||||
@@ -546,6 +605,32 @@ class Metadata:
|
||||
|
||||
def set_gguf_meta_model(self, gguf_writer: gguf.GGUFWriter):
|
||||
assert self.name is not None
|
||||
|
||||
if self.sampling_sequence is not None:
|
||||
gguf_writer.add_sampling_sequence(self.sampling_sequence)
|
||||
if self.sampling_top_k is not None:
|
||||
gguf_writer.add_sampling_top_k(self.sampling_top_k)
|
||||
if self.sampling_top_p is not None:
|
||||
gguf_writer.add_sampling_top_p(self.sampling_top_p)
|
||||
if self.sampling_min_p is not None:
|
||||
gguf_writer.add_sampling_min_p(self.sampling_min_p)
|
||||
if self.sampling_xtc_probability is not None:
|
||||
gguf_writer.add_sampling_xtc_probability(self.sampling_xtc_probability)
|
||||
if self.sampling_xtc_threshold is not None:
|
||||
gguf_writer.add_sampling_xtc_threshold(self.sampling_xtc_threshold)
|
||||
if self.sampling_temp is not None:
|
||||
gguf_writer.add_sampling_temp(self.sampling_temp)
|
||||
if self.sampling_penalty_last_n is not None:
|
||||
gguf_writer.add_sampling_penalty_last_n(self.sampling_penalty_last_n)
|
||||
if self.sampling_penalty_repeat is not None:
|
||||
gguf_writer.add_sampling_penalty_repeat(self.sampling_penalty_repeat)
|
||||
if self.sampling_mirostat is not None:
|
||||
gguf_writer.add_sampling_mirostat(self.sampling_mirostat)
|
||||
if self.sampling_mirostat_tau is not None:
|
||||
gguf_writer.add_sampling_mirostat_tau(self.sampling_mirostat_tau)
|
||||
if self.sampling_mirostat_eta is not None:
|
||||
gguf_writer.add_sampling_mirostat_eta(self.sampling_mirostat_eta)
|
||||
|
||||
gguf_writer.add_name(self.name)
|
||||
|
||||
if self.author is not None:
|
||||
|
||||
@@ -246,6 +246,21 @@ extern "C" {
|
||||
LLAMA_KV_OVERRIDE_TYPE_STR,
|
||||
};
|
||||
|
||||
enum llama_model_meta_key {
|
||||
LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE,
|
||||
LLAMA_MODEL_META_KEY_SAMPLING_TOP_K,
|
||||
LLAMA_MODEL_META_KEY_SAMPLING_TOP_P,
|
||||
LLAMA_MODEL_META_KEY_SAMPLING_MIN_P,
|
||||
LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY,
|
||||
LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD,
|
||||
LLAMA_MODEL_META_KEY_SAMPLING_TEMP,
|
||||
LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N,
|
||||
LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT,
|
||||
LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT,
|
||||
LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU,
|
||||
LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA,
|
||||
};
|
||||
|
||||
struct llama_model_kv_override {
|
||||
enum llama_model_kv_override_type tag;
|
||||
|
||||
@@ -518,6 +533,9 @@ extern "C" {
|
||||
// Get the number of metadata key/value pairs
|
||||
LLAMA_API int32_t llama_model_meta_count(const struct llama_model * model);
|
||||
|
||||
// Get sampling metadata key name. Returns nullptr if the key is invalid
|
||||
LLAMA_API const char * llama_model_meta_key_str(enum llama_model_meta_key key);
|
||||
|
||||
// Get metadata key name by index
|
||||
LLAMA_API int32_t llama_model_meta_key_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size);
|
||||
|
||||
|
||||
@@ -114,19 +114,31 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
};
|
||||
|
||||
static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_GENERAL_TYPE, "general.type" },
|
||||
{ LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
|
||||
{ LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
|
||||
{ LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
|
||||
{ LLM_KV_GENERAL_FILE_TYPE, "general.file_type" },
|
||||
{ LLM_KV_GENERAL_NAME, "general.name" },
|
||||
{ LLM_KV_GENERAL_AUTHOR, "general.author" },
|
||||
{ LLM_KV_GENERAL_VERSION, "general.version" },
|
||||
{ LLM_KV_GENERAL_URL, "general.url" },
|
||||
{ LLM_KV_GENERAL_DESCRIPTION, "general.description" },
|
||||
{ LLM_KV_GENERAL_LICENSE, "general.license" },
|
||||
{ LLM_KV_GENERAL_SOURCE_URL, "general.source.url" },
|
||||
{ LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" },
|
||||
{ LLM_KV_GENERAL_TYPE, "general.type" },
|
||||
{ LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
|
||||
{ LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
|
||||
{ LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
|
||||
{ LLM_KV_GENERAL_FILE_TYPE, "general.file_type" },
|
||||
{ LLM_KV_GENERAL_SAMPLING_SEQUENCE, "general.sampling.sequence" },
|
||||
{ LLM_KV_GENERAL_SAMPLING_TOP_K, "general.sampling.top_k" },
|
||||
{ LLM_KV_GENERAL_SAMPLING_TOP_P, "general.sampling.top_p" },
|
||||
{ LLM_KV_GENERAL_SAMPLING_MIN_P, "general.sampling.min_p" },
|
||||
{ LLM_KV_GENERAL_SAMPLING_XTC_PROBABILITY, "general.sampling.xtc_probability" },
|
||||
{ LLM_KV_GENERAL_SAMPLING_XTC_THRESHOLD, "general.sampling.xtc_threshold" },
|
||||
{ LLM_KV_GENERAL_SAMPLING_TEMP, "general.sampling.temp" },
|
||||
{ LLM_KV_GENERAL_SAMPLING_PENALTY_LAST_N, "general.sampling.penalty_last_n" },
|
||||
{ LLM_KV_GENERAL_SAMPLING_PENALTY_REPEAT, "general.sampling.penalty_repeat" },
|
||||
{ LLM_KV_GENERAL_SAMPLING_MIROSTAT, "general.sampling.mirostat" },
|
||||
{ LLM_KV_GENERAL_SAMPLING_MIROSTAT_TAU, "general.sampling.mirostat_tau" },
|
||||
{ LLM_KV_GENERAL_SAMPLING_MIROSTAT_ETA, "general.sampling.mirostat_eta" },
|
||||
{ LLM_KV_GENERAL_NAME, "general.name" },
|
||||
{ LLM_KV_GENERAL_AUTHOR, "general.author" },
|
||||
{ LLM_KV_GENERAL_VERSION, "general.version" },
|
||||
{ LLM_KV_GENERAL_URL, "general.url" },
|
||||
{ LLM_KV_GENERAL_DESCRIPTION, "general.description" },
|
||||
{ LLM_KV_GENERAL_LICENSE, "general.license" },
|
||||
{ LLM_KV_GENERAL_SOURCE_URL, "general.source.url" },
|
||||
{ LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" },
|
||||
|
||||
{ LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
|
||||
{ LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
|
||||
|
||||
@@ -123,6 +123,18 @@ enum llm_kv {
|
||||
LLM_KV_GENERAL_QUANTIZATION_VERSION,
|
||||
LLM_KV_GENERAL_ALIGNMENT,
|
||||
LLM_KV_GENERAL_FILE_TYPE,
|
||||
LLM_KV_GENERAL_SAMPLING_SEQUENCE,
|
||||
LLM_KV_GENERAL_SAMPLING_TOP_K,
|
||||
LLM_KV_GENERAL_SAMPLING_TOP_P,
|
||||
LLM_KV_GENERAL_SAMPLING_MIN_P,
|
||||
LLM_KV_GENERAL_SAMPLING_XTC_PROBABILITY,
|
||||
LLM_KV_GENERAL_SAMPLING_XTC_THRESHOLD,
|
||||
LLM_KV_GENERAL_SAMPLING_TEMP,
|
||||
LLM_KV_GENERAL_SAMPLING_PENALTY_LAST_N,
|
||||
LLM_KV_GENERAL_SAMPLING_PENALTY_REPEAT,
|
||||
LLM_KV_GENERAL_SAMPLING_MIROSTAT,
|
||||
LLM_KV_GENERAL_SAMPLING_MIROSTAT_TAU,
|
||||
LLM_KV_GENERAL_SAMPLING_MIROSTAT_ETA,
|
||||
LLM_KV_GENERAL_NAME,
|
||||
LLM_KV_GENERAL_AUTHOR,
|
||||
LLM_KV_GENERAL_VERSION,
|
||||
|
||||
@@ -1248,7 +1248,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
|
||||
|
||||
// make the outputs have the same order they had in the user-provided batch
|
||||
// note: this is mostly relevant for recurrent models atm
|
||||
if (!sorted_output) {
|
||||
if (!sorted_output && n_outputs > 1) {
|
||||
GGML_ASSERT((size_t) n_outputs == out_ids.size());
|
||||
|
||||
// TODO: is there something more efficient which also minimizes swaps?
|
||||
|
||||
@@ -7687,6 +7687,24 @@ int32_t llama_model_meta_count(const llama_model * model) {
|
||||
return (int)model->gguf_kv.size();
|
||||
}
|
||||
|
||||
const char * llama_model_meta_key_str(llama_model_meta_key key) {
|
||||
switch (key) {
|
||||
case LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE: return "general.sampling.sequence";
|
||||
case LLAMA_MODEL_META_KEY_SAMPLING_TOP_K: return "general.sampling.top_k";
|
||||
case LLAMA_MODEL_META_KEY_SAMPLING_TOP_P: return "general.sampling.top_p";
|
||||
case LLAMA_MODEL_META_KEY_SAMPLING_MIN_P: return "general.sampling.min_p";
|
||||
case LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY: return "general.sampling.xtc_probability";
|
||||
case LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD: return "general.sampling.xtc_threshold";
|
||||
case LLAMA_MODEL_META_KEY_SAMPLING_TEMP: return "general.sampling.temp";
|
||||
case LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N: return "general.sampling.penalty_last_n";
|
||||
case LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT: return "general.sampling.penalty_repeat";
|
||||
case LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT: return "general.sampling.mirostat";
|
||||
case LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU: return "general.sampling.mirostat_tau";
|
||||
case LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA: return "general.sampling.mirostat_eta";
|
||||
default: return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
int32_t llama_model_meta_key_by_index(const llama_model * model, int i, char * buf, size_t buf_size) {
|
||||
if (i < 0 || i >= (int)model->gguf_kv.size()) {
|
||||
if (buf_size > 0) {
|
||||
|
||||
@@ -7859,6 +7859,9 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
|
||||
}
|
||||
}
|
||||
|
||||
// Qwen3-VL-8B https://github.com/ggml-org/llama.cpp/issues/17012
|
||||
test_cases.emplace_back(new test_flash_attn_ext(72, 72, 16, {1, 1}, 5776, 5776, false, false, 0, 0, GGML_PREC_F32, GGML_TYPE_F16));
|
||||
|
||||
for (int kv : { 4096, 8192, 16384, }) {
|
||||
for (int hs : { 64, 128, }) {
|
||||
for (int nr : { 1, 4, }) {
|
||||
|
||||
Binary file not shown.
@@ -8,6 +8,7 @@
|
||||
import rehypeKatex from 'rehype-katex';
|
||||
import rehypeStringify from 'rehype-stringify';
|
||||
import { copyCodeToClipboard } from '$lib/utils/copy';
|
||||
import { rehypeRestoreTableHtml } from '$lib/markdown/table-html-restorer';
|
||||
import { preprocessLaTeX } from '$lib/utils/latex-protection';
|
||||
import { browser } from '$app/environment';
|
||||
import '$styles/katex-custom.scss';
|
||||
@@ -60,6 +61,7 @@
|
||||
.use(remarkRehype) // Convert Markdown AST to rehype
|
||||
.use(rehypeKatex) // Render math using KaTeX
|
||||
.use(rehypeHighlight) // Add syntax highlighting
|
||||
.use(rehypeRestoreTableHtml) // Restore limited HTML (e.g., <br>, <ul>) inside Markdown tables
|
||||
.use(rehypeStringify); // Convert to HTML string
|
||||
});
|
||||
|
||||
|
||||
20
tools/server/webui/src/lib/constants/table-html-restorer.ts
Normal file
20
tools/server/webui/src/lib/constants/table-html-restorer.ts
Normal file
@@ -0,0 +1,20 @@
|
||||
/**
|
||||
* Matches <br>, <br/>, <br /> tags (case-insensitive).
|
||||
* Used to detect line breaks in table cell text content.
|
||||
*/
|
||||
export const BR_PATTERN = /<br\s*\/?\s*>/gi;
|
||||
|
||||
/**
|
||||
* Matches a complete <ul>...</ul> block.
|
||||
* Captures the inner content (group 1) for further <li> extraction.
|
||||
* Case-insensitive, allows multiline content.
|
||||
*/
|
||||
export const LIST_PATTERN = /^<ul>([\s\S]*)<\/ul>$/i;
|
||||
|
||||
/**
|
||||
* Matches individual <li>...</li> elements within a list.
|
||||
* Captures the inner content (group 1) of each list item.
|
||||
* Non-greedy to handle multiple consecutive items.
|
||||
* Case-insensitive, allows multiline content.
|
||||
*/
|
||||
export const LI_PATTERN = /<li>([\s\S]*?)<\/li>/gi;
|
||||
181
tools/server/webui/src/lib/markdown/table-html-restorer.ts
Normal file
181
tools/server/webui/src/lib/markdown/table-html-restorer.ts
Normal file
@@ -0,0 +1,181 @@
|
||||
/**
|
||||
* Rehype plugin to restore limited HTML elements inside Markdown table cells.
|
||||
*
|
||||
* ## Problem
|
||||
* The remark/rehype pipeline neutralizes inline HTML as literal text
|
||||
* (remarkLiteralHtml) so that XML/HTML snippets in LLM responses display
|
||||
* as-is instead of being rendered. This causes <br> and <ul> markup in
|
||||
* table cells to show as plain text.
|
||||
*
|
||||
* ## Solution
|
||||
* This plugin traverses the HAST post-conversion, parses whitelisted HTML
|
||||
* patterns from text nodes, and replaces them with actual HAST element nodes
|
||||
* that will be rendered as real HTML.
|
||||
*
|
||||
* ## Supported HTML
|
||||
* - `<br>` / `<br/>` / `<br />` - Line breaks (inline)
|
||||
* - `<ul><li>...</li></ul>` - Unordered lists (block)
|
||||
*
|
||||
* ## Key Implementation Details
|
||||
*
|
||||
* ### 1. Sibling Combination (Critical)
|
||||
* The Markdown pipeline may fragment content across multiple text nodes and `<br>`
|
||||
* elements. For example, `<ul><li>a</li></ul>` might arrive as:
|
||||
* - Text: `"<ul>"`
|
||||
* - Element: `<br>`
|
||||
* - Text: `"<li>a</li></ul>"`
|
||||
*
|
||||
* We must combine consecutive text nodes and `<br>` elements into a single string
|
||||
* before attempting to parse list markup. Without this, list detection fails.
|
||||
*
|
||||
* ### 2. visitParents for Deep Traversal
|
||||
* Table cell content may be wrapped in intermediate elements (e.g., `<p>` tags).
|
||||
* Using `visitParents` instead of direct child iteration ensures we find text
|
||||
* nodes at any depth within the cell.
|
||||
*
|
||||
* ### 3. Reference Comparison for No-Op Detection
|
||||
* When checking if `<br>` expansion changed anything, we compare:
|
||||
* `expanded.length !== 1 || expanded[0] !== textNode`
|
||||
*
|
||||
* This catches both cases:
|
||||
* - Multiple nodes created (text was split)
|
||||
* - Single NEW node created (original had only `<br>`, now it's an element)
|
||||
*
|
||||
* A simple `length > 1` check would miss the single `<br>` case.
|
||||
*
|
||||
* ### 4. Strict List Validation
|
||||
* `parseList()` rejects malformed markup by checking for garbage text between
|
||||
* `<li>` elements. This prevents creating broken DOM from partial matches like
|
||||
* `<ul>garbage<li>a</li></ul>`.
|
||||
*
|
||||
* ### 5. Newline Substitution for `<br>` in Combined String
|
||||
* When combining siblings, existing `<br>` elements become `\n` in the combined
|
||||
* string. This allows list content to span visual lines while still being parsed
|
||||
* as a single unit.
|
||||
*
|
||||
* @example
|
||||
* // Input Markdown:
|
||||
* // | Feature | Notes |
|
||||
* // |---------|-------|
|
||||
* // | Multi-line | First<br>Second |
|
||||
* // | List | <ul><li>A</li><li>B</li></ul> |
|
||||
* //
|
||||
* // Without this plugin: <br> and <ul> render as literal text
|
||||
* // With this plugin: <br> becomes line break, <ul> becomes actual list
|
||||
*/
|
||||
|
||||
import type { Plugin } from 'unified';
|
||||
import type { Element, ElementContent, Root, Text } from 'hast';
|
||||
import { visit } from 'unist-util-visit';
|
||||
import { visitParents } from 'unist-util-visit-parents';
|
||||
import { BR_PATTERN, LIST_PATTERN, LI_PATTERN } from '$lib/constants/table-html-restorer';
|
||||
|
||||
/**
|
||||
* Expands text containing `<br>` tags into an array of text nodes and br elements.
|
||||
*/
|
||||
function expandBrTags(value: string): ElementContent[] {
|
||||
const matches = [...value.matchAll(BR_PATTERN)];
|
||||
if (!matches.length) return [{ type: 'text', value } as Text];
|
||||
|
||||
const result: ElementContent[] = [];
|
||||
let cursor = 0;
|
||||
|
||||
for (const m of matches) {
|
||||
if (m.index! > cursor) {
|
||||
result.push({ type: 'text', value: value.slice(cursor, m.index) } as Text);
|
||||
}
|
||||
result.push({ type: 'element', tagName: 'br', properties: {}, children: [] } as Element);
|
||||
cursor = m.index! + m[0].length;
|
||||
}
|
||||
|
||||
if (cursor < value.length) {
|
||||
result.push({ type: 'text', value: value.slice(cursor) } as Text);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses a `<ul><li>...</li></ul>` string into a HAST element.
|
||||
* Returns null if the markup is malformed or contains unexpected content.
|
||||
*/
|
||||
function parseList(value: string): Element | null {
|
||||
const match = value.trim().match(LIST_PATTERN);
|
||||
if (!match) return null;
|
||||
|
||||
const body = match[1];
|
||||
const items: ElementContent[] = [];
|
||||
let cursor = 0;
|
||||
|
||||
for (const liMatch of body.matchAll(LI_PATTERN)) {
|
||||
// Reject if there's non-whitespace between list items
|
||||
if (body.slice(cursor, liMatch.index!).trim()) return null;
|
||||
|
||||
items.push({
|
||||
type: 'element',
|
||||
tagName: 'li',
|
||||
properties: {},
|
||||
children: expandBrTags(liMatch[1] ?? '')
|
||||
} as Element);
|
||||
|
||||
cursor = liMatch.index! + liMatch[0].length;
|
||||
}
|
||||
|
||||
// Reject if no items found or trailing garbage exists
|
||||
if (!items.length || body.slice(cursor).trim()) return null;
|
||||
|
||||
return { type: 'element', tagName: 'ul', properties: {}, children: items } as Element;
|
||||
}
|
||||
|
||||
/**
|
||||
* Processes a single table cell, restoring HTML elements from text content.
|
||||
*/
|
||||
function processCell(cell: Element) {
|
||||
visitParents(cell, 'text', (textNode: Text, ancestors) => {
|
||||
const parent = ancestors[ancestors.length - 1];
|
||||
if (!parent || parent.type !== 'element') return;
|
||||
|
||||
const parentEl = parent as Element;
|
||||
const siblings = parentEl.children as ElementContent[];
|
||||
const startIndex = siblings.indexOf(textNode as ElementContent);
|
||||
if (startIndex === -1) return;
|
||||
|
||||
// Combine consecutive text nodes and <br> elements into one string
|
||||
let combined = '';
|
||||
let endIndex = startIndex;
|
||||
|
||||
for (let i = startIndex; i < siblings.length; i++) {
|
||||
const sib = siblings[i];
|
||||
if (sib.type === 'text') {
|
||||
combined += (sib as Text).value;
|
||||
endIndex = i;
|
||||
} else if (sib.type === 'element' && (sib as Element).tagName === 'br') {
|
||||
combined += '\n';
|
||||
endIndex = i;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Try parsing as list first (replaces entire combined range)
|
||||
const list = parseList(combined);
|
||||
if (list) {
|
||||
siblings.splice(startIndex, endIndex - startIndex + 1, list);
|
||||
return;
|
||||
}
|
||||
|
||||
// Otherwise, just expand <br> tags in this text node
|
||||
const expanded = expandBrTags(textNode.value);
|
||||
if (expanded.length !== 1 || expanded[0] !== textNode) {
|
||||
siblings.splice(startIndex, 1, ...expanded);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
export const rehypeRestoreTableHtml: Plugin<[], Root> = () => (tree) => {
|
||||
visit(tree, 'element', (node: Element) => {
|
||||
if (node.tagName === 'td' || node.tagName === 'th') {
|
||||
processCell(node);
|
||||
}
|
||||
});
|
||||
};
|
||||
Reference in New Issue
Block a user