Compare commits

...

13 Commits

Author SHA1 Message Date
Ruben Ortlam
46e824a9f0 use text file instead of json 2026-03-02 13:56:14 +01:00
Ruben Ortlam
fa3e83dbac fix inplace error 2026-02-28 11:42:04 +01:00
Ruben Ortlam
4980707be0 add name to tests 2026-02-25 15:56:40 +01:00
Ruben Ortlam
bac39a1671 fix nb[0] not getting set for view 2026-02-25 15:56:40 +01:00
Ruben Ortlam
7f6deaaf71 reduce test description length 2026-02-25 15:56:40 +01:00
Ruben Ortlam
9c22db0167 replace internal API calls with public llama_graph_reserve call 2026-02-25 15:56:40 +01:00
Ruben Ortlam
2720d4693f only use view if non-contiguous/permuted, use C++ random instead of rand() 2026-02-25 15:56:40 +01:00
Ruben Ortlam
cb39bdf3ce fix view check 2026-02-25 15:56:40 +01:00
Ruben Ortlam
2a8960344d add nb parameter for non-contiguous input tensors 2026-02-25 15:56:40 +01:00
Ruben Ortlam
850df0c0a4 add graph operator json extraction tool 2026-02-25 15:56:40 +01:00
Ruben Ortlam
f562037b4f add error when file cannot be read 2026-02-25 15:56:40 +01:00
Ruben Ortlam
03ba1ff09d add error threshold based on op 2026-02-25 15:56:40 +01:00
Ruben Ortlam
b10775a35e tests: allow loading test-backend-ops tests from json 2026-02-25 15:56:40 +01:00
8 changed files with 528 additions and 13 deletions

View File

@@ -2642,7 +2642,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.out_file = value;
}
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE}));
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA,
LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_EXPORT_GRAPH_OPS}));
add_opt(common_arg(
{"-ofreq", "--output-frequency"}, "N",
string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),

View File

@@ -104,6 +104,7 @@ enum llama_example {
LLAMA_EXAMPLE_DIFFUSION,
LLAMA_EXAMPLE_FINETUNE,
LLAMA_EXAMPLE_FIT_PARAMS,
LLAMA_EXAMPLE_EXPORT_GRAPH_OPS,
LLAMA_EXAMPLE_COUNT,
};

View File

@@ -617,6 +617,13 @@ extern "C" {
const char * fname_out,
const llama_model_quantize_params * params);
// Reserve a new compute graph. It is valid until the next call to llama_graph_reserve.
// n_tokens, n_seqs and n_outputs are forwarded unchanged to the context's graph reservation.
// NOTE(review): presumably they describe the batch shape the graph is sized for - confirm.
// Callers should check the returned pointer for NULL before using it.
LLAMA_API struct ggml_cgraph * llama_graph_reserve(
struct llama_context * ctx,
uint32_t n_tokens,
uint32_t n_seqs,
uint32_t n_outputs);
//
// Adapters
//

View File

@@ -3035,6 +3035,19 @@ uint32_t llama_get_sampled_probs_count_ith(llama_context * ctx, int32_t i) {
return static_cast<uint32_t>(ctx->get_sampled_probs_count(i));
}
struct ggml_cgraph * llama_graph_reserve(
struct llama_context * ctx,
uint32_t n_tokens,
uint32_t n_seqs,
uint32_t n_outputs) {
auto * memory = ctx->get_memory();
llama_memory_context_ptr mctx;
if (memory) {
mctx = memory->init_full();
}
return ctx->graph_reserve(n_tokens, n_seqs, n_outputs, mctx.get());
}
// llama adapter API
int32_t llama_set_adapters_lora(

View File

@@ -31,10 +31,12 @@
#include <cstring>
#include <ctime>
#include <future>
#include <fstream>
#include <memory>
#include <random>
#include <regex>
#include <set>
#include <sstream>
#include <string>
#include <string_view>
#include <thread>
@@ -6594,6 +6596,236 @@ struct test_diag : public test_case {
}
};
// Deserializable generic test case
// Type, shape and byte strides of one source tensor of a generic test case.
// All members are value-initialized so that a default-constructed instance is
// well defined; in particular nb stays at 0, the "use default strides" marker.
struct input_tensor {
    ggml_type type = {};
    std::array<int64_t, 4> ne = {};
    std::array<size_t, 4> nb = {}; // strides (0 = use default contiguous strides)
};
// Returns true when the recorded strides of src differ from the strides a
// freshly allocated contiguous tensor of the same type and shape would have.
// nb[0] == 0 means "no strides recorded" and is always treated as contiguous.
static bool is_non_contiguous(const input_tensor & src) {
    if (src.nb[0] == 0) {
        return false;
    }

    // strides of the equivalent contiguous tensor, innermost dimension first
    std::array<size_t, 4> expected;
    expected[0] = ggml_type_size(src.type);
    expected[1] = expected[0] * (src.ne[0] / ggml_blck_size(src.type));
    expected[2] = expected[1] * src.ne[1];
    expected[3] = expected[2] * src.ne[2];

    return src.nb != expected;
}
static std::string var_to_str(const std::vector<input_tensor>& sources) {
std::ostringstream oss;
bool first = true;
for (const auto& src : sources) {
if (!first) oss << ",";
oss << ggml_type_name(src.type) << "[" << src.ne[0] << "," << src.ne[1] << "," << src.ne[2] << "," << src.ne[3] << "]";
if (is_non_contiguous(src)) {
oss << "nb[" << src.nb[0] << "," << src.nb[1] << "," << src.nb[2] << "," << src.nb[3] << "]";
}
first = false;
}
return oss.str();
}
static std::string var_to_str(const std::array<int32_t, GGML_MAX_OP_PARAMS / sizeof(int32_t)>& params) {
std::ostringstream oss;
oss << "[";
bool first = true;
for (size_t i = 0; i < params.size(); ++i) {
if (params[i] != 0) {
if (!first) oss << ",";
oss << i << ":" << params[i];
first = false;
}
}
oss << "]";
return oss.str();
}
// Test case that replays a single graph node described purely by data: the op,
// the output type and shape, the raw op_params and the type/shape/strides of
// every source tensor (e.g. as loaded from a --test-file).
struct test_generic_op : public test_case {
    const ggml_op op;
    const ggml_type type;
    const std::array<int64_t, 4> ne;
    const std::array<int32_t, GGML_MAX_OP_PARAMS / sizeof(int32_t)> op_params;
    const std::vector<input_tensor> sources;
    const std::string name;

    // The name is only part of the description when one was provided.
    std::string vars() override {
        if (name.empty()) {
            return VARS_TO_STR4(type, ne, op_params, sources);
        }
        return VARS_TO_STR5(name, type, ne, op_params, sources);
    }

    test_generic_op(ggml_op op, ggml_type type, std::array<int64_t, 4> ne,
                    std::array<int32_t, GGML_MAX_OP_PARAMS / sizeof(int32_t)> op_params,
                    std::vector<input_tensor> sources, std::string name = "")
        : op(op), type(type), ne(ne), op_params(op_params), sources(std::move(sources)), name(std::move(name)) {}

    // Creates the source tensors (as strided views where the recorded strides are
    // non-default) and an output tensor that has op, src[] and op_params set by hand.
    ggml_tensor * build_graph(ggml_context * ctx) override {
        const size_t source_count = std::min(sources.size(), (size_t)GGML_MAX_SRC);

        std::array<ggml_tensor *, GGML_MAX_SRC> source_tensors = {};
        for (size_t i = 0; i < source_count; ++i) {
            const input_tensor& src = sources[i];

            if (is_non_contiguous(src)) {
                // Number of bytes spanned by the strided tensor
                size_t total_size;
                const size_t blck_size = ggml_blck_size(src.type);
                if (blck_size == 1) {
                    total_size = ggml_type_size(src.type);
                    for (int d = 0; d < 4; d++) {
                        total_size += (src.ne[d] - 1) * src.nb[d];
                    }
                } else {
                    total_size = src.ne[0] * src.nb[0] / blck_size;
                    for (int d = 1; d < 4; d++) {
                        total_size += (src.ne[d] - 1) * src.nb[d];
                    }
                }

                // Convert bytes to elements, padded to block size for quantized types
                const size_t type_size = ggml_type_size(src.type);
                size_t backing_elements = (total_size * blck_size + type_size - 1) / type_size;
                backing_elements = ((backing_elements + blck_size - 1) / blck_size) * blck_size;

                ggml_tensor * backing = ggml_new_tensor_1d(ctx, src.type, backing_elements);
                source_tensors[i] = ggml_view_4d(ctx, backing,
                                                 src.ne[0], src.ne[1], src.ne[2], src.ne[3],
                                                 src.nb[1], src.nb[2], src.nb[3], 0);
                // nb[0] does not get set by view_4d, so set it manually
                source_tensors[i]->nb[0] = src.nb[0];
            } else {
                source_tensors[i] = ggml_new_tensor_4d(ctx, src.type, src.ne[0], src.ne[1], src.ne[2], src.ne[3]);
            }
        }

        // Ops with an inplace flag create a view of src[0] as their output.
        bool inplace = false;
        if (op == GGML_OP_SET || op == GGML_OP_ACC) {
            inplace = op_params[4] != 0;
        } else if (op == GGML_OP_ADD_REL_POS) {
            inplace = op_params[0] != 0;
        }

        ggml_tensor * out;
        if (inplace && source_count > 0) {
            out = ggml_view_tensor(ctx, source_tensors[0]);
        } else {
            out = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
        }

        out->op = op;
        for (size_t i = 0; i < source_count; ++i) {
            out->src[i] = source_tensors[i];
        }
        memcpy(out->op_params, op_params.data(), GGML_MAX_OP_PARAMS);
        ggml_set_name(out, "out");

        return out;
    }

    // Per-op error threshold.
    double max_nmse_err() override {
        switch (op) {
            case GGML_OP_MUL_MAT:
            case GGML_OP_MUL_MAT_ID:
            case GGML_OP_OUT_PROD:
            case GGML_OP_CONV_TRANSPOSE_2D:
            case GGML_OP_IM2COL:
            case GGML_OP_CONV_2D:
            case GGML_OP_CONV_3D:
            case GGML_OP_SET_ROWS:
            case GGML_OP_CPY:
                return 5e-4;
            case GGML_OP_SOFT_MAX:
                return 1e-6;
            case GGML_OP_RWKV_WKV7:
                return 5e-3;
            case GGML_OP_FLASH_ATTN_EXT:
                {
                    // Scale error with kv length to account for accumulating floating point error.
                    // The kv length is read from src[1]; a description without it uses the base threshold.
                    const int64_t kv = sources.size() > 1 ? sources[1].ne[1] : 0;
                    return 5e-4 * std::max(1.0, kv / 20000.0);
                }
            default:
                return 1e-7;
        }
    }

    // Fills the source tensors of "out". Integer sources get op-specific index
    // data, the FLASH_ATTN_EXT mask gets a KQ mask, everything else is uniform.
    void initialize_tensors(ggml_context * ctx) override {
        ggml_tensor * out = ggml_get_tensor(ctx, "out");

        std::random_device rd;
        std::default_random_engine rng(rd());

        for (size_t i = 0; i < sources.size() && i < GGML_MAX_SRC; i++) {
            ggml_tensor * t = out->src[i];
            if (!t) {
                break;
            }

            // FLASH_ATTN_EXT: src[3] is the KQ mask
            if (op == GGML_OP_FLASH_ATTN_EXT && i == 3) {
                init_tensor_kq_mask(t);
                continue;
            }

            if (t->type == GGML_TYPE_I32 || t->type == GGML_TYPE_I64) {
                if (op == GGML_OP_GET_ROWS || op == GGML_OP_GET_ROWS_BACK) {
                    // random row indices into src[0]
                    const int64_t num_rows = sources[0].ne[1];
                    const int64_t nels = ggml_nelements(t);
                    std::vector<int32_t> data(nels);
                    std::uniform_int_distribution<int32_t> dist(0, num_rows - 1);
                    for (int64_t j = 0; j < nels; j++) {
                        data[j] = dist(rng);
                    }
                    ggml_backend_tensor_set(t, data.data(), 0, nels * sizeof(int32_t));
                } else if (op == GGML_OP_SET_ROWS) {
                    init_set_rows_row_ids(t, ne[1]);
                } else if (op == GGML_OP_ROPE) {
                    // random positions; 4 values per token when the MROPE bit is set in the mode
                    const int mode = op_params[2];
                    const int64_t nels = (mode & GGML_ROPE_TYPE_MROPE) ? ne[2] * 4 : ne[2];
                    std::vector<int32_t> data(nels);
                    std::uniform_int_distribution<int32_t> dist(0, ne[2] - 1);
                    for (int64_t j = 0; j < nels; j++) {
                        data[j] = dist(rng);
                    }
                    ggml_backend_tensor_set(t, data.data(), 0, nels * sizeof(int32_t));
                } else if (op == GGML_OP_MUL_MAT_ID || op == GGML_OP_ADD_ID) {
                    // expert count: src[0].ne[2] for MUL_MAT_ID, src[1].ne[1] for ADD_ID
                    int64_t n_expert = 1;
                    if (op == GGML_OP_MUL_MAT_ID) {
                        n_expert = sources[0].ne[2];
                    } else if (sources.size() > 1) {
                        n_expert = sources[1].ne[1];
                    }
                    // a malformed description must not cause a modulo by zero below
                    n_expert = std::max<int64_t>(n_expert, 1);

                    for (int64_t r = 0; r < ggml_nrows(t); r++) {
                        std::vector<int32_t> data(t->ne[0]);
                        for (int32_t j = 0; j < t->ne[0]; j++) {
                            data[j] = j % n_expert;
                        }
                        std::shuffle(data.begin(), data.end(), rng);
                        ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(int32_t));
                    }
                } else if (op == GGML_OP_SSM_SCAN) {
                    // every row is a random permutation of 0..ne[0]-1
                    for (int64_t r = 0; r < ggml_nrows(t); r++) {
                        std::vector<int32_t> data(t->ne[0]);
                        for (int32_t j = 0; j < t->ne[0]; j++) {
                            data[j] = j;
                        }
                        std::shuffle(data.begin(), data.end(), rng);
                        ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(int32_t));
                    }
                } else {
                    init_tensor_uniform(t);
                }
            } else {
                init_tensor_uniform(t);
            }
        }
    }
};
enum llm_norm_type {
LLM_NORM,
@@ -8653,8 +8885,72 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
return test_cases;
}
static std::vector<std::unique_ptr<test_case>> make_test_cases_from_file(const char * path) {
std::ifstream f(path);
if (!f.is_open()) {
throw std::runtime_error("Unable to read test file");
}
std::vector<std::unique_ptr<test_case>> test_cases;
std::string line;
while (std::getline(f, line)) {
std::istringstream iss(line);
ggml_op op;
ggml_type type;
std::array<int64_t, 4> ne;
std::array<int32_t, GGML_MAX_OP_PARAMS / sizeof(int32_t)> op_params = {};
std::string name;
uint64_t tmp;
iss >> tmp;
op = (ggml_op)tmp;
iss >> tmp;
type = (ggml_type)tmp;
for (size_t i = 0; i < 4; i++) {
iss >> ne[i];
}
iss >> tmp;
for (size_t i = 0; i < tmp && i < op_params.size(); i++) {
iss >> op_params[i];
}
iss >> tmp;
size_t num_src = std::min((uint64_t)GGML_MAX_SRC, tmp);
std::vector<input_tensor> sources(num_src);
for (size_t i = 0; i < num_src; i++) {
input_tensor& src = sources[i];
iss >> tmp;
src.type = (ggml_type)tmp;
for (size_t i = 0; i < 4; i++) {
iss >> src.ne[i];
}
for (size_t i = 0; i < 4; i++) {
iss >> src.nb[i];
}
}
iss >> name;
if (name.length() == 1 && name[0] == '-') {
name = "";
}
test_cases.emplace_back(new test_generic_op(op, type, ne, op_params, sources, std::move(name)));
}
return test_cases;
}
static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_names_filter, const char * params_filter,
printer * output_printer) {
printer * output_printer, const char * test_file_path) {
auto filter_test_cases = [](std::vector<std::unique_ptr<test_case>> & test_cases, const char * params_filter) {
if (params_filter == nullptr) {
return;
@@ -8672,9 +8968,26 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
}
};
std::vector<std::unique_ptr<test_case>> test_cases;
if (test_file_path == nullptr) {
switch (mode) {
case MODE_TEST:
case MODE_GRAD:
case MODE_SUPPORT:
test_cases = make_test_cases_eval();
break;
case MODE_PERF:
test_cases = make_test_cases_perf();
break;
}
} else {
test_cases = make_test_cases_from_file(test_file_path);
}
filter_test_cases(test_cases, params_filter);
if (mode == MODE_TEST) {
auto test_cases = make_test_cases_eval();
filter_test_cases(test_cases, params_filter);
ggml_backend_t backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, NULL);
if (backend_cpu == NULL) {
test_operation_info info("", "", "CPU");
@@ -8714,8 +9027,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
}
if (mode == MODE_GRAD) {
auto test_cases = make_test_cases_eval();
filter_test_cases(test_cases, params_filter);
size_t n_ok = 0;
for (auto & test : test_cases) {
if (test->eval_grad(backend, op_names_filter, output_printer)) {
@@ -8728,8 +9039,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
}
if (mode == MODE_PERF) {
auto test_cases = make_test_cases_perf();
filter_test_cases(test_cases, params_filter);
for (auto & test : test_cases) {
test->eval_perf(backend, op_names_filter, output_printer);
}
@@ -8737,9 +9046,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
}
if (mode == MODE_SUPPORT) {
auto test_cases = make_test_cases_eval();
filter_test_cases(test_cases, params_filter);
// Filter out fusion cases
test_cases.erase(
std::remove_if(test_cases.begin(), test_cases.end(), [](const std::unique_ptr<test_case> & tc) {
@@ -8858,7 +9164,8 @@ static void show_test_coverage() {
}
static void usage(char ** argv) {
printf("Usage: %s [mode] [-o <op,..>] [-b <backend>] [-p <params regex>] [--output <console|sql|csv>] [--list-ops] [--show-coverage]\n", argv[0]);
printf("Usage: %s [mode] [-o <op,..>] [-b <backend>] [-p <params regex>] [--output <console|sql|csv>] [--list-ops]", argv[0]);
printf(" [--show-coverage] [--test-file <path>]\n");
printf(" valid modes:\n");
printf(" - test (default, compare with CPU backend for correctness)\n");
printf(" - grad (compare gradients from backpropagation with method of finite differences)\n");
@@ -8869,6 +9176,7 @@ static void usage(char ** argv) {
printf(" --output specifies output format (default: console, options: console, sql, csv)\n");
printf(" --list-ops lists all available GGML operations\n");
printf(" --show-coverage shows test coverage\n");
printf(" --test-file reads test operators from a test file generated by llama-export-graph-ops\n");
}
int main(int argc, char ** argv) {
@@ -8877,6 +9185,7 @@ int main(int argc, char ** argv) {
const char * op_names_filter = nullptr;
const char * backend_filter = nullptr;
const char * params_filter = nullptr;
const char * test_file_path = nullptr;
for (int i = 1; i < argc; i++) {
if (strcmp(argv[i], "test") == 0) {
@@ -8924,6 +9233,13 @@ int main(int argc, char ** argv) {
} else if (strcmp(argv[i], "--show-coverage") == 0) {
show_test_coverage();
return 0;
} else if (strcmp(argv[i], "--test-file") == 0) {
if (i + 1 < argc) {
test_file_path = argv[++i];
} else {
usage(argv);
return 1;
}
} else {
usage(argv);
return 1;
@@ -8976,7 +9292,7 @@ int main(int argc, char ** argv) {
false, "", ggml_backend_dev_description(dev),
total / 1024 / 1024, free / 1024 / 1024, true));
bool ok = test_backend(backend, mode, op_names_filter, params_filter, output_printer.get());
bool ok = test_backend(backend, mode, op_names_filter, params_filter, output_printer.get(), test_file_path);
if (ok) {
n_ok++;

View File

@@ -37,4 +37,5 @@ else()
add_subdirectory(export-lora)
endif()
add_subdirectory(fit-params)
add_subdirectory(export-graph-ops)
endif()

View File

@@ -0,0 +1,8 @@
# Build the llama-export-graph-ops tool from its single source file.
set(TARGET llama-export-graph-ops)
add_executable(${TARGET} export-graph-ops.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
# Install the binary only when LLAMA_TOOLS_INSTALL is set.
if(LLAMA_TOOLS_INSTALL)
install(TARGETS ${TARGET} RUNTIME)
endif()

View File

@@ -0,0 +1,168 @@
#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"
#include "ggml.h"
#include <algorithm>
#include <array>
#include <cstring>
#include <fstream>
#include <iostream>
#include <set>
#include <stdexcept>
#include <string>
#include <tuple>
#include <vector>
// Type, shape and byte strides of one source tensor of a graph node.
struct input_tensor {
    ggml_type type;
    std::array<int64_t, 4> ne;
    std::array<size_t, 4> nb;

    // Copies the first 4 entries of the given shape and stride arrays.
    input_tensor(ggml_type type, int64_t * ne, size_t * nb): type(type) {
        for (size_t d = 0; d < 4; d++) {
            this->ne[d] = ne[d];
            this->nb[d] = nb[d];
        }
    }

    // Lexicographic order over (type, ne, nb), needed for use in ordered containers.
    bool operator<(const input_tensor &b) const {
        if (type != b.type) {
            return type < b.type;
        }
        if (ne != b.ne) {
            return ne < b.ne;
        }
        return nb < b.nb;
    }

    // Writes "<type> <ne0..ne3> <nb0..nb3> ", every value followed by a single space.
    void serialize(std::ostream& out) const {
        out << type << ' ';
        for (const int64_t dim : ne) {
            out << dim << ' ';
        }
        for (const size_t stride : nb) {
            out << stride << ' ';
        }
    }
};
struct test_object {
ggml_op op;
ggml_type type;
std::array<int64_t, 4> ne;
std::vector<int32_t> op_params;
std::vector<input_tensor> sources;
std::string name;
void serialize(std::ostream& out) const {
out << op << ' ' << type << ' ';
for (size_t i = 0; i < 4; i++) {
out << ne[i] << ' ';
}
out << op_params.size() << ' ';
for (size_t i = 0; i < op_params.size(); i++) {
out << op_params[i] << ' ';
}
out << sources.size() << ' ';
for (size_t s = 0; s < sources.size(); s++) {
sources[s].serialize(out);
}
if (!name.empty()) {
out << name;
} else {
out << '-';
}
out << '\n';
}
bool operator<(const test_object &b) const {
return std::tie(op, type, ne, op_params, sources) <
std::tie(b.op, b.type, b.ne, b.op_params, b.sources);
}
};
// Adds one test_object per node of cgraph to tests, skipping NONE, VIEW, RESHAPE,
// PERMUTE and TRANSPOSE nodes. Nodes that compare equal to an already present
// entry are dropped by the set. Logs a summary line prefixed with label.
static void extract_graph_ops(ggml_cgraph * cgraph, const char * label, std::set<test_object> & tests) {
    const int n_nodes  = ggml_graph_n_nodes(cgraph);
    const int n_before = (int) tests.size();

    int n_skipped = 0;

    for (int i = 0; i < n_nodes; i++) {
        ggml_tensor * node = ggml_graph_node(cgraph, i);

        switch (node->op) {
            case GGML_OP_NONE:
            case GGML_OP_VIEW:
            case GGML_OP_RESHAPE:
            case GGML_OP_PERMUTE:
            case GGML_OP_TRANSPOSE:
                n_skipped++;
                continue;
            default:
                break;
        }

        test_object test;

        test.op   = node->op;
        test.type = node->type;
        for (size_t d = 0; d < 4; d++) {
            test.ne[d] = node->ne[d];
        }

        test.op_params.resize(GGML_MAX_OP_PARAMS / sizeof(int32_t));
        memcpy(test.op_params.data(), node->op_params, GGML_MAX_OP_PARAMS);

        // sources are taken up to the first empty slot
        for (size_t s = 0; s < GGML_MAX_SRC && node->src[s] != nullptr; s++) {
            ggml_tensor * src = node->src[s];
            test.sources.emplace_back(src->type, src->ne, src->nb);
        }

        test.name = node->name;

        tests.insert(test);
    }

    const int n_new = (int) tests.size() - n_before;
    LOG_INF("%s: %d unique ops, %d total nodes, %d skipped (view ops)\n",
            label, n_new, n_nodes, n_skipped);
}
int main(int argc, char ** argv) {
common_params params;
params.out_file = "tests.txt";
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_GRAPH_OPS)) {
return 1;
}
common_init();
// Load CPU-only
ggml_backend_dev_t cpu_device = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
params.devices = { cpu_device, nullptr };
params.fit_params = false;
params.n_gpu_layers = 0;
params.warmup = false;
auto init_result = common_init_from_params(params);
llama_context * ctx = init_result->context();
const uint32_t n_seqs = llama_n_seq_max(ctx);
const uint32_t n_tokens = std::min(llama_n_ctx(ctx), llama_n_ubatch(ctx));
std::set<test_object> tests;
auto * gf_pp = llama_graph_reserve(ctx, n_tokens, n_seqs, n_tokens);
if (!gf_pp) {
throw std::runtime_error("failed to reserve prompt processing graph");
}
extract_graph_ops(gf_pp, "pp", tests);
auto * gf_tg = llama_graph_reserve(ctx, n_seqs, n_seqs, n_seqs);
if (!gf_tg) {
throw std::runtime_error("failed to reserve token generation graph");
}
extract_graph_ops(gf_tg, "tg", tests);
LOG_INF("%d unique ops total\n", (int) tests.size());
std::ofstream f(params.out_file);
if (!f.is_open()) {
throw std::runtime_error("Unable to open output file");
}
for (const auto& test : tests) {
test.serialize(f);
}
return 0;
}