Compare commits

...

3 Commits

Author SHA1 Message Date
Georgi Gerganov
bf8b39015f metal : reuse graphs
ggml-ci
2025-07-07 21:37:07 +03:00
Georgi Gerganov
0d2038f90a llama-bench : add graph reuse parameter
ggml-ci
2025-07-07 09:07:15 +03:00
Georgi Gerganov
76681e3c73 llama : reuse compute graphs
ggml-ci
2025-07-05 15:18:47 +03:00
19 changed files with 701 additions and 247 deletions
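In short, this PR adds an opt-in fast path that skips rebuilding and re-allocating the compute graph whenever the new graph would be identical to the previous one. A minimal sketch of how a client could opt in through the new context parameter; the context-creation call is the usual llama.cpp entry point and is an assumption here, not part of the diff:

// minimal sketch, assuming the public API added in this PR; not part of the diff
#include "llama.h"

llama_context_params cparams = llama_context_default_params();
cparams.graph_reuse = true; // new field, defaults to false
// create the context as usual, e.g.:
// llama_context * ctx = llama_init_from_model(model, cparams);

The same switch is exposed on the command line as --graph-reuse / -gr and through the LLAMA_ARG_GRAPH_REUSE environment variable (first hunk below).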

View File

@@ -1464,6 +1464,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.swa_full = true;
}
).set_env("LLAMA_ARG_SWA_FULL"));
add_opt(common_arg(
{"--graph-reuse", "-gr"},
string_format("reuse previous compute graphs when possible (default: %s)"
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/14482)", params.graph_reuse ? "true" : "false"),
[](common_params & params) {
params.graph_reuse = true;
}
).set_env("LLAMA_ARG_GRAPH_REUSE"));
add_opt(common_arg(
{"--no-context-shift"},
string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),

View File

@@ -1157,6 +1157,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
cparams.no_perf = params.no_perf;
cparams.op_offload = !params.no_op_offload;
cparams.swa_full = params.swa_full;
cparams.graph_reuse = params.graph_reuse;
cparams.type_k = params.cache_type_k;
cparams.type_v = params.cache_type_v;

View File

@@ -330,6 +330,7 @@ struct common_params {
bool no_perf = false; // disable performance metrics
bool ctx_shift = true; // context shift on infinite text generation
bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
bool graph_reuse = false; // reuse previous compute graphs when possible
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool use_mmap = true; // use mmap for faster loads

View File

@@ -821,13 +821,23 @@ struct ggml_backend_metal_context {
// the callback given to the thread pool
void (^encode_async)(size_t ith);
void (^encode_next)(void);
// n_cb command buffers + 1 used by the main thread
struct ggml_metal_command_buffer cmd_bufs[GGML_METAL_MAX_COMMAND_BUFFERS + 1];
struct ggml_metal_command_buffer cmd_bufs_next[2];
// abort ggml_metal_graph_compute if callback returns true
ggml_abort_callback abort_callback;
void * abort_callback_data;
// reuse info
int i_next;
int n_nodes_max;
int n_nodes_prev;
struct ggml_tensor * cg_nodes;
};
// MSL code
@@ -1084,6 +1094,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
ctx->gf = nil;
ctx->encode_async = nil;
ctx->encode_next = nil;
for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) {
ctx->cmd_bufs[i].obj = nil;
@@ -1091,6 +1102,13 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
ctx->cmd_bufs[i].mem_pool->device = device;
}
for (int i = 0; i < 2; ++i) {
ctx->cmd_bufs_next[i].obj = nil;
ctx->cmd_bufs_next[i].mem_pool = ggml_metal_mem_pool_init();
ctx->cmd_bufs_next[i].mem_pool->device = device;
}
#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
if (@available(macOS 10.12, iOS 16.0, *)) {
GGML_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, device.recommendedMaxWorkingSetSize / 1e6);
@@ -1521,6 +1539,13 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32, pool_2d_max_f32, true);
}
ctx->i_next = 0;
ctx->n_nodes_max = 16384;
ctx->n_nodes_prev = -1;
ctx->cg_nodes = ggml_aligned_malloc(ctx->n_nodes_max * sizeof(struct ggml_tensor));
return ctx;
}
@@ -1532,6 +1557,7 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) {
}
Block_release(ctx->encode_async);
Block_release(ctx->encode_next);
[ctx->queue release];
@@ -1541,8 +1567,13 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) {
ggml_metal_mem_pool_free(ctx->cmd_bufs[i].mem_pool);
}
ggml_metal_mem_pool_free(ctx->cmd_bufs_next[0].mem_pool);
ggml_metal_mem_pool_free(ctx->cmd_bufs_next[1].mem_pool);
dispatch_release(ctx->d_queue);
ggml_aligned_free(ctx->cg_nodes, ctx->n_nodes_max * sizeof(struct ggml_tensor));
free(ctx);
}
@@ -5448,6 +5479,39 @@ static enum ggml_status ggml_metal_graph_compute(
struct ggml_backend_metal_context * ctx = backend->context;
struct ggml_backend_metal_device_context * ctx_dev = backend->device->context;
//const int64_t t_start = ggml_time_us();
/////////////////////////////////////////////////////
// hacky way to determine that the graph is the same as the previous one
//
bool can_reuse = true;
if (gf->n_nodes > ctx->n_nodes_max) {
can_reuse = false;
}
if (gf->n_nodes != ctx->n_nodes_prev) {
can_reuse = false;
}
if (can_reuse) {
for (int i = 0; i < gf->n_nodes; ++i) {
if (memcmp(gf->nodes[i], ctx->cg_nodes + i, sizeof(struct ggml_tensor)) != 0) {
can_reuse = false;
break;
}
}
}
if (!can_reuse) {
ctx->n_nodes_prev = gf->n_nodes;
for (int i = 0; i < gf->n_nodes; ++i) {
memcpy(ctx->cg_nodes + i, gf->nodes[i], sizeof(struct ggml_tensor));
}
}
//////////////////////////////////////////////////////
// number of nodes encoded by the main thread (empirically determined)
const int n_main = 128;
@@ -5492,78 +5556,126 @@ static enum ggml_status ggml_metal_graph_compute(
}
}
// the main thread commits the first few commands immediately
// cmd_buf[n_cb]
{
id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
ctx->cmd_bufs[n_cb].obj = cmd_buf;
if (!can_reuse) {
// the main thread commits the first few commands immediately
// cmd_buf[n_cb]
{
id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
ctx->cmd_bufs[n_cb].obj = cmd_buf;
[cmd_buf enqueue];
ctx->encode_async(n_cb);
}
// prepare the rest of the command buffers asynchronously
// cmd_buf[0.. n_cb)
for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
ctx->cmd_bufs[cb_idx].obj = cmd_buf;
// always enqueue the first two command buffers
// enqueue all of the command buffers if we don't need to abort
if (cb_idx < 2 || ctx->abort_callback == NULL) {
[cmd_buf enqueue];
ctx->encode_async(n_cb);
}
}
dispatch_apply(n_cb, ctx->d_queue, ctx->encode_async);
// prepare the rest of the command buffers asynchronously
// cmd_buf[0.. n_cb)
for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
ctx->cmd_bufs[cb_idx].obj = cmd_buf;
// wait for completion and check status of each command buffer
// needed to detect if the device ran out-of-memory for example (#1881)
{
id<MTLCommandBuffer> cmd_buf = ctx->cmd_bufs[n_cb].obj;
[cmd_buf waitUntilCompleted];
// always enqueue the first two command buffers
// enqueue all of the command buffers if we don't need to abort
if (cb_idx < 2 || ctx->abort_callback == NULL) {
[cmd_buf enqueue];
}
}
MTLCommandBufferStatus status = [cmd_buf status];
if (status != MTLCommandBufferStatusCompleted) {
GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, n_cb, status);
if (status == MTLCommandBufferStatusError) {
GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]);
dispatch_apply(n_cb, ctx->d_queue, ctx->encode_async);
// encode the command buffer for the next iter while the GPU is still executing the current one
{
id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
[cmd_buf retain];
if (ctx->cmd_bufs_next[ctx->i_next].obj != nil) {
[ctx->cmd_bufs_next[ctx->i_next].obj release];
}
ctx->cmd_bufs_next[ctx->i_next].obj = cmd_buf;
ctx->encode_next();
}
// wait for completion and check status of each command buffer
// needed to detect if the device ran out-of-memory for example (#1881)
{
id<MTLCommandBuffer> cmd_buf = ctx->cmd_bufs[n_cb].obj;
[cmd_buf waitUntilCompleted];
MTLCommandBufferStatus status = [cmd_buf status];
if (status != MTLCommandBufferStatusCompleted) {
GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, n_cb, status);
if (status == MTLCommandBufferStatusError) {
GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]);
}
return GGML_STATUS_FAILED;
}
}
for (int i = 0; i < n_cb; ++i) {
id<MTLCommandBuffer> cmd_buf = ctx->cmd_bufs[i].obj;
[cmd_buf waitUntilCompleted];
MTLCommandBufferStatus status = [cmd_buf status];
if (status != MTLCommandBufferStatusCompleted) {
GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
if (status == MTLCommandBufferStatusError) {
GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]);
}
return GGML_STATUS_FAILED;
}
return GGML_STATUS_FAILED;
}
}
for (int i = 0; i < n_cb; ++i) {
id<MTLCommandBuffer> cmd_buf = ctx->cmd_bufs[i].obj;
[cmd_buf waitUntilCompleted];
MTLCommandBufferStatus status = [cmd_buf status];
if (status != MTLCommandBufferStatusCompleted) {
GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
if (status == MTLCommandBufferStatusError) {
GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]);
id<MTLCommandBuffer> next_buffer = (i + 1 < n_cb ? ctx->cmd_bufs[i + 1].obj : nil);
if (!next_buffer) {
continue;
}
return GGML_STATUS_FAILED;
const bool next_queued = ([next_buffer status] != MTLCommandBufferStatusNotEnqueued);
if (next_queued) {
continue;
}
if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) {
GGML_LOG_INFO("%s: command buffer %d aborted", __func__, i);
return GGML_STATUS_ABORTED;
}
[next_buffer commit];
}
} else {
struct ggml_metal_command_buffer cmd_buf_cur = ctx->cmd_bufs_next[(ctx->i_next + 1)%2];
// directly submit the command buffer that we have prepared in the previous iteration
[ctx->cmd_bufs_next[(ctx->i_next + 1)%2].obj commit];
// encode the command buffer for the next iter
{
id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
[cmd_buf retain];
if (ctx->cmd_bufs_next[ctx->i_next].obj != nil) {
[ctx->cmd_bufs_next[ctx->i_next].obj release];
}
ctx->cmd_bufs_next[ctx->i_next].obj = cmd_buf;
ctx->encode_next();
}
id<MTLCommandBuffer> next_buffer = (i + 1 < n_cb ? ctx->cmd_bufs[i + 1].obj : nil);
if (!next_buffer) {
continue;
}
// wait for completion and check status of each command buffer
// needed to detect if the device ran out-of-memory for example (#1881)
{
id<MTLCommandBuffer> cmd_buf = cmd_buf_cur.obj;
[cmd_buf waitUntilCompleted];
const bool next_queued = ([next_buffer status] != MTLCommandBufferStatusNotEnqueued);
if (next_queued) {
continue;
}
MTLCommandBufferStatus status = [cmd_buf status];
if (status != MTLCommandBufferStatusCompleted) {
GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, ctx->i_next, status);
if (status == MTLCommandBufferStatusError) {
GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]);
}
if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) {
GGML_LOG_INFO("%s: command buffer %d aborted", __func__, i);
return GGML_STATUS_ABORTED;
return GGML_STATUS_FAILED;
}
}
[next_buffer commit];
}
if (!should_capture && ctx->capture_started) {
@@ -5572,6 +5684,8 @@ static enum ggml_status ggml_metal_graph_compute(
}
}
//printf(" time = %.3f ms\n", (float)(ggml_time_us() - t_start)/1000.0f);
return GGML_STATUS_SUCCESS;
}
@@ -5919,6 +6033,10 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
Block_release(ctx->encode_async);
}
if (ctx->encode_next) {
Block_release(ctx->encode_next);
}
ctx->encode_async = Block_copy(^(size_t iter) {
const int cb_idx = iter;
const int n_cb_l = ctx->n_cb;
@@ -5967,6 +6085,40 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
[cmd_buf commit];
}
});
ctx->encode_next = Block_copy(^(void) {
id<MTLCommandBuffer> cmd_buf = ctx->cmd_bufs_next[ctx->i_next].obj;
id<MTLComputeCommandEncoder> encoder = [cmd_buf computeCommandEncoder];
int node_start = 0;
int node_end = ctx->gf->n_nodes;
const bool should_capture = ctx->capture_next_compute;
struct ggml_metal_mem_pool * mem_pool = ctx->cmd_bufs_next[ctx->i_next].mem_pool;
ggml_metal_mem_pool_reset(mem_pool);
for (int idx = node_start; idx < node_end; ++idx) {
if (should_capture) {
[encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]];
}
const bool res = ggml_metal_encode_node(backend, idx, encoder, mem_pool);
if (should_capture) {
[encoder popDebugGroup];
}
if (!res) {
break;
}
}
[encoder endEncoding];
ctx->i_next = (ctx->i_next + 1) % 2;
});
}
static struct ggml_backend_i ggml_backend_metal_i = {

View File

@@ -374,6 +374,8 @@ extern "C" {
bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
// NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
// ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
bool graph_reuse; // reuse previous compute graphs when possible
};
// model quantization parameters
@@ -1429,6 +1431,7 @@ extern "C" {
int32_t n_p_eval;
int32_t n_eval;
int32_t n_reused;
};
struct llama_perf_sampler_data {

View File

@@ -34,6 +34,31 @@ struct llama_ubatch {
llama_seq_id * seq_id_unq; // [n_seqs_unq] | s | seq_id
int32_t * seq_idx; // [LLAMA_MAX_SEQ] | - | seq_idx
int8_t * output; // [n_tokens] | i | -
bool is_same(const llama_ubatch & other) const {
bool res =
equal_seqs == other.equal_seqs &&
n_tokens == other.n_tokens &&
n_seq_tokens == other.n_seq_tokens &&
n_seqs == other.n_seqs &&
n_seqs_unq == other.n_seqs_unq &&
(
(!token && !other.token) ||
(!embd && !other.embd)
);
if (!res) {
return false;
}
// TODO: this won't work because seq_id_unq ptr can point to an old balloc that has
// been freed by this point. find a way to fix this
//for (uint32_t s = 0; s < n_seqs_unq; ++s) {
// res &= seq_id_unq[s] == other.seq_id_unq[s];
//}
return res;
}
};
// a helper for sanitizing, fulfilling and splitting a batch

View File

@@ -101,7 +101,8 @@ llama_context::llama_context(
cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
cparams.op_offload = params.op_offload;
cparams.graph_reuse = params.graph_reuse;
const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
@@ -227,8 +228,8 @@ llama_context::llama_context(
LLAMA_LOG_DEBUG("%s: max_nodes = %zu\n", __func__, max_nodes);
// buffer used to store the computation graph and the tensor meta data
buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
gf_res_prev.reset(new llm_graph_result(max_nodes));
gf_res_reserve.reset(new llm_graph_result(max_nodes));
// TODO: move these checks to ggml_backend_sched
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
@@ -388,10 +389,6 @@ ggml_backend_sched_t llama_context::get_sched() const {
return sched.get();
}
ggml_context * llama_context::get_ctx_compute() const {
return ctx_compute.get();
}
uint32_t llama_context::n_ctx() const {
return cparams.n_ctx;
}
@@ -678,38 +675,52 @@ bool llama_context::apply_adapter_cvec(
return cvec.apply(model, data, len, n_embd, il_start, il_end);
}
llm_graph_result_ptr llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) {
llm_graph_result_i * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) {
if (mctx && !mctx->apply()) {
LLAMA_LOG_ERROR("%s: failed to apply memory context\n", __func__);
ret = GGML_STATUS_FAILED;
return nullptr;
}
auto * gf = graph_init();
if (!gf) {
LLAMA_LOG_ERROR("%s: failed to initialize graph\n", __func__);
ret = GGML_STATUS_FAILED;
return nullptr;
}
auto * res = gf_res_prev.get();
auto * gf = res->get_gf();
auto res = graph_build(ctx_compute.get(), gf, ubatch, gtype, mctx);
if (!res) {
LLAMA_LOG_ERROR("%s: failed to build graph\n", __func__);
ret = GGML_STATUS_FAILED;
return nullptr;
}
// the new graph parameters
// in order to correctly reuse a graph, its full topology has to be uniquely determined by these parameters
const auto gparams = graph_params(res, ubatch, mctx, gtype);
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
const bool can_reuse = cparams.graph_reuse && res->update(gparams);
if (can_reuse) {
LLAMA_LOG_DEBUG("%s: reusing previous graph\n", __func__);
n_reused++;
} else {
res->reset();
if (!ggml_backend_sched_alloc_graph(sched.get(), gf)) {
LLAMA_LOG_ERROR("%s: failed to allocate graph\n", __func__);
ret = GGML_STATUS_ALLOC_FAILED;
return nullptr;
ggml_backend_sched_reset(sched.get());
ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);
//const auto t_start_us = ggml_time_us();
gf = model.build_graph(gparams);
//LLAMA_LOG_INFO("graph build time: %.3f ms\n", (ggml_time_us() - t_start_us)/1000.0);
if (!gf) {
LLAMA_LOG_ERROR("%s: failed to initialize graph\n", __func__);
ret = GGML_STATUS_FAILED;
return nullptr;
}
if (!ggml_backend_sched_alloc_graph(sched.get(), gf)) {
LLAMA_LOG_ERROR("%s: failed to allocate graph\n", __func__);
ret = GGML_STATUS_ALLOC_FAILED;
return nullptr;
}
}
res->set_inputs(&ubatch);
const auto status = graph_compute(gf, ubatch.n_tokens > 1);
const auto status = graph_compute(res->get_gf(), ubatch.n_tokens > 1);
if (status != GGML_STATUS_SUCCESS) {
LLAMA_LOG_ERROR("%s: failed to compute graph, compute status: %d\n", __func__, status);
ret = status;
@@ -767,9 +778,6 @@ int llama_context::encode(const llama_batch & batch_inp) {
n_outputs = n_tokens;
ggml_backend_sched_reset(sched.get());
ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);
const auto causal_attn_org = cparams.causal_attn;
// always use non-causal attention for encoder graphs
@@ -778,7 +786,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
cparams.causal_attn = false;
ggml_status status;
const auto res = process_ubatch(ubatch, LLM_GRAPH_TYPE_ENCODER, nullptr, status);
const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_ENCODER, nullptr, status);
cparams.causal_attn = causal_attn_org;
@@ -846,7 +854,9 @@ int llama_context::encode(const llama_batch & batch_inp) {
// Reset state for the next token before backend sync, to allow the CPU activities in the reset to
// overlap with device computation.
ggml_backend_sched_reset(sched.get());
if (!cparams.graph_reuse) {
ggml_backend_sched_reset(sched.get());
}
// TODO: hacky solution
if (model.arch == LLM_ARCH_T5 && t_embd) {
@@ -1005,11 +1015,8 @@ int llama_context::decode(const llama_batch & batch_inp) {
n_outputs = n_outputs_new;
}
ggml_backend_sched_reset(sched.get());
ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);
ggml_status status;
const auto res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status);
const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status);
if (!res) {
// the last ubatch failed or was aborted -> remove all positions of that ubatch from the KV cache
@@ -1192,7 +1199,9 @@ int llama_context::decode(const llama_batch & batch_inp) {
// Reset state for the next token before backend sync, to allow the CPU activities in the reset to
// overlap with device computation.
ggml_backend_sched_reset(sched.get());
if (!cparams.graph_reuse) {
ggml_backend_sched_reset(sched.get());
}
return 0;
}
@@ -1275,20 +1284,8 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
// graph
//
int32_t llama_context::graph_max_nodes() const {
return std::max<int32_t>(65536, 5*model.n_tensors());
}
ggml_cgraph * llama_context::graph_init() {
ggml_init_params params = {
/*.mem_size =*/ buf_compute_meta.size(),
/*.mem_buffer =*/ buf_compute_meta.data(),
/*.no_alloc =*/ true,
};
ctx_compute.reset(ggml_init(params));
return ggml_new_graph_custom(ctx_compute.get(), graph_max_nodes(), false);
uint32_t llama_context::graph_max_nodes() const {
return std::max<uint32_t>(65536u, 5u*model.n_tensors());
}
ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx) {
@@ -1301,6 +1298,9 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
LLAMA_LOG_DEBUG("%s: making n_tokens a multiple of n_seqs - n_tokens = %u, n_seqs = %u, n_outputs = %u\n", __func__, n_tokens, n_seqs, n_outputs);
}
gf_res_prev->reset();
ggml_backend_sched_reset(sched.get());
// store the n_outputs as it is, and restore it afterwards
// TODO: not sure if needed, might simplify in the future by removing this
const auto save_n_outputs = this->n_outputs;
@@ -1310,18 +1310,16 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
llama_batch_allocr balloc(model.hparams.n_pos_per_embd());
llama_ubatch ubatch = balloc.ubatch_reserve(n_tokens/n_seqs, n_seqs);
auto * gf = graph_init();
auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT, mctx);
auto * res = gf_res_reserve.get();
const auto gparams = graph_params(res, ubatch, mctx, LLM_GRAPH_TYPE_DEFAULT);
res->reset();
auto * gf = model.build_graph(gparams);
this->n_outputs = save_n_outputs;
if (!res) {
LLAMA_LOG_ERROR("%s: failed to build worst-case graph\n", __func__);
return nullptr;
}
ggml_backend_sched_reset(sched.get());
// initialize scheduler with the specified graph
if (!ggml_backend_sched_reserve(sched.get(), gf)) {
LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
@@ -1331,28 +1329,27 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
return gf;
}
llm_graph_result_ptr llama_context::graph_build(
ggml_context * ctx,
ggml_cgraph * gf,
const llama_ubatch & ubatch,
llm_graph_type gtype,
const llama_memory_context_i * mctx) {
return model.build_graph(
{
/*.ctx =*/ ctx,
/*.arch =*/ model.arch,
/*.hparams =*/ model.hparams,
/*.cparams =*/ cparams,
/*.ubatch =*/ ubatch,
/*.sched =*/ sched.get(),
/*.backend_cpu =*/ backend_cpu,
/*.cvec =*/ &cvec,
/*.loras =*/ &loras,
/*.mctx =*/ mctx,
/*.cross =*/ &cross,
/*.n_outputs =*/ n_outputs,
/*.cb =*/ graph_get_cb(),
}, gf, gtype);
llm_graph_params llama_context::graph_params(
llm_graph_result_i * res,
const llama_ubatch & ubatch,
const llama_memory_context_i * mctx,
llm_graph_type gtype) const {
return {
/*.arch =*/ model.arch,
/*.hparams =*/ model.hparams,
/*.cparams =*/ cparams,
/*.ubatch =*/ ubatch,
/*.gtype =*/ gtype,
/*.sched =*/ sched.get(),
/*.backend_cpu =*/ backend_cpu,
/*.cvec =*/ &cvec,
/*.loras =*/ &loras,
/*.mctx =*/ mctx,
/*.cross =*/ &cross,
/*.n_outputs =*/ n_outputs,
/*.cb =*/ graph_get_cb(),
/*.res =*/ res,
};
}
ggml_status llama_context::graph_compute(
@@ -1930,6 +1927,7 @@ llama_perf_context_data llama_context::perf_get_data() const {
data.t_eval_ms = 1e-3 * t_eval_us;
data.n_p_eval = std::max(1, n_p_eval);
data.n_eval = std::max(1, n_eval);
data.n_reused = std::max(0, n_reused);
return data;
}
@@ -1938,6 +1936,7 @@ void llama_context::perf_reset() {
t_start_us = ggml_time_us();
t_eval_us = n_eval = 0;
t_p_eval_us = n_p_eval = 0;
n_reused = 0;
}
//
@@ -2064,8 +2063,13 @@ void llama_context::opt_epoch_iter(
break;
}
auto * gf = graph_init();
auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT, mctx.get());
auto * res = gf_res_prev.get();
const auto gparams = graph_params(res, ubatch, mctx.get(), LLM_GRAPH_TYPE_DEFAULT);
res->reset();
auto * gf = model.build_graph(gparams);
struct ggml_context * ctx_compute_opt;
{
@@ -2187,6 +2191,7 @@ llama_context_params llama_context_default_params() {
/*.no_perf =*/ true,
/*.op_offload =*/ true,
/*.swa_full =*/ true,
/*.graph_reuse =*/ false,
};
return result;
@@ -2807,6 +2812,7 @@ void llama_perf_context_print(const llama_context * ctx) {
LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
__func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
LLAMA_LOG_INFO("%s: graphs reused = %10d\n", __func__, data.n_reused);
}
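With reuse enabled, the perf summary printed above gains the new "graphs reused" line; illustrative output with a made-up count, following the %10d format string in the hunk:

llama_perf_context_print: graphs reused =         42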
void llama_perf_context_reset(llama_context * ctx) {

View File

@@ -35,8 +35,6 @@ struct llama_context {
ggml_backend_sched_t get_sched() const;
ggml_context * get_ctx_compute() const;
uint32_t n_ctx() const;
uint32_t n_ctx_per_seq() const;
uint32_t n_batch() const;
@@ -96,7 +94,7 @@ struct llama_context {
// if memory_context is provided, it will be applied first to the context's memory
// ret contains the status of the graph computation
// returns nullptr only if ret != GGML_STATUS_SUCCESS
llm_graph_result_ptr process_ubatch(
llm_graph_result_i * process_ubatch(
const llama_ubatch & ubatch,
llm_graph_type gtype,
llama_memory_context_i * mctx,
@@ -188,10 +186,7 @@ private:
//
public:
int32_t graph_max_nodes() const;
// zero-out inputs and create the ctx_compute for the compute graph
ggml_cgraph * graph_init();
uint32_t graph_max_nodes() const;
// returns the result of ggml_backend_sched_graph_compute_async execution
ggml_status graph_compute(ggml_cgraph * gf, bool batched);
@@ -200,12 +195,11 @@ public:
ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx);
private:
llm_graph_result_ptr graph_build(
ggml_context * ctx,
ggml_cgraph * gf,
const llama_ubatch & ubatch,
llm_graph_type gtype,
const llama_memory_context_i * mctx);
llm_graph_params graph_params(
llm_graph_result_i * res,
const llama_ubatch & ubatch,
const llama_memory_context_i * mctx,
llm_graph_type gtype) const;
llm_graph_cb graph_get_cb() const;
@@ -258,8 +252,6 @@ private:
ggml_backend_t backend_cpu = nullptr;
std::vector<ggml_backend_ptr> backends;
ggml_context_ptr ctx_compute;
// training
ggml_opt_context_t opt_ctx = nullptr;
@@ -275,8 +267,8 @@ private:
std::vector<ggml_backend_t> backend_ptrs;
std::vector<ggml_backend_buffer_type_t> backend_buft;
// memory buffers used to evaluate the model
std::vector<uint8_t> buf_compute_meta;
llm_graph_result_ptr gf_res_prev;
llm_graph_result_ptr gf_res_reserve;
// host buffer for the model output (logits and embeddings)
ggml_backend_buffer_ptr buf_output;
@@ -294,4 +286,6 @@ private:
mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
mutable int32_t n_eval = 0; // number of eval calls
mutable int32_t n_reused = 0; // number of times the previous graph was reused
};

View File

@@ -3,3 +3,32 @@
size_t llama_max_parallel_sequences(void) {
return LLAMA_MAX_SEQ;
}
bool llama_cparams::is_same(const llama_cparams & other) const {
return
n_ctx == other.n_ctx &&
n_batch == other.n_batch &&
n_ubatch == other.n_ubatch &&
n_seq_max == other.n_seq_max &&
n_threads == other.n_threads &&
n_threads_batch == other.n_threads_batch &&
rope_freq_base == other.rope_freq_base &&
rope_freq_scale == other.rope_freq_scale &&
n_ctx_orig_yarn == other.n_ctx_orig_yarn &&
yarn_ext_factor == other.yarn_ext_factor &&
yarn_attn_factor == other.yarn_attn_factor &&
yarn_beta_fast == other.yarn_beta_fast &&
yarn_beta_slow == other.yarn_beta_slow &&
defrag_thold == other.defrag_thold &&
embeddings == other.embeddings &&
causal_attn == other.causal_attn &&
offload_kqv == other.offload_kqv &&
flash_attn == other.flash_attn &&
no_perf == other.no_perf &&
warmup == other.warmup &&
op_offload == other.op_offload &&
graph_reuse == other.graph_reuse &&
pooling_type == other.pooling_type &&
cb_eval == other.cb_eval &&
cb_eval_user_data == other.cb_eval_user_data;
}

View File

@@ -33,9 +33,12 @@ struct llama_cparams {
bool no_perf;
bool warmup;
bool op_offload;
bool graph_reuse;
enum llama_pooling_type pooling_type;
ggml_backend_sched_eval_callback cb_eval;
void * cb_eval_user_data;
bool is_same(const llama_cparams & other) const;
};

View File

@@ -28,6 +28,15 @@ void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
}
}
bool llm_graph_input_embd::update(const llm_graph_params & params) {
bool res = true;
res &= (!tokens && !params.ubatch.token) || (tokens && tokens->ne[0] == params.ubatch.n_tokens);
res &= (!embd && !params.ubatch.embd) || (embd && embd->ne[0] == params.ubatch.n_tokens);
return res;
}
void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
if (ubatch->pos && pos) {
const int64_t n_tokens = ubatch->n_tokens;
@@ -50,6 +59,14 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
}
}
bool llm_graph_input_pos::update(const llm_graph_params & params) {
bool res = true;
res &= pos->ne[0] == params.ubatch.n_tokens;
return res;
}
void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
if (ubatch->pos && attn_scale) {
const int64_t n_tokens = ubatch->n_tokens;
@@ -118,6 +135,14 @@ void llm_graph_input_out_ids::set_input(const llama_ubatch * ubatch) {
}
}
bool llm_graph_input_out_ids::update(const llm_graph_params & params) {
bool res = true;
res &= n_outputs == params.n_outputs;
return res;
}
void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) {
if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
const int64_t n_tokens = ubatch->n_tokens;
@@ -287,6 +312,24 @@ void llm_graph_input_attn_kv_unified::set_input(const llama_ubatch * ubatch) {
mctx->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
}
bool llm_graph_input_attn_kv_unified::update(const llm_graph_params & params) {
const auto * mctx = static_cast<const llama_kv_cache_unified_context *>(params.mctx);
this->mctx = mctx;
bool res = true;
res &= self_k_idxs->ne[0] == params.ubatch.n_tokens;
res &= self_v_idxs->ne[0] == params.ubatch.n_tokens;
res &= self_kq_mask->ne[0] == mctx->get_n_kv();
res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
res &= mctx->get_supports_set_rows(); // TODO: tmp
return res;
}
void llm_graph_input_attn_kv_unified_iswa::set_input(const llama_ubatch * ubatch) {
mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch);
mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch);
@@ -299,6 +342,30 @@ void llm_graph_input_attn_kv_unified_iswa::set_input(const llama_ubatch * ubatch
mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
}
bool llm_graph_input_attn_kv_unified_iswa::update(const llm_graph_params & params) {
const auto * mctx = static_cast<const llama_kv_cache_unified_iswa_context *>(params.mctx);
this->mctx = mctx;
bool res = true;
res &= self_k_idxs->ne[0] == params.ubatch.n_tokens;
res &= self_v_idxs->ne[0] == params.ubatch.n_tokens;
res &= self_k_idxs_swa->ne[0] == params.ubatch.n_tokens;
res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens;
res &= self_kq_mask->ne[0] == mctx->get_base()->get_n_kv();
res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
res &= self_kq_mask_swa->ne[0] == mctx->get_swa()->get_n_kv();
res &= self_kq_mask_swa->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
res &= mctx->get_base()->get_supports_set_rows(); // TODO: tmp
return res;
}
void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
GGML_ASSERT(cross_kq_mask);
@@ -395,7 +462,6 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
n_ctx_orig (cparams.n_ctx_orig_yarn),
pooling_type (cparams.pooling_type),
rope_type (hparams.rope_type),
ctx0 (params.ctx),
sched (params.sched),
backend_cpu (params.backend_cpu),
cvec (params.cvec),
@@ -403,7 +469,8 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
mctx (params.mctx),
cross (params.cross),
cb_func (params.cb),
res (std::make_unique<llm_graph_result>()) {
res (static_cast<llm_graph_result *>(params.res)),
ctx0 (res->get_ctx()) {
}
void llm_graph_context::cb(ggml_tensor * cur, const char * name, int il) const {

View File

@@ -1,6 +1,7 @@
#pragma once
#include "llama-arch.h"
#include "llama-batch.h"
#include "llama-hparams.h"
#include "llama-adapter.h"
@@ -14,7 +15,6 @@ struct ggml_cgraph;
struct ggml_context;
struct ggml_tensor;
struct llama_ubatch;
struct llama_cparams;
struct llama_memory_context_i;
@@ -69,6 +69,8 @@ struct llama_cross {
std::vector<std::set<llama_seq_id>> seq_ids_enc;
};
struct llm_graph_params;
//
// llm_graph_input
//
@@ -78,11 +80,19 @@ public:
virtual ~llm_graph_input_i() = default;
virtual void set_input(const llama_ubatch * ubatch) = 0;
// return true if the resulting input tensors using the provided graph parameters would be
// the same as the previous input tensors that we have currently stored in the object
virtual bool update(const llm_graph_params & params) {
// returning false here by default will prevent reusing the graph if the check
// for the input type has not been implemented yet
GGML_UNUSED(params);
return false;
}
};
using llm_graph_input_ptr = std::unique_ptr<llm_graph_input_i>;
class llm_graph_input_embd : public llm_graph_input_i {
public:
llm_graph_input_embd() = default;
@@ -90,6 +100,8 @@ public:
void set_input(const llama_ubatch * ubatch) override;
bool update(const llm_graph_params & params) override;
ggml_tensor * tokens = nullptr; // I32 [n_batch]
ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch]
};
@@ -101,6 +113,8 @@ public:
void set_input(const llama_ubatch * ubatch) override;
bool update(const llm_graph_params & params) override;
ggml_tensor * pos = nullptr; // I32 [n_batch]
const uint32_t n_pos_per_embd = 1;
@@ -154,17 +168,19 @@ public:
llm_graph_input_out_ids(
const llama_hparams & hparams,
const llama_cparams & cparams,
int32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {}
uint32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {}
virtual ~llm_graph_input_out_ids() = default;
void set_input(const llama_ubatch * ubatch) override;
bool update(const llm_graph_params & params) override;
ggml_tensor * out_ids; // I32 [n_outputs]
const llama_hparams & hparams;
const llama_cparams & cparams;
const int32_t n_outputs;
const uint32_t n_outputs;
};
class llm_graph_input_mean : public llm_graph_input_i {
@@ -249,6 +265,8 @@ public:
void set_input(const llama_ubatch * ubatch) override;
bool update(const llm_graph_params & params) override;
ggml_tensor * get_k_idxs() const { return self_k_idxs; }
ggml_tensor * get_v_idxs() const { return self_v_idxs; }
@@ -280,6 +298,8 @@ public:
void set_input(const llama_ubatch * ubatch) override;
bool update(const llm_graph_params & params) override;
ggml_tensor * get_k_idxs() const { return self_k_idxs; }
ggml_tensor * get_v_idxs() const { return self_v_idxs; }
ggml_tensor * get_k_idxs_swa() const { return self_k_idxs_swa; }
@@ -373,29 +393,110 @@ public:
// along with the input tensors, the object also provides commonly used output tensors, such as logits, embeddings, etc.
// these are used by the llama_context to extract the relevant data, based on the compute parameters
// TODO: this interface seems redundant - remove it
class llm_graph_result_i {
public:
virtual ~llm_graph_result_i() = default;
virtual ggml_tensor * get_tokens() = 0;
virtual ggml_tensor * get_logits() = 0;
virtual ggml_tensor * get_embd() = 0;
virtual ggml_tensor * get_embd_pooled() = 0;
virtual ggml_tensor * get_tokens() const = 0;
virtual ggml_tensor * get_logits() const = 0;
virtual ggml_tensor * get_embd() const = 0;
virtual ggml_tensor * get_embd_pooled() const = 0;
virtual ggml_cgraph * get_gf() = 0;
virtual ggml_context * get_ctx() = 0;
virtual void reset() = 0;
virtual void set_inputs(const llama_ubatch * ubatch) = 0;
virtual bool update(const llm_graph_params & params) = 0;
};
using llm_graph_result_ptr = std::unique_ptr<llm_graph_result_i>;
// callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
using llm_graph_cb = std::function<void(const llama_ubatch & ubatch, ggml_tensor * cur, const char * name, int il)>;
struct llm_graph_params {
llm_arch arch = LLM_ARCH_UNKNOWN;
llama_hparams hparams;
llama_cparams cparams;
llama_ubatch ubatch; // note: intentionally make a copy
llm_graph_type gtype;
ggml_backend_sched_t sched;
ggml_backend_t backend_cpu;
const llama_adapter_cvec * cvec;
const llama_adapter_loras * loras;
const llama_memory_context_i * mctx;
const llama_cross * cross;
uint32_t n_outputs;
llm_graph_cb cb;
// TODO: temporary
llm_graph_result_i * res;
bool is_same(const llm_graph_params & other) const {
return
hparams.is_same(other.hparams) &&
cparams.is_same(other.cparams) &&
ubatch .is_same(other.ubatch) &&
arch == other.arch &&
gtype == other.gtype &&
cvec == other.cvec &&
loras == other.loras &&
cross == other.cross &&
n_outputs == other.n_outputs;
}
};
class llm_graph_result : public llm_graph_result_i {
public:
llm_graph_result(int64_t max_nodes) : max_nodes(max_nodes) {
reset();
}
virtual ~llm_graph_result() = default;
ggml_tensor * get_tokens() override { return t_tokens; }
ggml_tensor * get_logits() override { return t_logits; }
ggml_tensor * get_embd() override { return t_embd; }
ggml_tensor * get_embd_pooled() override { return t_embd_pooled; }
ggml_tensor * get_tokens() const override { return t_tokens; }
ggml_tensor * get_logits() const override { return t_logits; }
ggml_tensor * get_embd() const override { return t_embd; }
ggml_tensor * get_embd_pooled() const override { return t_embd_pooled; }
ggml_cgraph * get_gf() override { return gf; }
ggml_context * get_ctx() override { return ctx_compute.get(); }
void set_max_nodes(int64_t max_nodes) {
this->max_nodes = max_nodes;
}
void reset() override {
t_tokens = nullptr;
t_logits = nullptr;
t_embd = nullptr;
t_embd_pooled = nullptr;
inputs.clear();
buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
ggml_init_params params = {
/*.mem_size =*/ buf_compute_meta.size(),
/*.mem_buffer =*/ buf_compute_meta.data(),
/*.no_alloc =*/ true,
};
ctx_compute.reset(ggml_init(params));
gf = ggml_new_graph_custom(ctx_compute.get(), max_nodes, false);
}
void set_inputs(const llama_ubatch * ubatch) override {
for (auto & input : inputs) {
@@ -403,6 +504,25 @@ public:
}
}
// try to update the existing graph result using the new graph parameters
// this can only be done if we determine that the resulting graph using the new graph parameters
// would be identical to the existing graph. in that case, we simply have to update the memory
// contexts of the input tensors of the graph and we can reuse it for another computation
// return true if the graph was updated and can be reused
bool update(const llm_graph_params & params) override {
if (!this->params.is_same(params)) {
return false;
}
bool res = true;
for (auto & input : inputs) {
res &= input->update(params);
}
return res;
}
llm_graph_input_i * add_input(llm_graph_input_ptr input) {
inputs.emplace_back(std::move(input));
return inputs.back().get();
@@ -415,37 +535,26 @@ public:
ggml_tensor * t_embd_pooled = nullptr;
std::vector<llm_graph_input_ptr> inputs;
ggml_context_ptr ctx_compute;
// memory buffers used to evaluate the model
std::vector<uint8_t> buf_compute_meta;
ggml_cgraph * gf;
int64_t max_nodes;
// keep a copy of the previous graph parameters
// we will use this to determine whether the graph can be reused by comparing them with the new parameters
// note: these are updated after constructing the new graph
llm_graph_params params;
};
//
// llm_graph_context
//
// callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
using llm_graph_cb = std::function<void(const llama_ubatch & ubatch, ggml_tensor * cur, const char * name, int il)>;
struct llm_graph_params {
ggml_context * ctx;
const llm_arch arch;
const llama_hparams & hparams;
const llama_cparams & cparams;
const llama_ubatch & ubatch;
ggml_backend_sched_t sched;
ggml_backend_t backend_cpu;
const llama_adapter_cvec * cvec;
const llama_adapter_loras * loras;
const llama_memory_context_i * mctx;
const llama_cross * cross;
uint32_t n_outputs;
const llm_graph_cb & cb;
};
// used in build_rs to properly order writes and avoid unnecessary copies
using llm_graph_get_rows_fn = std::function<ggml_tensor * (ggml_context *, ggml_tensor * states, ggml_tensor * ids)>;
@@ -485,8 +594,6 @@ struct llm_graph_context {
const enum llama_pooling_type pooling_type;
const enum llama_rope_type rope_type;
ggml_context * ctx0 = nullptr;
ggml_backend_sched_t sched;
ggml_backend_t backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?
@@ -498,7 +605,9 @@ struct llm_graph_context {
const llm_graph_cb & cb_func;
std::unique_ptr<llm_graph_result> res;
llm_graph_result * res;
ggml_context * ctx0 = nullptr;
llm_graph_context(const llm_graph_params & params);
virtual ~llm_graph_context() = default;

View File

@@ -102,3 +102,12 @@ bool llama_hparams::is_swa(uint32_t il) const {
GGML_ABORT("fatal error");
}
bool llama_hparams::is_same(const llama_hparams & other) const {
return
n_ctx_train == other.n_ctx_train &&
n_embd == other.n_embd &&
n_layer == other.n_layer &&
n_expert == other.n_expert &&
n_expert_used == other.n_expert_used;
}

View File

@@ -202,6 +202,8 @@ struct llama_hparams {
uint32_t n_pos_per_embd() const;
bool is_swa(uint32_t il) const;
bool is_same(const llama_hparams & other) const;
};
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");

View File

@@ -68,6 +68,8 @@ llama_kv_cache_unified::llama_kv_cache_unified(
cells.resize(kv_size);
gf_res.reset(new llm_graph_result(32768)); // note: the max nodes will be updated later
for (uint32_t il = 0; il < n_layer_cache; il++) {
if (filter && !filter(il)) {
LLAMA_LOG_DEBUG("%s: layer %3d: skipped\n", __func__, il);
@@ -158,7 +160,7 @@ llama_kv_cache_unified::llama_kv_cache_unified(
debug = LLAMA_KV_CACHE_DEBUG ? atoi(LLAMA_KV_CACHE_DEBUG) : 0;
const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) : 0;
supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) != 0 : false;
if (!supports_set_rows) {
LLAMA_LOG_WARN("%s: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility\n", __func__);
@@ -480,14 +482,12 @@ bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const d
if (hparams.rope_type != LLAMA_ROPE_TYPE_NONE) {
ggml_backend_sched_reset(sched);
auto * gf = lctx->graph_init();
auto * res = gf_res.get();
auto res = build_graph_shift(lctx->get_cparams(), lctx->get_ctx_compute(), gf);
if (!res) {
LLAMA_LOG_ERROR("%s: failed to build graph for K-shift\n", __func__);
return updated;
}
res->set_max_nodes(lctx->graph_max_nodes());
res->reset();
auto * gf = build_graph_shift(res, lctx);
if (!ggml_backend_sched_alloc_graph(sched, gf)) {
LLAMA_LOG_ERROR("%s: failed to allocate compute graph for K-shift\n", __func__);
return updated;
@@ -529,14 +529,12 @@ bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const d
ggml_backend_sched_reset(sched);
auto * gf = lctx->graph_init();
auto * res = gf_res.get();
auto res = build_graph_defrag(lctx->get_cparams(), lctx->get_ctx_compute(), gf, dinfo);
if (!res) {
LLAMA_LOG_ERROR("%s: failed to build graph for defrag\n", __func__);
return updated;
}
res->set_max_nodes(lctx->graph_max_nodes());
res->reset();
auto * gf = build_graph_defrag(res, lctx, dinfo);
if (!ggml_backend_sched_alloc_graph(sched, gf)) {
LLAMA_LOG_ERROR("%s: failed to allocate compute graph for defrag\n", __func__);
return updated;
@@ -780,6 +778,10 @@ uint32_t llama_kv_cache_unified::get_n_kv() const {
return std::min(cells.size(), std::max(n_pad, GGML_PAD(cells.used_max_p1(), n_pad)));
}
bool llama_kv_cache_unified::get_supports_set_rows() const {
return supports_set_rows;
}
ggml_tensor * llama_kv_cache_unified::get_k(ggml_context * ctx, int32_t il, uint32_t n_kv) const {
const int32_t ikv = map_layer_ids.at(il);
@@ -1142,11 +1144,9 @@ void llm_graph_input_k_shift::set_input(const llama_ubatch * ubatch) {
}
}
llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
const llama_cparams & cparams,
ggml_context * ctx,
ggml_cgraph * gf) const {
auto res = std::make_unique<llm_graph_result>();
ggml_cgraph * llama_kv_cache_unified::build_graph_shift(llm_graph_result * res, llama_context * lctx) const {
auto * ctx = res->get_ctx();
auto * gf = res->get_gf();
const auto & n_embd_head_k = hparams.n_embd_head_k;
//const auto & n_embd_head_v = hparams.n_embd_head_v;
@@ -1156,6 +1156,8 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, cells.size());
ggml_set_input(inp->k_shift);
const auto & cparams = lctx->get_cparams();
for (const auto & layer : layers) {
const uint32_t il = layer.il;
@@ -1181,18 +1183,20 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
res->add_input(std::move(inp));
return res;
return gf;
}
llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
const llama_cparams & cparams,
ggml_context * ctx,
ggml_cgraph * gf,
const defrag_info & dinfo) const {
auto res = std::make_unique<llm_graph_result>();
ggml_cgraph * llama_kv_cache_unified::build_graph_defrag(
llm_graph_result * res,
llama_context * lctx,
const defrag_info & dinfo) const {
auto * ctx = res->get_ctx();
auto * gf = res->get_gf();
const auto & ids = dinfo.ids;
const auto & cparams = lctx->get_cparams();
#if 0
// CPU defrag
//
@@ -1329,7 +1333,7 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
//LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
#endif
return res;
return gf;
}
llama_kv_cache_unified::defrag_info llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) const {
@@ -1940,6 +1944,10 @@ uint32_t llama_kv_cache_unified_context::get_n_kv() const {
return n_kv;
}
bool llama_kv_cache_unified_context::get_supports_set_rows() const {
return kv->get_supports_set_rows();
}
ggml_tensor * llama_kv_cache_unified_context::get_k(ggml_context * ctx, int32_t il) const {
return kv->get_k(ctx, il, n_kv);
}

View File

@@ -121,6 +121,9 @@ public:
uint32_t get_n_kv() const;
// TODO: temporary
bool get_supports_set_rows() const;
// get views of the current state of the cache
ggml_tensor * get_k(ggml_context * ctx, int32_t il, uint32_t n_kv) const;
ggml_tensor * get_v(ggml_context * ctx, int32_t il, uint32_t n_kv) const;
@@ -193,13 +196,15 @@ private:
// env: LLAMA_SET_ROWS (temporary)
// ref: https://github.com/ggml-org/llama.cpp/pull/14285
int supports_set_rows = false;
bool supports_set_rows = false;
const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
std::vector<ggml_context_ptr> ctxs;
std::vector<ggml_backend_buffer_ptr> bufs;
std::unique_ptr<llm_graph_result> gf_res;
llama_kv_cells_unified cells;
std::vector<kv_layer> layers;
@@ -226,15 +231,13 @@ private:
float freq_base,
float freq_scale) const;
llm_graph_result_ptr build_graph_shift(
const llama_cparams & cparams,
ggml_context * ctx,
ggml_cgraph * gf) const;
ggml_cgraph * build_graph_shift(
llm_graph_result * res,
llama_context * lctx) const;
llm_graph_result_ptr build_graph_defrag(
const llama_cparams & cparams,
ggml_context * ctx,
ggml_cgraph * gf,
ggml_cgraph * build_graph_defrag(
llm_graph_result * res,
llama_context * lctx,
const defrag_info & dinfo) const;
void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
@@ -288,6 +291,9 @@ public:
uint32_t get_n_kv() const;
// TODO: temporary
bool get_supports_set_rows() const;
// get views of the current state of the cache
ggml_tensor * get_k(ggml_context * ctx, int32_t il) const;
ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;

View File

@@ -14751,10 +14751,10 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
return res;
}
llm_graph_result_ptr llama_model::build_graph(
const llm_graph_params & params,
ggml_cgraph * gf,
llm_graph_type type) const {
ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
// TODO: temporary - will refactor this to keep the "gf" instance in the llm_graph_context and avoid passing it everywhere
auto * gf = params.res->get_gf();
std::unique_ptr<llm_graph_context> llm;
switch (arch) {
@@ -14961,7 +14961,7 @@ llm_graph_result_ptr llama_model::build_graph(
} break;
case LLM_ARCH_T5:
{
switch (type) {
switch (params.gtype) {
case LLM_GRAPH_TYPE_ENCODER:
llm = std::make_unique<llm_build_t5_enc>(*this, params, gf);
break;
@@ -15047,7 +15047,10 @@ llm_graph_result_ptr llama_model::build_graph(
// add on pooling layer
llm->build_pooling(gf, cls, cls_b, cls_out, cls_out_b);
return std::move(llm->res);
// TODO: updating the graph parameters here is a little bit obscure - figure out something better
llm->res->params = params;
return llm->res->get_gf();
}
//

View File

@@ -436,10 +436,7 @@ struct llama_model {
llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams) const;
// TODO: move this to new llm_arch_model_i interface
llm_graph_result_ptr build_graph(
const llm_graph_params & params,
ggml_cgraph * gf,
llm_graph_type type) const;
ggml_cgraph * build_graph(const llm_graph_params & params) const;
private:
struct impl;

View File

@@ -261,6 +261,7 @@ struct cmd_params {
std::vector<bool> use_mmap;
std::vector<bool> embeddings;
std::vector<bool> no_op_offload;
std::vector<bool> graph_reuse;
ggml_numa_strategy numa;
int reps;
ggml_sched_priority prio;
@@ -298,6 +299,7 @@ static const cmd_params cmd_params_defaults = {
/* use_mmap */ { true },
/* embeddings */ { false },
/* no_op_offload */ { false },
/* graph_reuse */ { false },
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
/* reps */ 5,
/* prio */ GGML_SCHED_PRIO_NORMAL,
@@ -377,6 +379,7 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -ot --override-tensors <tensor name pattern>=<buffer type>;...\n");
printf(" (default: disabled)\n");
printf(" -nopo, --no-op-offload <0|1> (default: 0)\n");
printf(" -gr, --graph-reuse <0|1> (default: 0)\n");
printf("\n");
printf(
"Multiple values can be given for each parameter by separating them with ','\n"
@@ -620,6 +623,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
}
auto p = string_split<bool>(argv[i], split_delim);
params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
} else if (arg == "-gr" || arg == "--graph-reuse") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split<bool>(argv[i], split_delim);
params.graph_reuse.insert(params.graph_reuse.end(), p.begin(), p.end());
} else if (arg == "--numa") {
if (++i >= argc) {
invalid_param = true;
@@ -885,6 +895,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
if (params.no_op_offload.empty()) {
params.no_op_offload = cmd_params_defaults.no_op_offload;
}
if (params.graph_reuse.empty()) {
params.graph_reuse = cmd_params_defaults.graph_reuse;
}
if (params.n_threads.empty()) {
params.n_threads = cmd_params_defaults.n_threads;
}
@@ -926,6 +939,7 @@ struct cmd_params_instance {
bool use_mmap;
bool embeddings;
bool no_op_offload;
bool graph_reuse;
llama_model_params to_llama_mparams() const {
llama_model_params mparams = llama_model_default_params();
@@ -998,6 +1012,7 @@ struct cmd_params_instance {
cparams.embeddings = embeddings;
cparams.op_offload = !no_op_offload;
cparams.swa_full = false;
cparams.graph_reuse = graph_reuse;
return cparams;
}
@@ -1018,6 +1033,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
for (const auto & mmp : params.use_mmap)
for (const auto & embd : params.embeddings)
for (const auto & nopo : params.no_op_offload)
for (const auto & gr : params.graph_reuse)
for (const auto & nb : params.n_batch)
for (const auto & nub : params.n_ubatch)
for (const auto & tk : params.type_k)
@@ -1059,6 +1075,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .use_mmap = */ mmp,
/* .embeddings = */ embd,
/* .no_op_offload= */ nopo,
/* .graph_reuse = */ gr,
};
instances.push_back(instance);
}
@@ -1092,6 +1109,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .use_mmap = */ mmp,
/* .embeddings = */ embd,
/* .no_op_offload= */ nopo,
/* .graph_reuse = */ gr,
};
instances.push_back(instance);
}
@@ -1125,6 +1143,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .use_mmap = */ mmp,
/* .embeddings = */ embd,
/* .no_op_offload= */ nopo,
/* .graph_reuse = */ gr,
};
instances.push_back(instance);
}
@@ -1162,6 +1181,7 @@ struct test {
bool use_mmap;
bool embeddings;
bool no_op_offload;
bool graph_reuse;
int n_prompt;
int n_gen;
int n_depth;
@@ -1197,6 +1217,7 @@ struct test {
use_mmap = inst.use_mmap;
embeddings = inst.embeddings;
no_op_offload = inst.no_op_offload;
graph_reuse = inst.graph_reuse;
n_prompt = inst.n_prompt;
n_gen = inst.n_gen;
n_depth = inst.n_depth;
@@ -1243,8 +1264,8 @@ struct test {
"cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers",
"split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides",
"defrag_thold",
"use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", "n_depth", "test_time",
"avg_ns", "stddev_ns", "avg_ts", "stddev_ts",
"use_mmap", "embeddings", "no_op_offload", "graph_reuse", "n_prompt", "n_gen", "n_depth",
"test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts",
};
return fields;
}
@@ -1259,7 +1280,7 @@ struct test {
return INT;
}
if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" ||
field == "use_mmap" || field == "embeddings") {
field == "use_mmap" || field == "embeddings" || field == "graph_reuse") {
return BOOL;
}
if (field == "avg_ts" || field == "stddev_ts" || field == "defrag_thold") {
@@ -1333,6 +1354,7 @@ struct test {
std::to_string(use_mmap),
std::to_string(embeddings),
std::to_string(no_op_offload),
std::to_string(graph_reuse),
std::to_string(n_prompt),
std::to_string(n_gen),
std::to_string(n_depth),
@@ -1518,6 +1540,9 @@ struct markdown_printer : public printer {
if (field == "no_op_offload") {
return 4;
}
if (field == "graph_reuse") {
return 4;
}
int width = std::max((int) field.length(), 10);
@@ -1552,6 +1577,9 @@ struct markdown_printer : public printer {
if (field == "no_op_offload") {
return "nopo";
}
if (field == "graph_reuse") {
return "gr";
}
if (field == "tensor_split") {
return "ts";
}
@@ -1626,6 +1654,9 @@ struct markdown_printer : public printer {
if (params.no_op_offload.size() > 1 || params.no_op_offload != cmd_params_defaults.no_op_offload) {
fields.emplace_back("no_op_offload");
}
if (params.graph_reuse.size() > 1 || params.graph_reuse != cmd_params_defaults.graph_reuse) {
fields.emplace_back("graph_reuse");
}
fields.emplace_back("test");
fields.emplace_back("t/s");