mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-04-16 16:27:32 +03:00
Compare commits
3 Commits
b8733
...
gg/metal-r
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
bf8b39015f | ||
|
|
0d2038f90a | ||
|
|
76681e3c73 |
@@ -1464,6 +1464,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.swa_full = true;
|
||||
}
|
||||
).set_env("LLAMA_ARG_SWA_FULL"));
|
||||
add_opt(common_arg(
|
||||
{"--graph-reuse", "-gr"},
|
||||
string_format("reuse previous compute graphs when possible (default: %s)"
|
||||
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/14482)", params.graph_reuse ? "true" : "false"),
|
||||
[](common_params & params) {
|
||||
params.graph_reuse = true;
|
||||
}
|
||||
).set_env("LLAMA_ARG_GRAPH_REUSE"));
|
||||
add_opt(common_arg(
|
||||
{"--no-context-shift"},
|
||||
string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
|
||||
|
||||
@@ -1157,6 +1157,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
|
||||
cparams.no_perf = params.no_perf;
|
||||
cparams.op_offload = !params.no_op_offload;
|
||||
cparams.swa_full = params.swa_full;
|
||||
cparams.graph_reuse = params.graph_reuse;
|
||||
|
||||
cparams.type_k = params.cache_type_k;
|
||||
cparams.type_v = params.cache_type_v;
|
||||
|
||||
@@ -330,6 +330,7 @@ struct common_params {
|
||||
bool no_perf = false; // disable performance metrics
|
||||
bool ctx_shift = true; // context shift on inifinite text generation
|
||||
bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
|
||||
bool graph_reuse = false; // reuse previous compute graphs when possible
|
||||
|
||||
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
|
||||
bool use_mmap = true; // use mmap for faster loads
|
||||
|
||||
@@ -821,13 +821,23 @@ struct ggml_backend_metal_context {
|
||||
|
||||
// the callback given to the thread pool
|
||||
void (^encode_async)(size_t ith);
|
||||
void (^encode_next)(void);
|
||||
|
||||
// n_cb command buffers + 1 used by the main thread
|
||||
struct ggml_metal_command_buffer cmd_bufs[GGML_METAL_MAX_COMMAND_BUFFERS + 1];
|
||||
struct ggml_metal_command_buffer cmd_bufs_next[2];
|
||||
|
||||
// abort ggml_metal_graph_compute if callback returns true
|
||||
ggml_abort_callback abort_callback;
|
||||
void * abort_callback_data;
|
||||
|
||||
// reuse info
|
||||
int i_next;
|
||||
|
||||
int n_nodes_max;
|
||||
int n_nodes_prev;
|
||||
|
||||
struct ggml_tensor * cg_nodes;
|
||||
};
|
||||
|
||||
// MSL code
|
||||
@@ -1084,6 +1094,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
|
||||
ctx->gf = nil;
|
||||
ctx->encode_async = nil;
|
||||
ctx->encode_next = nil;
|
||||
for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) {
|
||||
ctx->cmd_bufs[i].obj = nil;
|
||||
|
||||
@@ -1091,6 +1102,13 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
ctx->cmd_bufs[i].mem_pool->device = device;
|
||||
}
|
||||
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
ctx->cmd_bufs_next[i].obj = nil;
|
||||
|
||||
ctx->cmd_bufs_next[i].mem_pool = ggml_metal_mem_pool_init();
|
||||
ctx->cmd_bufs_next[i].mem_pool->device = device;
|
||||
}
|
||||
|
||||
#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
|
||||
if (@available(macOS 10.12, iOS 16.0, *)) {
|
||||
GGML_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, device.recommendedMaxWorkingSetSize / 1e6);
|
||||
@@ -1521,6 +1539,13 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32, pool_2d_max_f32, true);
|
||||
}
|
||||
|
||||
ctx->i_next = 0;
|
||||
|
||||
ctx->n_nodes_max = 16384;
|
||||
ctx->n_nodes_prev = -1;
|
||||
|
||||
ctx->cg_nodes = ggml_aligned_malloc(ctx->n_nodes_max * sizeof(struct ggml_tensor));
|
||||
|
||||
return ctx;
|
||||
}
|
||||
|
||||
@@ -1532,6 +1557,7 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) {
|
||||
}
|
||||
|
||||
Block_release(ctx->encode_async);
|
||||
Block_release(ctx->encode_next);
|
||||
|
||||
[ctx->queue release];
|
||||
|
||||
@@ -1541,8 +1567,13 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) {
|
||||
ggml_metal_mem_pool_free(ctx->cmd_bufs[i].mem_pool);
|
||||
}
|
||||
|
||||
ggml_metal_mem_pool_free(ctx->cmd_bufs_next[0].mem_pool);
|
||||
ggml_metal_mem_pool_free(ctx->cmd_bufs_next[1].mem_pool);
|
||||
|
||||
dispatch_release(ctx->d_queue);
|
||||
|
||||
ggml_aligned_free(ctx->cg_nodes, ctx->n_nodes_max * sizeof(struct ggml_tensor));
|
||||
|
||||
free(ctx);
|
||||
}
|
||||
|
||||
@@ -5448,6 +5479,39 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
struct ggml_backend_metal_context * ctx = backend->context;
|
||||
struct ggml_backend_metal_device_context * ctx_dev = backend->device->context;
|
||||
|
||||
//const int64_t t_start = ggml_time_us();
|
||||
|
||||
/////////////////////////////////////////////////////
|
||||
// hacky way to determine that the graph is the same as the previous one
|
||||
//
|
||||
bool can_reuse = true;
|
||||
|
||||
if (gf->n_nodes > ctx->n_nodes_max) {
|
||||
can_reuse = false;
|
||||
}
|
||||
|
||||
if (gf->n_nodes != ctx->n_nodes_prev) {
|
||||
can_reuse = false;
|
||||
}
|
||||
|
||||
if (can_reuse) {
|
||||
for (int i = 0; i < gf->n_nodes; ++i) {
|
||||
if (memcmp(gf->nodes[i], ctx->cg_nodes + i, sizeof(struct ggml_tensor)) != 0) {
|
||||
can_reuse = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!can_reuse) {
|
||||
ctx->n_nodes_prev = gf->n_nodes;
|
||||
|
||||
for (int i = 0; i < gf->n_nodes; ++i) {
|
||||
memcpy(ctx->cg_nodes + i, gf->nodes[i], sizeof(struct ggml_tensor));
|
||||
}
|
||||
}
|
||||
//////////////////////////////////////////////////////
|
||||
|
||||
// number of nodes encoded by the main thread (empirically determined)
|
||||
const int n_main = 128;
|
||||
|
||||
@@ -5492,78 +5556,126 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
}
|
||||
}
|
||||
|
||||
// the main thread commits the first few commands immediately
|
||||
// cmd_buf[n_cb]
|
||||
{
|
||||
id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
|
||||
ctx->cmd_bufs[n_cb].obj = cmd_buf;
|
||||
if (!can_reuse) {
|
||||
// the main thread commits the first few commands immediately
|
||||
// cmd_buf[n_cb]
|
||||
{
|
||||
id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
|
||||
ctx->cmd_bufs[n_cb].obj = cmd_buf;
|
||||
|
||||
[cmd_buf enqueue];
|
||||
ctx->encode_async(n_cb);
|
||||
}
|
||||
|
||||
// prepare the rest of the command buffers asynchronously
|
||||
// cmd_buf[0.. n_cb)
|
||||
for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
|
||||
id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
|
||||
ctx->cmd_bufs[cb_idx].obj = cmd_buf;
|
||||
|
||||
// always enqueue the first two command buffers
|
||||
// enqueue all of the command buffers if we don't need to abort
|
||||
if (cb_idx < 2 || ctx->abort_callback == NULL) {
|
||||
[cmd_buf enqueue];
|
||||
ctx->encode_async(n_cb);
|
||||
}
|
||||
}
|
||||
|
||||
dispatch_apply(n_cb, ctx->d_queue, ctx->encode_async);
|
||||
// prepare the rest of the command buffers asynchronously
|
||||
// cmd_buf[0.. n_cb)
|
||||
for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
|
||||
id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
|
||||
ctx->cmd_bufs[cb_idx].obj = cmd_buf;
|
||||
|
||||
// wait for completion and check status of each command buffer
|
||||
// needed to detect if the device ran out-of-memory for example (#1881)
|
||||
{
|
||||
id<MTLCommandBuffer> cmd_buf = ctx->cmd_bufs[n_cb].obj;
|
||||
[cmd_buf waitUntilCompleted];
|
||||
// always enqueue the first two command buffers
|
||||
// enqueue all of the command buffers if we don't need to abort
|
||||
if (cb_idx < 2 || ctx->abort_callback == NULL) {
|
||||
[cmd_buf enqueue];
|
||||
}
|
||||
}
|
||||
|
||||
MTLCommandBufferStatus status = [cmd_buf status];
|
||||
if (status != MTLCommandBufferStatusCompleted) {
|
||||
GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, n_cb, status);
|
||||
if (status == MTLCommandBufferStatusError) {
|
||||
GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]);
|
||||
dispatch_apply(n_cb, ctx->d_queue, ctx->encode_async);
|
||||
|
||||
// encode the command buffer for the next iter while the GPU has already started
|
||||
{
|
||||
id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
|
||||
[cmd_buf retain];
|
||||
if (ctx->cmd_bufs_next[ctx->i_next].obj != nil) {
|
||||
[ctx->cmd_bufs_next[ctx->i_next].obj release];
|
||||
}
|
||||
ctx->cmd_bufs_next[ctx->i_next].obj = cmd_buf;
|
||||
|
||||
ctx->encode_next();
|
||||
}
|
||||
|
||||
// wait for completion and check status of each command buffer
|
||||
// needed to detect if the device ran out-of-memory for example (#1881)
|
||||
{
|
||||
id<MTLCommandBuffer> cmd_buf = ctx->cmd_bufs[n_cb].obj;
|
||||
[cmd_buf waitUntilCompleted];
|
||||
|
||||
MTLCommandBufferStatus status = [cmd_buf status];
|
||||
if (status != MTLCommandBufferStatusCompleted) {
|
||||
GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, n_cb, status);
|
||||
if (status == MTLCommandBufferStatusError) {
|
||||
GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]);
|
||||
}
|
||||
|
||||
return GGML_STATUS_FAILED;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < n_cb; ++i) {
|
||||
id<MTLCommandBuffer> cmd_buf = ctx->cmd_bufs[i].obj;
|
||||
[cmd_buf waitUntilCompleted];
|
||||
|
||||
MTLCommandBufferStatus status = [cmd_buf status];
|
||||
if (status != MTLCommandBufferStatusCompleted) {
|
||||
GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
|
||||
if (status == MTLCommandBufferStatusError) {
|
||||
GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]);
|
||||
}
|
||||
|
||||
return GGML_STATUS_FAILED;
|
||||
}
|
||||
|
||||
return GGML_STATUS_FAILED;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < n_cb; ++i) {
|
||||
id<MTLCommandBuffer> cmd_buf = ctx->cmd_bufs[i].obj;
|
||||
[cmd_buf waitUntilCompleted];
|
||||
|
||||
MTLCommandBufferStatus status = [cmd_buf status];
|
||||
if (status != MTLCommandBufferStatusCompleted) {
|
||||
GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
|
||||
if (status == MTLCommandBufferStatusError) {
|
||||
GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]);
|
||||
id<MTLCommandBuffer> next_buffer = (i + 1 < n_cb ? ctx->cmd_bufs[i + 1].obj : nil);
|
||||
if (!next_buffer) {
|
||||
continue;
|
||||
}
|
||||
|
||||
return GGML_STATUS_FAILED;
|
||||
const bool next_queued = ([next_buffer status] != MTLCommandBufferStatusNotEnqueued);
|
||||
if (next_queued) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) {
|
||||
GGML_LOG_INFO("%s: command buffer %d aborted", __func__, i);
|
||||
return GGML_STATUS_ABORTED;
|
||||
}
|
||||
|
||||
[next_buffer commit];
|
||||
}
|
||||
} else {
|
||||
struct ggml_metal_command_buffer cmd_buf_cur = ctx->cmd_bufs_next[(ctx->i_next + 1)%2];
|
||||
|
||||
// directly submit the command buffer that we have prepared in the previous iteration
|
||||
[ctx->cmd_bufs_next[(ctx->i_next + 1)%2].obj commit];
|
||||
|
||||
// encode the command buffer for the next iter
|
||||
{
|
||||
id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
|
||||
[cmd_buf retain];
|
||||
if (ctx->cmd_bufs_next[ctx->i_next].obj != nil) {
|
||||
[ctx->cmd_bufs_next[ctx->i_next].obj release];
|
||||
}
|
||||
ctx->cmd_bufs_next[ctx->i_next].obj = cmd_buf;
|
||||
|
||||
ctx->encode_next();
|
||||
}
|
||||
|
||||
id<MTLCommandBuffer> next_buffer = (i + 1 < n_cb ? ctx->cmd_bufs[i + 1].obj : nil);
|
||||
if (!next_buffer) {
|
||||
continue;
|
||||
}
|
||||
// wait for completion and check status of each command buffer
|
||||
// needed to detect if the device ran out-of-memory for example (#1881)
|
||||
{
|
||||
id<MTLCommandBuffer> cmd_buf = cmd_buf_cur.obj;
|
||||
[cmd_buf waitUntilCompleted];
|
||||
|
||||
const bool next_queued = ([next_buffer status] != MTLCommandBufferStatusNotEnqueued);
|
||||
if (next_queued) {
|
||||
continue;
|
||||
}
|
||||
MTLCommandBufferStatus status = [cmd_buf status];
|
||||
if (status != MTLCommandBufferStatusCompleted) {
|
||||
GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, ctx->i_next, status);
|
||||
if (status == MTLCommandBufferStatusError) {
|
||||
GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]);
|
||||
}
|
||||
|
||||
if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) {
|
||||
GGML_LOG_INFO("%s: command buffer %d aborted", __func__, i);
|
||||
return GGML_STATUS_ABORTED;
|
||||
return GGML_STATUS_FAILED;
|
||||
}
|
||||
}
|
||||
|
||||
[next_buffer commit];
|
||||
}
|
||||
|
||||
if (!should_capture && ctx->capture_started) {
|
||||
@@ -5572,6 +5684,8 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
}
|
||||
}
|
||||
|
||||
//printf(" time = %.3f ms\n", (float)(ggml_time_us() - t_start)/1000.0f);
|
||||
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
@@ -5919,6 +6033,10 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
|
||||
Block_release(ctx->encode_async);
|
||||
}
|
||||
|
||||
if (ctx->encode_next) {
|
||||
Block_release(ctx->encode_next);
|
||||
}
|
||||
|
||||
ctx->encode_async = Block_copy(^(size_t iter) {
|
||||
const int cb_idx = iter;
|
||||
const int n_cb_l = ctx->n_cb;
|
||||
@@ -5967,6 +6085,40 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
|
||||
[cmd_buf commit];
|
||||
}
|
||||
});
|
||||
|
||||
ctx->encode_next = Block_copy(^(void) {
|
||||
id<MTLCommandBuffer> cmd_buf = ctx->cmd_bufs_next[ctx->i_next].obj;
|
||||
|
||||
id<MTLComputeCommandEncoder> encoder = [cmd_buf computeCommandEncoder];
|
||||
|
||||
int node_start = 0;
|
||||
int node_end = ctx->gf->n_nodes;
|
||||
|
||||
const bool should_capture = ctx->capture_next_compute;
|
||||
|
||||
struct ggml_metal_mem_pool * mem_pool = ctx->cmd_bufs_next[ctx->i_next].mem_pool;
|
||||
ggml_metal_mem_pool_reset(mem_pool);
|
||||
|
||||
for (int idx = node_start; idx < node_end; ++idx) {
|
||||
if (should_capture) {
|
||||
[encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]];
|
||||
}
|
||||
|
||||
const bool res = ggml_metal_encode_node(backend, idx, encoder, mem_pool);
|
||||
|
||||
if (should_capture) {
|
||||
[encoder popDebugGroup];
|
||||
}
|
||||
|
||||
if (!res) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
[encoder endEncoding];
|
||||
|
||||
ctx->i_next = (ctx->i_next + 1) % 2;
|
||||
});
|
||||
}
|
||||
|
||||
static struct ggml_backend_i ggml_backend_metal_i = {
|
||||
|
||||
@@ -374,6 +374,8 @@ extern "C" {
|
||||
bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
|
||||
// NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
|
||||
|
||||
bool graph_reuse; // reuse previous compute graphs when possible
|
||||
};
|
||||
|
||||
// model quantization parameters
|
||||
@@ -1429,6 +1431,7 @@ extern "C" {
|
||||
|
||||
int32_t n_p_eval;
|
||||
int32_t n_eval;
|
||||
int32_t n_reused;
|
||||
};
|
||||
|
||||
struct llama_perf_sampler_data {
|
||||
|
||||
@@ -34,6 +34,31 @@ struct llama_ubatch {
|
||||
llama_seq_id * seq_id_unq; // [n_seqs_unq] | s | seq_id
|
||||
int32_t * seq_idx; // [LLAMA_MAX_SEQ] | - | seq_idx
|
||||
int8_t * output; // [n_tokens] | i | -
|
||||
|
||||
bool is_same(const llama_ubatch & other) const {
|
||||
bool res =
|
||||
equal_seqs == other.equal_seqs &&
|
||||
n_tokens == other.n_tokens &&
|
||||
n_seq_tokens == other.n_seq_tokens &&
|
||||
n_seqs == other.n_seqs &&
|
||||
n_seqs_unq == other.n_seqs_unq &&
|
||||
(
|
||||
(!token && !other.token) ||
|
||||
(!embd && !other.embd)
|
||||
);
|
||||
|
||||
if (!res) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// TODO: this won't work because seq_id_unq ptr can point to an old balloc that has
|
||||
// been freed by this point. find a way to fix this
|
||||
//for (uint32_t s = 0; s < n_seqs_unq; ++s) {
|
||||
// res &= seq_id_unq[s] == other.seq_id_unq[s];
|
||||
//}
|
||||
|
||||
return res;
|
||||
}
|
||||
};
|
||||
|
||||
// a helper for sanitizing, fulfilling and splitting a batch
|
||||
|
||||
@@ -101,7 +101,8 @@ llama_context::llama_context(
|
||||
|
||||
cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
|
||||
|
||||
cparams.op_offload = params.op_offload;
|
||||
cparams.op_offload = params.op_offload;
|
||||
cparams.graph_reuse = params.graph_reuse;
|
||||
|
||||
const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
|
||||
|
||||
@@ -227,8 +228,8 @@ llama_context::llama_context(
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: max_nodes = %zu\n", __func__, max_nodes);
|
||||
|
||||
// buffer used to store the computation graph and the tensor meta data
|
||||
buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
|
||||
gf_res_prev.reset(new llm_graph_result(max_nodes));
|
||||
gf_res_reserve.reset(new llm_graph_result(max_nodes));
|
||||
|
||||
// TODO: move these checks to ggml_backend_sched
|
||||
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
|
||||
@@ -388,10 +389,6 @@ ggml_backend_sched_t llama_context::get_sched() const {
|
||||
return sched.get();
|
||||
}
|
||||
|
||||
ggml_context * llama_context::get_ctx_compute() const {
|
||||
return ctx_compute.get();
|
||||
}
|
||||
|
||||
uint32_t llama_context::n_ctx() const {
|
||||
return cparams.n_ctx;
|
||||
}
|
||||
@@ -678,38 +675,52 @@ bool llama_context::apply_adapter_cvec(
|
||||
return cvec.apply(model, data, len, n_embd, il_start, il_end);
|
||||
}
|
||||
|
||||
llm_graph_result_ptr llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) {
|
||||
llm_graph_result_i * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) {
|
||||
if (mctx && !mctx->apply()) {
|
||||
LLAMA_LOG_ERROR("%s: failed to apply memory context\n", __func__);
|
||||
ret = GGML_STATUS_FAILED;
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
auto * gf = graph_init();
|
||||
if (!gf) {
|
||||
LLAMA_LOG_ERROR("%s: failed to initialize graph\n", __func__);
|
||||
ret = GGML_STATUS_FAILED;
|
||||
return nullptr;
|
||||
}
|
||||
auto * res = gf_res_prev.get();
|
||||
auto * gf = res->get_gf();
|
||||
|
||||
auto res = graph_build(ctx_compute.get(), gf, ubatch, gtype, mctx);
|
||||
if (!res) {
|
||||
LLAMA_LOG_ERROR("%s: failed to build graph\n", __func__);
|
||||
ret = GGML_STATUS_FAILED;
|
||||
return nullptr;
|
||||
}
|
||||
// the new graph parameters
|
||||
// in order to correctly reuse a graph, it's full topology has to be uniquely determined by these parameters
|
||||
const auto gparams = graph_params(res, ubatch, mctx, gtype);
|
||||
|
||||
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
|
||||
const bool can_reuse = cparams.graph_reuse && res->update(gparams);
|
||||
if (can_reuse) {
|
||||
LLAMA_LOG_DEBUG("%s: reusing previous graph\n", __func__);
|
||||
n_reused++;
|
||||
} else {
|
||||
res->reset();
|
||||
|
||||
if (!ggml_backend_sched_alloc_graph(sched.get(), gf)) {
|
||||
LLAMA_LOG_ERROR("%s: failed to allocate graph\n", __func__);
|
||||
ret = GGML_STATUS_ALLOC_FAILED;
|
||||
return nullptr;
|
||||
ggml_backend_sched_reset(sched.get());
|
||||
ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);
|
||||
|
||||
//const auto t_start_us = ggml_time_us();
|
||||
|
||||
gf = model.build_graph(gparams);
|
||||
|
||||
//LLAMA_LOG_INFO("graph build time: %.3f ms\n", (ggml_time_us() - t_start_us)/1000.0);
|
||||
|
||||
if (!gf) {
|
||||
LLAMA_LOG_ERROR("%s: failed to initialize graph\n", __func__);
|
||||
ret = GGML_STATUS_FAILED;
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
if (!ggml_backend_sched_alloc_graph(sched.get(), gf)) {
|
||||
LLAMA_LOG_ERROR("%s: failed to allocate graph\n", __func__);
|
||||
ret = GGML_STATUS_ALLOC_FAILED;
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
res->set_inputs(&ubatch);
|
||||
|
||||
const auto status = graph_compute(gf, ubatch.n_tokens > 1);
|
||||
const auto status = graph_compute(res->get_gf(), ubatch.n_tokens > 1);
|
||||
if (status != GGML_STATUS_SUCCESS) {
|
||||
LLAMA_LOG_ERROR("%s: failed to compute graph, compute status: %d\n", __func__, status);
|
||||
ret = status;
|
||||
@@ -767,9 +778,6 @@ int llama_context::encode(const llama_batch & batch_inp) {
|
||||
|
||||
n_outputs = n_tokens;
|
||||
|
||||
ggml_backend_sched_reset(sched.get());
|
||||
ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);
|
||||
|
||||
const auto causal_attn_org = cparams.causal_attn;
|
||||
|
||||
// always use non-causal attention for encoder graphs
|
||||
@@ -778,7 +786,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
|
||||
cparams.causal_attn = false;
|
||||
|
||||
ggml_status status;
|
||||
const auto res = process_ubatch(ubatch, LLM_GRAPH_TYPE_ENCODER, nullptr, status);
|
||||
const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_ENCODER, nullptr, status);
|
||||
|
||||
cparams.causal_attn = causal_attn_org;
|
||||
|
||||
@@ -846,7 +854,9 @@ int llama_context::encode(const llama_batch & batch_inp) {
|
||||
|
||||
// Reset state for the next token before backend sync, to allow the CPU activities in the reset to
|
||||
// overlap with device computation.
|
||||
ggml_backend_sched_reset(sched.get());
|
||||
if (!cparams.graph_reuse) {
|
||||
ggml_backend_sched_reset(sched.get());
|
||||
}
|
||||
|
||||
// TODO: hacky solution
|
||||
if (model.arch == LLM_ARCH_T5 && t_embd) {
|
||||
@@ -1005,11 +1015,8 @@ int llama_context::decode(const llama_batch & batch_inp) {
|
||||
n_outputs = n_outputs_new;
|
||||
}
|
||||
|
||||
ggml_backend_sched_reset(sched.get());
|
||||
ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);
|
||||
|
||||
ggml_status status;
|
||||
const auto res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status);
|
||||
const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status);
|
||||
|
||||
if (!res) {
|
||||
// the last ubatch failed or was aborted -> remove all positions of that ubatch from the KV cache
|
||||
@@ -1192,7 +1199,9 @@ int llama_context::decode(const llama_batch & batch_inp) {
|
||||
|
||||
// Reset state for the next token before backend sync, to allow the CPU activities in the reset to
|
||||
// overlap with device computation.
|
||||
ggml_backend_sched_reset(sched.get());
|
||||
if (!cparams.graph_reuse) {
|
||||
ggml_backend_sched_reset(sched.get());
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -1275,20 +1284,8 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||
// graph
|
||||
//
|
||||
|
||||
int32_t llama_context::graph_max_nodes() const {
|
||||
return std::max<int32_t>(65536, 5*model.n_tensors());
|
||||
}
|
||||
|
||||
ggml_cgraph * llama_context::graph_init() {
|
||||
ggml_init_params params = {
|
||||
/*.mem_size =*/ buf_compute_meta.size(),
|
||||
/*.mem_buffer =*/ buf_compute_meta.data(),
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
|
||||
ctx_compute.reset(ggml_init(params));
|
||||
|
||||
return ggml_new_graph_custom(ctx_compute.get(), graph_max_nodes(), false);
|
||||
uint32_t llama_context::graph_max_nodes() const {
|
||||
return std::max<uint32_t>(65536u, 5u*model.n_tensors());
|
||||
}
|
||||
|
||||
ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx) {
|
||||
@@ -1301,6 +1298,9 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
|
||||
LLAMA_LOG_DEBUG("%s: making n_tokens a multiple of n_seqs - n_tokens = %u, n_seqs = %u, n_outputs = %u\n", __func__, n_tokens, n_seqs, n_outputs);
|
||||
}
|
||||
|
||||
gf_res_prev->reset();
|
||||
ggml_backend_sched_reset(sched.get());
|
||||
|
||||
// store the n_outputs as it is, and restore it afterwards
|
||||
// TODO: not sure if needed, might simplify in the future by removing this
|
||||
const auto save_n_outputs = this->n_outputs;
|
||||
@@ -1310,18 +1310,16 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
|
||||
llama_batch_allocr balloc(model.hparams.n_pos_per_embd());
|
||||
llama_ubatch ubatch = balloc.ubatch_reserve(n_tokens/n_seqs, n_seqs);
|
||||
|
||||
auto * gf = graph_init();
|
||||
auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT, mctx);
|
||||
auto * res = gf_res_reserve.get();
|
||||
|
||||
const auto gparams = graph_params(res, ubatch, mctx, LLM_GRAPH_TYPE_DEFAULT);
|
||||
|
||||
res->reset();
|
||||
|
||||
auto * gf = model.build_graph(gparams);
|
||||
|
||||
this->n_outputs = save_n_outputs;
|
||||
|
||||
if (!res) {
|
||||
LLAMA_LOG_ERROR("%s: failed to build worst-case graph\n", __func__);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
ggml_backend_sched_reset(sched.get());
|
||||
|
||||
// initialize scheduler with the specified graph
|
||||
if (!ggml_backend_sched_reserve(sched.get(), gf)) {
|
||||
LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
|
||||
@@ -1331,28 +1329,27 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
|
||||
return gf;
|
||||
}
|
||||
|
||||
llm_graph_result_ptr llama_context::graph_build(
|
||||
ggml_context * ctx,
|
||||
ggml_cgraph * gf,
|
||||
const llama_ubatch & ubatch,
|
||||
llm_graph_type gtype,
|
||||
const llama_memory_context_i * mctx) {
|
||||
return model.build_graph(
|
||||
{
|
||||
/*.ctx =*/ ctx,
|
||||
/*.arch =*/ model.arch,
|
||||
/*.hparams =*/ model.hparams,
|
||||
/*.cparams =*/ cparams,
|
||||
/*.ubatch =*/ ubatch,
|
||||
/*.sched =*/ sched.get(),
|
||||
/*.backend_cpu =*/ backend_cpu,
|
||||
/*.cvec =*/ &cvec,
|
||||
/*.loras =*/ &loras,
|
||||
/*.mctx =*/ mctx,
|
||||
/*.cross =*/ &cross,
|
||||
/*.n_outputs =*/ n_outputs,
|
||||
/*.cb =*/ graph_get_cb(),
|
||||
}, gf, gtype);
|
||||
llm_graph_params llama_context::graph_params(
|
||||
llm_graph_result_i * res,
|
||||
const llama_ubatch & ubatch,
|
||||
const llama_memory_context_i * mctx,
|
||||
llm_graph_type gtype) const {
|
||||
return {
|
||||
/*.arch =*/ model.arch,
|
||||
/*.hparams =*/ model.hparams,
|
||||
/*.cparams =*/ cparams,
|
||||
/*.ubatch =*/ ubatch,
|
||||
/*.gtype =*/ gtype,
|
||||
/*.sched =*/ sched.get(),
|
||||
/*.backend_cpu =*/ backend_cpu,
|
||||
/*.cvec =*/ &cvec,
|
||||
/*.loras =*/ &loras,
|
||||
/*.mctx =*/ mctx,
|
||||
/*.cross =*/ &cross,
|
||||
/*.n_outputs =*/ n_outputs,
|
||||
/*.cb =*/ graph_get_cb(),
|
||||
/*.res =*/ res,
|
||||
};
|
||||
}
|
||||
|
||||
ggml_status llama_context::graph_compute(
|
||||
@@ -1930,6 +1927,7 @@ llama_perf_context_data llama_context::perf_get_data() const {
|
||||
data.t_eval_ms = 1e-3 * t_eval_us;
|
||||
data.n_p_eval = std::max(1, n_p_eval);
|
||||
data.n_eval = std::max(1, n_eval);
|
||||
data.n_reused = std::max(0, n_reused);
|
||||
|
||||
return data;
|
||||
}
|
||||
@@ -1938,6 +1936,7 @@ void llama_context::perf_reset() {
|
||||
t_start_us = ggml_time_us();
|
||||
t_eval_us = n_eval = 0;
|
||||
t_p_eval_us = n_p_eval = 0;
|
||||
n_reused = 0;
|
||||
}
|
||||
|
||||
//
|
||||
@@ -2064,8 +2063,13 @@ void llama_context::opt_epoch_iter(
|
||||
break;
|
||||
}
|
||||
|
||||
auto * gf = graph_init();
|
||||
auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT, mctx.get());
|
||||
auto * res = gf_res_prev.get();
|
||||
|
||||
const auto gparams = graph_params(res, ubatch, mctx.get(), LLM_GRAPH_TYPE_DEFAULT);
|
||||
|
||||
res->reset();
|
||||
|
||||
auto * gf = model.build_graph(gparams);
|
||||
|
||||
struct ggml_context * ctx_compute_opt;
|
||||
{
|
||||
@@ -2187,6 +2191,7 @@ llama_context_params llama_context_default_params() {
|
||||
/*.no_perf =*/ true,
|
||||
/*.op_offload =*/ true,
|
||||
/*.swa_full =*/ true,
|
||||
/*.graph_reuse =*/ false,
|
||||
};
|
||||
|
||||
return result;
|
||||
@@ -2807,6 +2812,7 @@ void llama_perf_context_print(const llama_context * ctx) {
|
||||
LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
|
||||
__func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
|
||||
LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
|
||||
LLAMA_LOG_INFO("%s: graphs reused = %10d\n", __func__, data.n_reused);
|
||||
}
|
||||
|
||||
void llama_perf_context_reset(llama_context * ctx) {
|
||||
|
||||
@@ -35,8 +35,6 @@ struct llama_context {
|
||||
|
||||
ggml_backend_sched_t get_sched() const;
|
||||
|
||||
ggml_context * get_ctx_compute() const;
|
||||
|
||||
uint32_t n_ctx() const;
|
||||
uint32_t n_ctx_per_seq() const;
|
||||
uint32_t n_batch() const;
|
||||
@@ -96,7 +94,7 @@ struct llama_context {
|
||||
// if memory_context is provided, it will be applied first to the context's memory
|
||||
// ret contains the status of the graph computation
|
||||
// returns nullptr only if ret != GGML_STATUS_SUCCESS
|
||||
llm_graph_result_ptr process_ubatch(
|
||||
llm_graph_result_i * process_ubatch(
|
||||
const llama_ubatch & ubatch,
|
||||
llm_graph_type gtype,
|
||||
llama_memory_context_i * mctx,
|
||||
@@ -188,10 +186,7 @@ private:
|
||||
//
|
||||
|
||||
public:
|
||||
int32_t graph_max_nodes() const;
|
||||
|
||||
// zero-out inputs and create the ctx_compute for the compute graph
|
||||
ggml_cgraph * graph_init();
|
||||
uint32_t graph_max_nodes() const;
|
||||
|
||||
// returns the result of ggml_backend_sched_graph_compute_async execution
|
||||
ggml_status graph_compute(ggml_cgraph * gf, bool batched);
|
||||
@@ -200,12 +195,11 @@ public:
|
||||
ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx);
|
||||
|
||||
private:
|
||||
llm_graph_result_ptr graph_build(
|
||||
ggml_context * ctx,
|
||||
ggml_cgraph * gf,
|
||||
const llama_ubatch & ubatch,
|
||||
llm_graph_type gtype,
|
||||
const llama_memory_context_i * mctx);
|
||||
llm_graph_params graph_params(
|
||||
llm_graph_result_i * res,
|
||||
const llama_ubatch & ubatch,
|
||||
const llama_memory_context_i * mctx,
|
||||
llm_graph_type gtype) const;
|
||||
|
||||
llm_graph_cb graph_get_cb() const;
|
||||
|
||||
@@ -258,8 +252,6 @@ private:
|
||||
ggml_backend_t backend_cpu = nullptr;
|
||||
std::vector<ggml_backend_ptr> backends;
|
||||
|
||||
ggml_context_ptr ctx_compute;
|
||||
|
||||
// training
|
||||
ggml_opt_context_t opt_ctx = nullptr;
|
||||
|
||||
@@ -275,8 +267,8 @@ private:
|
||||
std::vector<ggml_backend_t> backend_ptrs;
|
||||
std::vector<ggml_backend_buffer_type_t> backend_buft;
|
||||
|
||||
// memory buffers used to evaluate the model
|
||||
std::vector<uint8_t> buf_compute_meta;
|
||||
llm_graph_result_ptr gf_res_prev;
|
||||
llm_graph_result_ptr gf_res_reserve;
|
||||
|
||||
// host buffer for the model output (logits and embeddings)
|
||||
ggml_backend_buffer_ptr buf_output;
|
||||
@@ -294,4 +286,6 @@ private:
|
||||
|
||||
mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
|
||||
mutable int32_t n_eval = 0; // number of eval calls
|
||||
|
||||
mutable int32_t n_reused = 0; // number of times the previous graph was reused
|
||||
};
|
||||
|
||||
@@ -3,3 +3,32 @@
|
||||
size_t llama_max_parallel_sequences(void) {
|
||||
return LLAMA_MAX_SEQ;
|
||||
}
|
||||
|
||||
bool llama_cparams::is_same(const llama_cparams & other) const {
|
||||
return
|
||||
n_ctx == other.n_ctx &&
|
||||
n_batch == other.n_batch &&
|
||||
n_ubatch == other.n_ubatch &&
|
||||
n_seq_max == other.n_seq_max &&
|
||||
n_threads == other.n_threads &&
|
||||
n_threads_batch == other.n_threads_batch &&
|
||||
rope_freq_base == other.rope_freq_base &&
|
||||
rope_freq_scale == other.rope_freq_scale &&
|
||||
n_ctx_orig_yarn == other.n_ctx_orig_yarn &&
|
||||
yarn_ext_factor == other.yarn_ext_factor &&
|
||||
yarn_attn_factor == other.yarn_attn_factor &&
|
||||
yarn_beta_fast == other.yarn_beta_fast &&
|
||||
yarn_beta_slow == other.yarn_beta_slow &&
|
||||
defrag_thold == other.defrag_thold &&
|
||||
embeddings == other.embeddings &&
|
||||
causal_attn == other.causal_attn &&
|
||||
offload_kqv == other.offload_kqv &&
|
||||
flash_attn == other.flash_attn &&
|
||||
no_perf == other.no_perf &&
|
||||
warmup == other.warmup &&
|
||||
op_offload == other.op_offload &&
|
||||
graph_reuse == other.graph_reuse &&
|
||||
pooling_type == other.pooling_type &&
|
||||
cb_eval == other.cb_eval &&
|
||||
cb_eval_user_data == other.cb_eval_user_data;
|
||||
}
|
||||
|
||||
@@ -33,9 +33,12 @@ struct llama_cparams {
|
||||
bool no_perf;
|
||||
bool warmup;
|
||||
bool op_offload;
|
||||
bool graph_reuse;
|
||||
|
||||
enum llama_pooling_type pooling_type;
|
||||
|
||||
ggml_backend_sched_eval_callback cb_eval;
|
||||
void * cb_eval_user_data;
|
||||
|
||||
bool is_same(const llama_cparams & other) const;
|
||||
};
|
||||
|
||||
@@ -28,6 +28,15 @@ void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
|
||||
}
|
||||
}
|
||||
|
||||
bool llm_graph_input_embd::update(const llm_graph_params & params) {
|
||||
bool res = true;
|
||||
|
||||
res &= (!tokens && !params.ubatch.token) || (tokens && tokens->ne[0] == params.ubatch.n_tokens);
|
||||
res &= (!embd && !params.ubatch.embd) || (embd && embd->ne[0] == params.ubatch.n_tokens);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
|
||||
if (ubatch->pos && pos) {
|
||||
const int64_t n_tokens = ubatch->n_tokens;
|
||||
@@ -50,6 +59,14 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
|
||||
}
|
||||
}
|
||||
|
||||
bool llm_graph_input_pos::update(const llm_graph_params & params) {
|
||||
bool res = true;
|
||||
|
||||
res &= pos->ne[0] == params.ubatch.n_tokens;
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
|
||||
if (ubatch->pos && attn_scale) {
|
||||
const int64_t n_tokens = ubatch->n_tokens;
|
||||
@@ -118,6 +135,14 @@ void llm_graph_input_out_ids::set_input(const llama_ubatch * ubatch) {
|
||||
}
|
||||
}
|
||||
|
||||
bool llm_graph_input_out_ids::update(const llm_graph_params & params) {
|
||||
bool res = true;
|
||||
|
||||
res &= n_outputs == params.n_outputs;
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) {
|
||||
if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
|
||||
const int64_t n_tokens = ubatch->n_tokens;
|
||||
@@ -287,6 +312,24 @@ void llm_graph_input_attn_kv_unified::set_input(const llama_ubatch * ubatch) {
|
||||
mctx->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
|
||||
}
|
||||
|
||||
bool llm_graph_input_attn_kv_unified::update(const llm_graph_params & params) {
|
||||
const auto * mctx = static_cast<const llama_kv_cache_unified_context *>(params.mctx);
|
||||
|
||||
this->mctx = mctx;
|
||||
|
||||
bool res = true;
|
||||
|
||||
res &= self_k_idxs->ne[0] == params.ubatch.n_tokens;
|
||||
res &= self_v_idxs->ne[0] == params.ubatch.n_tokens;
|
||||
|
||||
res &= self_kq_mask->ne[0] == mctx->get_n_kv();
|
||||
res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
|
||||
|
||||
res &= mctx->get_supports_set_rows(); // TODO: tmp
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
void llm_graph_input_attn_kv_unified_iswa::set_input(const llama_ubatch * ubatch) {
|
||||
mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch);
|
||||
mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch);
|
||||
@@ -299,6 +342,30 @@ void llm_graph_input_attn_kv_unified_iswa::set_input(const llama_ubatch * ubatch
|
||||
mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
|
||||
}
|
||||
|
||||
bool llm_graph_input_attn_kv_unified_iswa::update(const llm_graph_params & params) {
|
||||
const auto * mctx = static_cast<const llama_kv_cache_unified_iswa_context *>(params.mctx);
|
||||
|
||||
this->mctx = mctx;
|
||||
|
||||
bool res = true;
|
||||
|
||||
res &= self_k_idxs->ne[0] == params.ubatch.n_tokens;
|
||||
res &= self_v_idxs->ne[0] == params.ubatch.n_tokens;
|
||||
|
||||
res &= self_k_idxs_swa->ne[0] == params.ubatch.n_tokens;
|
||||
res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens;
|
||||
|
||||
res &= self_kq_mask->ne[0] == mctx->get_base()->get_n_kv();
|
||||
res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
|
||||
|
||||
res &= self_kq_mask_swa->ne[0] == mctx->get_swa()->get_n_kv();
|
||||
res &= self_kq_mask_swa->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
|
||||
|
||||
res &= mctx->get_base()->get_supports_set_rows(); // TODO: tmp
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
|
||||
GGML_ASSERT(cross_kq_mask);
|
||||
|
||||
@@ -395,7 +462,6 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
|
||||
n_ctx_orig (cparams.n_ctx_orig_yarn),
|
||||
pooling_type (cparams.pooling_type),
|
||||
rope_type (hparams.rope_type),
|
||||
ctx0 (params.ctx),
|
||||
sched (params.sched),
|
||||
backend_cpu (params.backend_cpu),
|
||||
cvec (params.cvec),
|
||||
@@ -403,7 +469,8 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
|
||||
mctx (params.mctx),
|
||||
cross (params.cross),
|
||||
cb_func (params.cb),
|
||||
res (std::make_unique<llm_graph_result>()) {
|
||||
res (static_cast<llm_graph_result *>(params.res)),
|
||||
ctx0 (res->get_ctx()) {
|
||||
}
|
||||
|
||||
void llm_graph_context::cb(ggml_tensor * cur, const char * name, int il) const {
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
#pragma once
|
||||
|
||||
#include "llama-arch.h"
|
||||
#include "llama-batch.h"
|
||||
#include "llama-hparams.h"
|
||||
#include "llama-adapter.h"
|
||||
|
||||
@@ -14,7 +15,6 @@ struct ggml_cgraph;
|
||||
struct ggml_context;
|
||||
struct ggml_tensor;
|
||||
|
||||
struct llama_ubatch;
|
||||
struct llama_cparams;
|
||||
|
||||
struct llama_memory_context_i;
|
||||
@@ -69,6 +69,8 @@ struct llama_cross {
|
||||
std::vector<std::set<llama_seq_id>> seq_ids_enc;
|
||||
};
|
||||
|
||||
struct llm_graph_params;
|
||||
|
||||
//
|
||||
// llm_graph_input
|
||||
//
|
||||
@@ -78,11 +80,19 @@ public:
|
||||
virtual ~llm_graph_input_i() = default;
|
||||
|
||||
virtual void set_input(const llama_ubatch * ubatch) = 0;
|
||||
|
||||
// return true if the resulting input tensors using the provided graph parameters would be
|
||||
// the same as the previous input tensors that we have currently stored in the object
|
||||
virtual bool update(const llm_graph_params & params) {
|
||||
// returning false here by default will prevent from reusing the graph if the check
|
||||
// for the input type has not been implemented yet
|
||||
GGML_UNUSED(params);
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
using llm_graph_input_ptr = std::unique_ptr<llm_graph_input_i>;
|
||||
|
||||
|
||||
class llm_graph_input_embd : public llm_graph_input_i {
|
||||
public:
|
||||
llm_graph_input_embd() = default;
|
||||
@@ -90,6 +100,8 @@ public:
|
||||
|
||||
void set_input(const llama_ubatch * ubatch) override;
|
||||
|
||||
bool update(const llm_graph_params & params) override;
|
||||
|
||||
ggml_tensor * tokens = nullptr; // I32 [n_batch]
|
||||
ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch]
|
||||
};
|
||||
@@ -101,6 +113,8 @@ public:
|
||||
|
||||
void set_input(const llama_ubatch * ubatch) override;
|
||||
|
||||
bool update(const llm_graph_params & params) override;
|
||||
|
||||
ggml_tensor * pos = nullptr; // I32 [n_batch]
|
||||
|
||||
const uint32_t n_pos_per_embd = 1;
|
||||
@@ -154,17 +168,19 @@ public:
|
||||
llm_graph_input_out_ids(
|
||||
const llama_hparams & hparams,
|
||||
const llama_cparams & cparams,
|
||||
int32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {}
|
||||
uint32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {}
|
||||
virtual ~llm_graph_input_out_ids() = default;
|
||||
|
||||
void set_input(const llama_ubatch * ubatch) override;
|
||||
|
||||
bool update(const llm_graph_params & params) override;
|
||||
|
||||
ggml_tensor * out_ids; // I32 [n_outputs]
|
||||
|
||||
const llama_hparams & hparams;
|
||||
const llama_cparams & cparams;
|
||||
|
||||
const int32_t n_outputs;
|
||||
const uint32_t n_outputs;
|
||||
};
|
||||
|
||||
class llm_graph_input_mean : public llm_graph_input_i {
|
||||
@@ -249,6 +265,8 @@ public:
|
||||
|
||||
void set_input(const llama_ubatch * ubatch) override;
|
||||
|
||||
bool update(const llm_graph_params & params) override;
|
||||
|
||||
ggml_tensor * get_k_idxs() const { return self_k_idxs; }
|
||||
ggml_tensor * get_v_idxs() const { return self_v_idxs; }
|
||||
|
||||
@@ -280,6 +298,8 @@ public:
|
||||
|
||||
void set_input(const llama_ubatch * ubatch) override;
|
||||
|
||||
bool update(const llm_graph_params & params) override;
|
||||
|
||||
ggml_tensor * get_k_idxs() const { return self_k_idxs; }
|
||||
ggml_tensor * get_v_idxs() const { return self_v_idxs; }
|
||||
ggml_tensor * get_k_idxs_swa() const { return self_k_idxs_swa; }
|
||||
@@ -373,29 +393,110 @@ public:
|
||||
// along with the input tensors, the object also provides commonly used outputs tensors, such as logits, embeddings, etc.
|
||||
// these are used by the llama_context to extact the relevant data, based on the compute parameters
|
||||
|
||||
// TODO: this interface seems redundant - remove it
|
||||
class llm_graph_result_i {
|
||||
public:
|
||||
virtual ~llm_graph_result_i() = default;
|
||||
|
||||
virtual ggml_tensor * get_tokens() = 0;
|
||||
virtual ggml_tensor * get_logits() = 0;
|
||||
virtual ggml_tensor * get_embd() = 0;
|
||||
virtual ggml_tensor * get_embd_pooled() = 0;
|
||||
virtual ggml_tensor * get_tokens() const = 0;
|
||||
virtual ggml_tensor * get_logits() const = 0;
|
||||
virtual ggml_tensor * get_embd() const = 0;
|
||||
virtual ggml_tensor * get_embd_pooled() const = 0;
|
||||
|
||||
virtual ggml_cgraph * get_gf() = 0;
|
||||
virtual ggml_context * get_ctx() = 0;
|
||||
|
||||
virtual void reset() = 0;
|
||||
|
||||
virtual void set_inputs(const llama_ubatch * ubatch) = 0;
|
||||
|
||||
virtual bool update(const llm_graph_params & params) = 0;
|
||||
};
|
||||
|
||||
using llm_graph_result_ptr = std::unique_ptr<llm_graph_result_i>;
|
||||
|
||||
// callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
|
||||
using llm_graph_cb = std::function<void(const llama_ubatch & ubatch, ggml_tensor * cur, const char * name, int il)>;
|
||||
|
||||
struct llm_graph_params {
|
||||
llm_arch arch = LLM_ARCH_UNKNOWN;
|
||||
|
||||
llama_hparams hparams;
|
||||
llama_cparams cparams;
|
||||
|
||||
llama_ubatch ubatch; // note: intentionally make a copy
|
||||
|
||||
llm_graph_type gtype;
|
||||
|
||||
ggml_backend_sched_t sched;
|
||||
ggml_backend_t backend_cpu;
|
||||
|
||||
const llama_adapter_cvec * cvec;
|
||||
const llama_adapter_loras * loras;
|
||||
const llama_memory_context_i * mctx;
|
||||
const llama_cross * cross;
|
||||
|
||||
uint32_t n_outputs;
|
||||
|
||||
llm_graph_cb cb;
|
||||
|
||||
// TODO: temporary
|
||||
llm_graph_result_i * res;
|
||||
|
||||
bool is_same(const llm_graph_params & other) const {
|
||||
return
|
||||
hparams.is_same(other.hparams) &&
|
||||
cparams.is_same(other.cparams) &&
|
||||
ubatch .is_same(other.ubatch) &&
|
||||
arch == other.arch &&
|
||||
gtype == other.gtype &&
|
||||
cvec == other.cvec &&
|
||||
loras == other.loras &&
|
||||
cross == other.cross &&
|
||||
n_outputs == other.n_outputs;
|
||||
}
|
||||
};
|
||||
|
||||
class llm_graph_result : public llm_graph_result_i {
|
||||
public:
|
||||
llm_graph_result(int64_t max_nodes) : max_nodes(max_nodes) {
|
||||
reset();
|
||||
}
|
||||
|
||||
virtual ~llm_graph_result() = default;
|
||||
|
||||
ggml_tensor * get_tokens() override { return t_tokens; }
|
||||
ggml_tensor * get_logits() override { return t_logits; }
|
||||
ggml_tensor * get_embd() override { return t_embd; }
|
||||
ggml_tensor * get_embd_pooled() override { return t_embd_pooled; }
|
||||
ggml_tensor * get_tokens() const override { return t_tokens; }
|
||||
ggml_tensor * get_logits() const override { return t_logits; }
|
||||
ggml_tensor * get_embd() const override { return t_embd; }
|
||||
ggml_tensor * get_embd_pooled() const override { return t_embd_pooled; }
|
||||
|
||||
ggml_cgraph * get_gf() override { return gf; }
|
||||
ggml_context * get_ctx() override { return ctx_compute.get(); }
|
||||
|
||||
void set_max_nodes(int64_t max_nodes) {
|
||||
this->max_nodes = max_nodes;
|
||||
}
|
||||
|
||||
void reset() override {
|
||||
t_tokens = nullptr;
|
||||
t_logits = nullptr;
|
||||
t_embd = nullptr;
|
||||
t_embd_pooled = nullptr;
|
||||
|
||||
inputs.clear();
|
||||
|
||||
buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
|
||||
|
||||
ggml_init_params params = {
|
||||
/*.mem_size =*/ buf_compute_meta.size(),
|
||||
/*.mem_buffer =*/ buf_compute_meta.data(),
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
|
||||
ctx_compute.reset(ggml_init(params));
|
||||
|
||||
gf = ggml_new_graph_custom(ctx_compute.get(), max_nodes, false);
|
||||
}
|
||||
|
||||
void set_inputs(const llama_ubatch * ubatch) override {
|
||||
for (auto & input : inputs) {
|
||||
@@ -403,6 +504,25 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
// try to update the existing graph result using the new graph parameters
|
||||
// this can only be done if we determine that the resulting graph using the new graph parameters
|
||||
// would be identical to the existing graph. in that case, we simply have to update the memory
|
||||
// contexts of the input tensors of the graph and we can reuse it for another computation
|
||||
// return true if the graph was updated and can be reused
|
||||
bool update(const llm_graph_params & params) override {
|
||||
if (!this->params.is_same(params)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
bool res = true;
|
||||
|
||||
for (auto & input : inputs) {
|
||||
res &= input->update(params);
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
llm_graph_input_i * add_input(llm_graph_input_ptr input) {
|
||||
inputs.emplace_back(std::move(input));
|
||||
return inputs.back().get();
|
||||
@@ -415,37 +535,26 @@ public:
|
||||
ggml_tensor * t_embd_pooled = nullptr;
|
||||
|
||||
std::vector<llm_graph_input_ptr> inputs;
|
||||
|
||||
ggml_context_ptr ctx_compute;
|
||||
|
||||
// memory buffers used to evaluate the model
|
||||
std::vector<uint8_t> buf_compute_meta;
|
||||
|
||||
ggml_cgraph * gf;
|
||||
|
||||
int64_t max_nodes;
|
||||
|
||||
// keep a copy of the previous graph parameters
|
||||
// we will use this to determine whether the graph can be reused by comparing them with the new parameters
|
||||
// note: these are updated after constructing the new graph
|
||||
llm_graph_params params;
|
||||
};
|
||||
|
||||
//
|
||||
// llm_graph_context
|
||||
//
|
||||
|
||||
// callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
|
||||
using llm_graph_cb = std::function<void(const llama_ubatch & ubatch, ggml_tensor * cur, const char * name, int il)>;
|
||||
|
||||
struct llm_graph_params {
|
||||
ggml_context * ctx;
|
||||
|
||||
const llm_arch arch;
|
||||
|
||||
const llama_hparams & hparams;
|
||||
const llama_cparams & cparams;
|
||||
const llama_ubatch & ubatch;
|
||||
|
||||
ggml_backend_sched_t sched;
|
||||
ggml_backend_t backend_cpu;
|
||||
|
||||
const llama_adapter_cvec * cvec;
|
||||
const llama_adapter_loras * loras;
|
||||
const llama_memory_context_i * mctx;
|
||||
const llama_cross * cross;
|
||||
|
||||
uint32_t n_outputs;
|
||||
|
||||
const llm_graph_cb & cb;
|
||||
};
|
||||
|
||||
// used in build_rs to properly order writes and avoid unnecessary copies
|
||||
using llm_graph_get_rows_fn = std::function<ggml_tensor * (ggml_context *, ggml_tensor * states, ggml_tensor * ids)>;
|
||||
|
||||
@@ -485,8 +594,6 @@ struct llm_graph_context {
|
||||
const enum llama_pooling_type pooling_type;
|
||||
const enum llama_rope_type rope_type;
|
||||
|
||||
ggml_context * ctx0 = nullptr;
|
||||
|
||||
ggml_backend_sched_t sched;
|
||||
|
||||
ggml_backend_t backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?
|
||||
@@ -498,7 +605,9 @@ struct llm_graph_context {
|
||||
|
||||
const llm_graph_cb & cb_func;
|
||||
|
||||
std::unique_ptr<llm_graph_result> res;
|
||||
llm_graph_result * res;
|
||||
|
||||
ggml_context * ctx0 = nullptr;
|
||||
|
||||
llm_graph_context(const llm_graph_params & params);
|
||||
virtual ~llm_graph_context() = default;
|
||||
|
||||
@@ -102,3 +102,12 @@ bool llama_hparams::is_swa(uint32_t il) const {
|
||||
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
bool llama_hparams::is_same(const llama_hparams & other) const {
|
||||
return
|
||||
n_ctx_train == other.n_ctx_train &&
|
||||
n_embd == other.n_embd &&
|
||||
n_layer == other.n_layer &&
|
||||
n_expert == other.n_expert &&
|
||||
n_expert_used == other.n_expert_used;
|
||||
}
|
||||
|
||||
@@ -202,6 +202,8 @@ struct llama_hparams {
|
||||
uint32_t n_pos_per_embd() const;
|
||||
|
||||
bool is_swa(uint32_t il) const;
|
||||
|
||||
bool is_same(const llama_hparams & other) const;
|
||||
};
|
||||
|
||||
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
|
||||
|
||||
@@ -68,6 +68,8 @@ llama_kv_cache_unified::llama_kv_cache_unified(
|
||||
|
||||
cells.resize(kv_size);
|
||||
|
||||
gf_res.reset(new llm_graph_result(32768)); // note: the max nodes will be updated later
|
||||
|
||||
for (uint32_t il = 0; il < n_layer_cache; il++) {
|
||||
if (filter && !filter(il)) {
|
||||
LLAMA_LOG_DEBUG("%s: layer %3d: skipped\n", __func__, il);
|
||||
@@ -158,7 +160,7 @@ llama_kv_cache_unified::llama_kv_cache_unified(
|
||||
debug = LLAMA_KV_CACHE_DEBUG ? atoi(LLAMA_KV_CACHE_DEBUG) : 0;
|
||||
|
||||
const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
|
||||
supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) : 0;
|
||||
supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) != 0 : 0;
|
||||
|
||||
if (!supports_set_rows) {
|
||||
LLAMA_LOG_WARN("%s: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility\n", __func__);
|
||||
@@ -480,14 +482,12 @@ bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const d
|
||||
if (hparams.rope_type != LLAMA_ROPE_TYPE_NONE) {
|
||||
ggml_backend_sched_reset(sched);
|
||||
|
||||
auto * gf = lctx->graph_init();
|
||||
auto * res = gf_res.get();
|
||||
|
||||
auto res = build_graph_shift(lctx->get_cparams(), lctx->get_ctx_compute(), gf);
|
||||
if (!res) {
|
||||
LLAMA_LOG_ERROR("%s: failed to build graph for K-shift\n", __func__);
|
||||
return updated;
|
||||
}
|
||||
res->set_max_nodes(lctx->graph_max_nodes());
|
||||
res->reset();
|
||||
|
||||
auto * gf = build_graph_shift(res, lctx);
|
||||
if (!ggml_backend_sched_alloc_graph(sched, gf)) {
|
||||
LLAMA_LOG_ERROR("%s: failed to allocate compute graph for K-shift\n", __func__);
|
||||
return updated;
|
||||
@@ -529,14 +529,12 @@ bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const d
|
||||
|
||||
ggml_backend_sched_reset(sched);
|
||||
|
||||
auto * gf = lctx->graph_init();
|
||||
auto * res = gf_res.get();
|
||||
|
||||
auto res = build_graph_defrag(lctx->get_cparams(), lctx->get_ctx_compute(), gf, dinfo);
|
||||
if (!res) {
|
||||
LLAMA_LOG_ERROR("%s: failed to build graph for defrag\n", __func__);
|
||||
return updated;
|
||||
}
|
||||
res->set_max_nodes(lctx->graph_max_nodes());
|
||||
res->reset();
|
||||
|
||||
auto * gf = build_graph_defrag(res, lctx, dinfo);
|
||||
if (!ggml_backend_sched_alloc_graph(sched, gf)) {
|
||||
LLAMA_LOG_ERROR("%s: failed to allocate compute graph for defrag\n", __func__);
|
||||
return updated;
|
||||
@@ -780,6 +778,10 @@ uint32_t llama_kv_cache_unified::get_n_kv() const {
|
||||
return std::min(cells.size(), std::max(n_pad, GGML_PAD(cells.used_max_p1(), n_pad)));
|
||||
}
|
||||
|
||||
bool llama_kv_cache_unified::get_supports_set_rows() const {
|
||||
return supports_set_rows;
|
||||
}
|
||||
|
||||
ggml_tensor * llama_kv_cache_unified::get_k(ggml_context * ctx, int32_t il, uint32_t n_kv) const {
|
||||
const int32_t ikv = map_layer_ids.at(il);
|
||||
|
||||
@@ -1142,11 +1144,9 @@ void llm_graph_input_k_shift::set_input(const llama_ubatch * ubatch) {
|
||||
}
|
||||
}
|
||||
|
||||
llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
|
||||
const llama_cparams & cparams,
|
||||
ggml_context * ctx,
|
||||
ggml_cgraph * gf) const {
|
||||
auto res = std::make_unique<llm_graph_result>();
|
||||
ggml_cgraph * llama_kv_cache_unified::build_graph_shift(llm_graph_result * res, llama_context * lctx) const {
|
||||
auto * ctx = res->get_ctx();
|
||||
auto * gf = res->get_gf();
|
||||
|
||||
const auto & n_embd_head_k = hparams.n_embd_head_k;
|
||||
//const auto & n_embd_head_v = hparams.n_embd_head_v;
|
||||
@@ -1156,6 +1156,8 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
|
||||
inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, cells.size());
|
||||
ggml_set_input(inp->k_shift);
|
||||
|
||||
const auto & cparams = lctx->get_cparams();
|
||||
|
||||
for (const auto & layer : layers) {
|
||||
const uint32_t il = layer.il;
|
||||
|
||||
@@ -1181,18 +1183,20 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
|
||||
|
||||
res->add_input(std::move(inp));
|
||||
|
||||
return res;
|
||||
return gf;
|
||||
}
|
||||
|
||||
llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
|
||||
const llama_cparams & cparams,
|
||||
ggml_context * ctx,
|
||||
ggml_cgraph * gf,
|
||||
const defrag_info & dinfo) const {
|
||||
auto res = std::make_unique<llm_graph_result>();
|
||||
ggml_cgraph * llama_kv_cache_unified::build_graph_defrag(
|
||||
llm_graph_result * res,
|
||||
llama_context * lctx,
|
||||
const defrag_info & dinfo) const {
|
||||
auto * ctx = res->get_ctx();
|
||||
auto * gf = res->get_gf();
|
||||
|
||||
const auto & ids = dinfo.ids;
|
||||
|
||||
const auto & cparams = lctx->get_cparams();
|
||||
|
||||
#if 0
|
||||
// CPU defrag
|
||||
//
|
||||
@@ -1329,7 +1333,7 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
|
||||
//LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
|
||||
#endif
|
||||
|
||||
return res;
|
||||
return gf;
|
||||
}
|
||||
|
||||
llama_kv_cache_unified::defrag_info llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) const {
|
||||
@@ -1940,6 +1944,10 @@ uint32_t llama_kv_cache_unified_context::get_n_kv() const {
|
||||
return n_kv;
|
||||
}
|
||||
|
||||
bool llama_kv_cache_unified_context::get_supports_set_rows() const {
|
||||
return kv->get_supports_set_rows();
|
||||
}
|
||||
|
||||
ggml_tensor * llama_kv_cache_unified_context::get_k(ggml_context * ctx, int32_t il) const {
|
||||
return kv->get_k(ctx, il, n_kv);
|
||||
}
|
||||
|
||||
@@ -121,6 +121,9 @@ public:
|
||||
|
||||
uint32_t get_n_kv() const;
|
||||
|
||||
// TODO: temporary
|
||||
bool get_supports_set_rows() const;
|
||||
|
||||
// get views of the current state of the cache
|
||||
ggml_tensor * get_k(ggml_context * ctx, int32_t il, uint32_t n_kv) const;
|
||||
ggml_tensor * get_v(ggml_context * ctx, int32_t il, uint32_t n_kv) const;
|
||||
@@ -193,13 +196,15 @@ private:
|
||||
|
||||
// env: LLAMA_SET_ROWS (temporary)
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/14285
|
||||
int supports_set_rows = false;
|
||||
bool supports_set_rows = false;
|
||||
|
||||
const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
|
||||
|
||||
std::vector<ggml_context_ptr> ctxs;
|
||||
std::vector<ggml_backend_buffer_ptr> bufs;
|
||||
|
||||
std::unique_ptr<llm_graph_result> gf_res;
|
||||
|
||||
llama_kv_cells_unified cells;
|
||||
|
||||
std::vector<kv_layer> layers;
|
||||
@@ -226,15 +231,13 @@ private:
|
||||
float freq_base,
|
||||
float freq_scale) const;
|
||||
|
||||
llm_graph_result_ptr build_graph_shift(
|
||||
const llama_cparams & cparams,
|
||||
ggml_context * ctx,
|
||||
ggml_cgraph * gf) const;
|
||||
ggml_cgraph * build_graph_shift(
|
||||
llm_graph_result * res,
|
||||
llama_context * lctx) const;
|
||||
|
||||
llm_graph_result_ptr build_graph_defrag(
|
||||
const llama_cparams & cparams,
|
||||
ggml_context * ctx,
|
||||
ggml_cgraph * gf,
|
||||
ggml_cgraph * build_graph_defrag(
|
||||
llm_graph_result * res,
|
||||
llama_context * lctx,
|
||||
const defrag_info & dinfo) const;
|
||||
|
||||
void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
|
||||
@@ -288,6 +291,9 @@ public:
|
||||
|
||||
uint32_t get_n_kv() const;
|
||||
|
||||
// TODO: temporary
|
||||
bool get_supports_set_rows() const;
|
||||
|
||||
// get views of the current state of the cache
|
||||
ggml_tensor * get_k(ggml_context * ctx, int32_t il) const;
|
||||
ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;
|
||||
|
||||
@@ -14751,10 +14751,10 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
||||
return res;
|
||||
}
|
||||
|
||||
llm_graph_result_ptr llama_model::build_graph(
|
||||
const llm_graph_params & params,
|
||||
ggml_cgraph * gf,
|
||||
llm_graph_type type) const {
|
||||
ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
||||
// TODO: temporary - will refactor this to keep the "gf" instance in the llm_graph_context and avoid passing it everywhere
|
||||
auto * gf = params.res->get_gf();
|
||||
|
||||
std::unique_ptr<llm_graph_context> llm;
|
||||
|
||||
switch (arch) {
|
||||
@@ -14961,7 +14961,7 @@ llm_graph_result_ptr llama_model::build_graph(
|
||||
} break;
|
||||
case LLM_ARCH_T5:
|
||||
{
|
||||
switch (type) {
|
||||
switch (params.gtype) {
|
||||
case LLM_GRAPH_TYPE_ENCODER:
|
||||
llm = std::make_unique<llm_build_t5_enc>(*this, params, gf);
|
||||
break;
|
||||
@@ -15047,7 +15047,10 @@ llm_graph_result_ptr llama_model::build_graph(
|
||||
// add on pooling layer
|
||||
llm->build_pooling(gf, cls, cls_b, cls_out, cls_out_b);
|
||||
|
||||
return std::move(llm->res);
|
||||
// TODO: updating the graph parameters here is a little bit obscure - figure out something better
|
||||
llm->res->params = params;
|
||||
|
||||
return llm->res->get_gf();
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
@@ -436,10 +436,7 @@ struct llama_model {
|
||||
llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams) const;
|
||||
|
||||
// TODO: move this to new llm_arch_model_i interface
|
||||
llm_graph_result_ptr build_graph(
|
||||
const llm_graph_params & params,
|
||||
ggml_cgraph * gf,
|
||||
llm_graph_type type) const;
|
||||
ggml_cgraph * build_graph(const llm_graph_params & params) const;
|
||||
|
||||
private:
|
||||
struct impl;
|
||||
|
||||
@@ -261,6 +261,7 @@ struct cmd_params {
|
||||
std::vector<bool> use_mmap;
|
||||
std::vector<bool> embeddings;
|
||||
std::vector<bool> no_op_offload;
|
||||
std::vector<bool> graph_reuse;
|
||||
ggml_numa_strategy numa;
|
||||
int reps;
|
||||
ggml_sched_priority prio;
|
||||
@@ -298,6 +299,7 @@ static const cmd_params cmd_params_defaults = {
|
||||
/* use_mmap */ { true },
|
||||
/* embeddings */ { false },
|
||||
/* no_op_offload */ { false },
|
||||
/* graph_reuse */ { false },
|
||||
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
|
||||
/* reps */ 5,
|
||||
/* prio */ GGML_SCHED_PRIO_NORMAL,
|
||||
@@ -377,6 +379,7 @@ static void print_usage(int /* argc */, char ** argv) {
|
||||
printf(" -ot --override-tensors <tensor name pattern>=<buffer type>;...\n");
|
||||
printf(" (default: disabled)\n");
|
||||
printf(" -nopo, --no-op-offload <0|1> (default: 0)\n");
|
||||
printf(" -gr, --graph-reuse <0|1> (default: 0)\n");
|
||||
printf("\n");
|
||||
printf(
|
||||
"Multiple values can be given for each parameter by separating them with ','\n"
|
||||
@@ -620,6 +623,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
}
|
||||
auto p = string_split<bool>(argv[i], split_delim);
|
||||
params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
|
||||
} else if (arg == "-gr" || arg == "--graph-reuse") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto p = string_split<bool>(argv[i], split_delim);
|
||||
params.graph_reuse.insert(params.graph_reuse.end(), p.begin(), p.end());
|
||||
} else if (arg == "--numa") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -885,6 +895,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
if (params.no_op_offload.empty()) {
|
||||
params.no_op_offload = cmd_params_defaults.no_op_offload;
|
||||
}
|
||||
if (params.graph_reuse.empty()) {
|
||||
params.graph_reuse = cmd_params_defaults.graph_reuse;
|
||||
}
|
||||
if (params.n_threads.empty()) {
|
||||
params.n_threads = cmd_params_defaults.n_threads;
|
||||
}
|
||||
@@ -926,6 +939,7 @@ struct cmd_params_instance {
|
||||
bool use_mmap;
|
||||
bool embeddings;
|
||||
bool no_op_offload;
|
||||
bool graph_reuse;
|
||||
|
||||
llama_model_params to_llama_mparams() const {
|
||||
llama_model_params mparams = llama_model_default_params();
|
||||
@@ -998,6 +1012,7 @@ struct cmd_params_instance {
|
||||
cparams.embeddings = embeddings;
|
||||
cparams.op_offload = !no_op_offload;
|
||||
cparams.swa_full = false;
|
||||
cparams.graph_reuse = graph_reuse;
|
||||
|
||||
return cparams;
|
||||
}
|
||||
@@ -1018,6 +1033,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
for (const auto & mmp : params.use_mmap)
|
||||
for (const auto & embd : params.embeddings)
|
||||
for (const auto & nopo : params.no_op_offload)
|
||||
for (const auto & gr : params.graph_reuse)
|
||||
for (const auto & nb : params.n_batch)
|
||||
for (const auto & nub : params.n_ubatch)
|
||||
for (const auto & tk : params.type_k)
|
||||
@@ -1059,6 +1075,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
/* .use_mmap = */ mmp,
|
||||
/* .embeddings = */ embd,
|
||||
/* .no_op_offload= */ nopo,
|
||||
/* .graph_reuse = */ gr,
|
||||
};
|
||||
instances.push_back(instance);
|
||||
}
|
||||
@@ -1092,6 +1109,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
/* .use_mmap = */ mmp,
|
||||
/* .embeddings = */ embd,
|
||||
/* .no_op_offload= */ nopo,
|
||||
/* .graph_reuse = */ gr,
|
||||
};
|
||||
instances.push_back(instance);
|
||||
}
|
||||
@@ -1125,6 +1143,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
/* .use_mmap = */ mmp,
|
||||
/* .embeddings = */ embd,
|
||||
/* .no_op_offload= */ nopo,
|
||||
/* .graph_reuse = */ gr,
|
||||
};
|
||||
instances.push_back(instance);
|
||||
}
|
||||
@@ -1162,6 +1181,7 @@ struct test {
|
||||
bool use_mmap;
|
||||
bool embeddings;
|
||||
bool no_op_offload;
|
||||
bool graph_reuse;
|
||||
int n_prompt;
|
||||
int n_gen;
|
||||
int n_depth;
|
||||
@@ -1197,6 +1217,7 @@ struct test {
|
||||
use_mmap = inst.use_mmap;
|
||||
embeddings = inst.embeddings;
|
||||
no_op_offload = inst.no_op_offload;
|
||||
graph_reuse = inst.graph_reuse;
|
||||
n_prompt = inst.n_prompt;
|
||||
n_gen = inst.n_gen;
|
||||
n_depth = inst.n_depth;
|
||||
@@ -1243,8 +1264,8 @@ struct test {
|
||||
"cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers",
|
||||
"split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides",
|
||||
"defrag_thold",
|
||||
"use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", "n_depth", "test_time",
|
||||
"avg_ns", "stddev_ns", "avg_ts", "stddev_ts",
|
||||
"use_mmap", "embeddings", "no_op_offload", "graph_reuse", "n_prompt", "n_gen", "n_depth",
|
||||
"test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts",
|
||||
};
|
||||
return fields;
|
||||
}
|
||||
@@ -1259,7 +1280,7 @@ struct test {
|
||||
return INT;
|
||||
}
|
||||
if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" ||
|
||||
field == "use_mmap" || field == "embeddings") {
|
||||
field == "use_mmap" || field == "embeddings" || field == "graph_reuse") {
|
||||
return BOOL;
|
||||
}
|
||||
if (field == "avg_ts" || field == "stddev_ts" || field == "defrag_thold") {
|
||||
@@ -1333,6 +1354,7 @@ struct test {
|
||||
std::to_string(use_mmap),
|
||||
std::to_string(embeddings),
|
||||
std::to_string(no_op_offload),
|
||||
std::to_string(graph_reuse),
|
||||
std::to_string(n_prompt),
|
||||
std::to_string(n_gen),
|
||||
std::to_string(n_depth),
|
||||
@@ -1518,6 +1540,9 @@ struct markdown_printer : public printer {
|
||||
if (field == "no_op_offload") {
|
||||
return 4;
|
||||
}
|
||||
if (field == "graph_reuse") {
|
||||
return 4;
|
||||
}
|
||||
|
||||
int width = std::max((int) field.length(), 10);
|
||||
|
||||
@@ -1552,6 +1577,9 @@ struct markdown_printer : public printer {
|
||||
if (field == "no_op_offload") {
|
||||
return "nopo";
|
||||
}
|
||||
if (field == "graph_reuse") {
|
||||
return "gr";
|
||||
}
|
||||
if (field == "tensor_split") {
|
||||
return "ts";
|
||||
}
|
||||
@@ -1626,6 +1654,9 @@ struct markdown_printer : public printer {
|
||||
if (params.no_op_offload.size() > 1 || params.no_op_offload != cmd_params_defaults.no_op_offload) {
|
||||
fields.emplace_back("no_op_offload");
|
||||
}
|
||||
if (params.graph_reuse.size() > 1 || params.graph_reuse != cmd_params_defaults.graph_reuse) {
|
||||
fields.emplace_back("graph_reuse");
|
||||
}
|
||||
fields.emplace_back("test");
|
||||
fields.emplace_back("t/s");
|
||||
|
||||
|
||||
Reference in New Issue
Block a user