mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-04-16 16:27:32 +03:00
Compare commits
10 Commits
b8749
...
0cc4m/vulk
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0776a6a039 | ||
|
|
937a425600 | ||
|
|
5c177a1036 | ||
|
|
ccd8d4a6ce | ||
|
|
4374b5ab9a | ||
|
|
eebf21c3e9 | ||
|
|
08a4ba6f03 | ||
|
|
2204bcedc8 | ||
|
|
c0d100e0fc | ||
|
|
58deae173e |
@@ -1450,11 +1450,19 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
||||
std::vector<int32_t> ids;
|
||||
std::vector<ggml_bitset_t> used_ids;
|
||||
|
||||
static bool vk_sched_sync_log = getenv("GGML_VK_SYNC_LOG") != nullptr;
|
||||
|
||||
for (int split_id = 0; split_id < sched->n_splits; split_id++) {
|
||||
struct ggml_backend_sched_split * split = &splits[split_id];
|
||||
int split_backend_id = split->backend_id;
|
||||
ggml_backend_t split_backend = sched->backends[split_backend_id];
|
||||
|
||||
if (vk_sched_sync_log) {
|
||||
fprintf(stderr, "[VK_SYNC sched] split %d/%d backend_id=%d name=%s n_inputs=%d n_nodes=%d\n",
|
||||
split_id, sched->n_splits, split_backend_id,
|
||||
ggml_backend_name(split_backend), split->n_inputs, split->graph.n_nodes);
|
||||
}
|
||||
|
||||
// copy the input tensors to the split backend
|
||||
for (int input_id = 0; input_id < split->n_inputs; input_id++) {
|
||||
ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[input_id]);
|
||||
@@ -1464,16 +1472,28 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
||||
if (input->flags & GGML_TENSOR_FLAG_INPUT) {
|
||||
// inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
|
||||
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
||||
if (vk_sched_sync_log) {
|
||||
fprintf(stderr, "[VK_SYNC sched] input %s: event_synchronize (INPUT flag)\n", input->name);
|
||||
}
|
||||
ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
|
||||
} else {
|
||||
if (vk_sched_sync_log) {
|
||||
fprintf(stderr, "[VK_SYNC sched] input %s: backend_synchronize (INPUT flag, no event)\n", input->name);
|
||||
}
|
||||
ggml_backend_synchronize(split_backend);
|
||||
}
|
||||
ggml_backend_tensor_copy(input, input_cpy);
|
||||
} else {
|
||||
// wait for the split backend to finish using the input before overwriting it
|
||||
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
||||
if (vk_sched_sync_log) {
|
||||
fprintf(stderr, "[VK_SYNC sched] input %s: event_wait\n", input->name);
|
||||
}
|
||||
ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
|
||||
} else {
|
||||
if (vk_sched_sync_log) {
|
||||
fprintf(stderr, "[VK_SYNC sched] input %s: backend_synchronize (no event)\n", input->name);
|
||||
}
|
||||
ggml_backend_synchronize(split_backend);
|
||||
}
|
||||
|
||||
@@ -1565,7 +1585,14 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
||||
} else {
|
||||
// try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
|
||||
// TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
|
||||
if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
|
||||
bool async_ok = split_backend->iface.cpy_tensor_async && split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy);
|
||||
if (vk_sched_sync_log) {
|
||||
fprintf(stderr, "[VK_SYNC sched] input %s: cpy_tensor_async=%s\n", input->name, async_ok ? "true" : "false");
|
||||
}
|
||||
if (!async_ok) {
|
||||
if (vk_sched_sync_log) {
|
||||
fprintf(stderr, "[VK_SYNC sched] input %s: fallback sync copy\n", input->name);
|
||||
}
|
||||
ggml_backend_synchronize(input_backend);
|
||||
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
||||
ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
|
||||
@@ -1579,6 +1606,10 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
||||
}
|
||||
|
||||
if (!sched->callback_eval) {
|
||||
if (vk_sched_sync_log) {
|
||||
fprintf(stderr, "[VK_SYNC sched] graph_compute_async on %s (%d nodes)\n",
|
||||
ggml_backend_name(split_backend), split->graph.n_nodes);
|
||||
}
|
||||
enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
|
||||
if (ec != GGML_STATUS_SUCCESS) {
|
||||
return ec;
|
||||
@@ -1620,6 +1651,9 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
||||
// record the event of this copy
|
||||
if (split->n_inputs > 0) {
|
||||
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
||||
if (vk_sched_sync_log) {
|
||||
fprintf(stderr, "[VK_SYNC sched] event_record on %s\n", ggml_backend_name(split_backend));
|
||||
}
|
||||
ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy], split_backend);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -115,6 +115,17 @@ static bool is_pow2(uint32_t x) { return x > 1 && (x & (x-1)) == 0; }
|
||||
#define VK_LOG_DEBUG(msg) ((void) 0)
|
||||
#endif // GGML_VULKAN_DEBUG
|
||||
|
||||
// Synchronization tracing for multi-GPU deadlock debugging.
|
||||
// Enable with GGML_VK_SYNC_LOG=1 environment variable.
|
||||
static bool vk_sync_log_enabled = false;
|
||||
#define VK_SYNC_LOG(dev_name, msg) do { \
|
||||
if (vk_sync_log_enabled) { \
|
||||
fprintf(stderr, "[VK_SYNC %s tid=%lu] %s\n", \
|
||||
(dev_name).c_str(), (unsigned long)std::hash<std::thread::id>{}(std::this_thread::get_id()) % 10000, \
|
||||
((std::ostringstream&)(std::ostringstream() << msg)).str().c_str()); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
struct ggml_backend_vk_context;
|
||||
|
||||
#define MAX_PARAMETER_COUNT 12
|
||||
@@ -191,6 +202,7 @@ struct vk_queue;
|
||||
|
||||
struct vk_command_buffer {
|
||||
vk::CommandBuffer buf;
|
||||
uint64_t use_counter = 0;
|
||||
bool in_use = false;
|
||||
};
|
||||
|
||||
@@ -254,7 +266,7 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
|
||||
class vk_memory_logger;
|
||||
class vk_perf_logger;
|
||||
static void ggml_vk_destroy_buffer(vk_buffer& buf);
|
||||
static void ggml_vk_synchronize(ggml_backend_vk_context * ctx);
|
||||
static void ggml_vk_synchronize(ggml_backend_vk_context * ctx, const char * caller = "unknown");
|
||||
|
||||
static constexpr uint32_t mul_mat_vec_max_cols = 8;
|
||||
static constexpr uint32_t p021_max_gqa_ratio = 8;
|
||||
@@ -938,21 +950,26 @@ struct vk_subbuffer {
|
||||
}
|
||||
};
|
||||
|
||||
// vk_event is used for the event-related backend interfaces. It uses 'event' for
|
||||
// event_wait and 'fence' for event_synchronize. Polling on an event for
|
||||
// event_synchronize wouldn't be sufficient to wait for command buffers to complete,
|
||||
// and would lead to validation errors.
|
||||
struct vk_event {
|
||||
vk::Event event;
|
||||
vk::Fence fence;
|
||||
vk_command_buffer* cmd_buffer = nullptr;
|
||||
};
|
||||
|
||||
struct vk_semaphore {
|
||||
vk::Semaphore s;
|
||||
uint64_t value;
|
||||
};
|
||||
|
||||
// vk_event is used for the event-related backend interfaces. It uses vk::Events for
|
||||
// event_wait and a timeline semaphore for event_synchronize. Polling on an event for
|
||||
// event_synchronize wouldn't be sufficient to wait for command buffers to complete,
|
||||
// and would lead to validation errors.
|
||||
struct vk_event {
|
||||
std::vector<vk::Event> events_free; // Events available for reuse
|
||||
std::vector<vk::Event> events_submitted; // Events that are fully submitted and can be reused on next synchronize
|
||||
vk::Event event;
|
||||
bool has_event;
|
||||
|
||||
vk_semaphore tl_semaphore;
|
||||
vk_command_buffer* cmd_buffer = nullptr;
|
||||
uint64_t cmd_buffer_use_counter = 0;
|
||||
};
|
||||
|
||||
struct vk_submission {
|
||||
vk_command_buffer* buffer = nullptr;
|
||||
std::vector<vk_semaphore> wait_semaphores;
|
||||
@@ -2319,7 +2336,7 @@ static vk_command_buffer* ggml_vk_create_cmd_buffer(vk_device& device, vk_comman
|
||||
vk::CommandBufferLevel::ePrimary,
|
||||
1);
|
||||
const std::vector<vk::CommandBuffer> cmd_buffers = device->device.allocateCommandBuffers(command_buffer_alloc_info);
|
||||
p.cmd_buffers.push_back({ cmd_buffers.front(), true });
|
||||
p.cmd_buffers.push_back({ cmd_buffers.front(), 0, true });
|
||||
return &p.cmd_buffers[p.cmd_buffers.size()-1];
|
||||
}
|
||||
|
||||
@@ -2788,6 +2805,15 @@ static void ggml_vk_sync_buffers(ggml_backend_vk_context* ctx, vk_context& subct
|
||||
);
|
||||
}
|
||||
|
||||
static void ggml_vk_reset_event(vk_context& ctx, vk::Event& event) {
|
||||
VK_LOG_DEBUG("ggml_vk_set_event()");
|
||||
|
||||
ctx->s->buffer->buf.resetEvent(
|
||||
event,
|
||||
ctx->p->q->stage_flags
|
||||
);
|
||||
}
|
||||
|
||||
static void ggml_vk_set_event(vk_context& ctx, vk::Event& event) {
|
||||
VK_LOG_DEBUG("ggml_vk_set_event()");
|
||||
|
||||
@@ -5731,6 +5757,7 @@ static void ggml_vk_instance_init() {
|
||||
vk_perf_logger_concurrent = getenv("GGML_VK_PERF_LOGGER_CONCURRENT") != nullptr;
|
||||
vk_enable_sync_logger = getenv("GGML_VK_SYNC_LOGGER") != nullptr;
|
||||
vk_memory_logger_enabled = getenv("GGML_VK_MEMORY_LOGGER") != nullptr;
|
||||
vk_sync_log_enabled = getenv("GGML_VK_SYNC_LOG") != nullptr;
|
||||
const char* GGML_VK_PIPELINE_STATS = getenv("GGML_VK_PIPELINE_STATS");
|
||||
if (GGML_VK_PIPELINE_STATS != nullptr) {
|
||||
vk_pipeline_stats_filter = GGML_VK_PIPELINE_STATS;
|
||||
@@ -6392,6 +6419,7 @@ static vk_subbuffer ggml_vk_tensor_subbuffer(
|
||||
static vk_command_buffer* ggml_vk_get_or_create_cmd_buffer(vk_device& device, vk_command_pool& pool) {
|
||||
for (auto& cmd_buffer : pool.cmd_buffers) {
|
||||
if (!cmd_buffer.in_use) {
|
||||
cmd_buffer.use_counter++;
|
||||
cmd_buffer.in_use = true;
|
||||
return &cmd_buffer;
|
||||
}
|
||||
@@ -6495,11 +6523,13 @@ static void ggml_vk_ctx_begin(vk_device& device, vk_context& subctx) {
|
||||
subctx->s = subctx->seqs[subctx->seqs.size() - 1].data();
|
||||
}
|
||||
|
||||
static vk_context ggml_vk_get_compute_ctx(ggml_backend_vk_context * ctx) {
|
||||
static vk_context ggml_vk_get_compute_ctx(ggml_backend_vk_context * ctx, const char * caller = "unknown") {
|
||||
if (!ctx->compute_ctx.expired()) {
|
||||
return ctx->compute_ctx.lock();
|
||||
}
|
||||
|
||||
VK_SYNC_LOG(ctx->name, "get_compute_ctx: CREATING NEW compute_ctx, caller=" << caller);
|
||||
|
||||
vk_context result = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
|
||||
|
||||
ctx->compute_ctx = result;
|
||||
@@ -12652,7 +12682,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx, vk_contex
|
||||
ggml_vk_ctx_end(subctx);
|
||||
ggml_vk_submit(subctx, {});
|
||||
ctx->submit_pending = true;
|
||||
ggml_vk_synchronize(ctx);
|
||||
ggml_vk_synchronize(ctx, "preallocate_buffers");
|
||||
GGML_ASSERT(ctx->compute_ctx.expired());
|
||||
ggml_vk_ctx_begin(ctx->device, subctx);
|
||||
ctx->compute_ctx = subctx;
|
||||
@@ -12729,7 +12759,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
|
||||
}
|
||||
}
|
||||
|
||||
vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx);
|
||||
vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx, "build_graph");
|
||||
|
||||
{
|
||||
// This logic detects dependencies between modes in the graph and calls ggml_vk_sync_buffers
|
||||
@@ -13228,7 +13258,7 @@ static void ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph *
|
||||
ctx->submit_pending = true;
|
||||
|
||||
#ifdef GGML_VULKAN_CHECK_RESULTS
|
||||
ggml_vk_synchronize(ctx);
|
||||
ggml_vk_synchronize(ctx, "check_results");
|
||||
ggml_vk_check_results_1(ctx, cgraph, tensor_idx);
|
||||
#endif
|
||||
}
|
||||
@@ -13287,7 +13317,7 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
|
||||
// discard any unsubmitted command buffers
|
||||
ctx->compute_ctx.reset();
|
||||
// wait for any pending command buffers to finish
|
||||
ggml_vk_synchronize(ctx);
|
||||
ggml_vk_synchronize(ctx, "cleanup");
|
||||
|
||||
ggml_vk_graph_cleanup(ctx);
|
||||
|
||||
@@ -13606,6 +13636,7 @@ static ggml_backend_buffer_type_t ggml_backend_vk_get_default_buffer_type(ggml_b
|
||||
static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
VK_LOG_DEBUG("ggml_backend_vk_set_tensor_async(" << size << ")");
|
||||
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
||||
VK_SYNC_LOG(ctx->name, "set_tensor_async: tensor=" << tensor->name << " size=" << size);
|
||||
GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
|
||||
|
||||
if (size == 0) {
|
||||
@@ -13626,7 +13657,7 @@ static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor
|
||||
cpy_ctx = ctx->transfer_ctx.lock();
|
||||
}
|
||||
} else {
|
||||
cpy_ctx = ggml_vk_get_compute_ctx(ctx);
|
||||
cpy_ctx = ggml_vk_get_compute_ctx(ctx, "set_tensor_async");
|
||||
}
|
||||
|
||||
vk_buffer buf = buf_ctx->dev_buffer;
|
||||
@@ -13646,13 +13677,14 @@ static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor
|
||||
|
||||
cpy_ctx->s->buffer->buf.copyBuffer(ctx->sync_staging->buffer, buf->buffer, { buffer_cpy });
|
||||
deferred_memcpy(ctx->sync_staging->ptr, data, size, &cpy_ctx->in_memcpys);
|
||||
ggml_vk_synchronize(ctx);
|
||||
ggml_vk_synchronize(ctx, "set_tensor_async_staging");
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||
VK_LOG_DEBUG("ggml_backend_vk_get_tensor_async(" << size << ")");
|
||||
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
||||
VK_SYNC_LOG(ctx->name, "get_tensor_async: tensor=" << tensor->name << " size=" << size);
|
||||
GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
|
||||
|
||||
if (size == 0) {
|
||||
@@ -13661,7 +13693,7 @@ static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_
|
||||
|
||||
ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context;
|
||||
|
||||
vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx);
|
||||
vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx, "get_tensor_async");
|
||||
|
||||
vk_buffer buf = buf_ctx->dev_buffer;
|
||||
|
||||
@@ -13680,13 +13712,17 @@ static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_
|
||||
|
||||
compute_ctx->s->buffer->buf.copyBuffer(buf->buffer, ctx->sync_staging->buffer, { buffer_cpy });
|
||||
deferred_memcpy(data, ctx->sync_staging->ptr, size, &compute_ctx->out_memcpys);
|
||||
ggml_vk_synchronize(ctx);
|
||||
ggml_vk_synchronize(ctx, "get_tensor_async_staging");
|
||||
}
|
||||
}
|
||||
|
||||
static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
|
||||
VK_LOG_DEBUG("ggml_backend_vk_cpy_tensor_async(" << src << " -> " << dst << ", size=" << ggml_nbytes(src) << ")");
|
||||
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend_dst->context;
|
||||
VK_SYNC_LOG(ctx->name, "cpy_tensor_async: BEGIN " << src->name << " -> " << dst->name
|
||||
<< " size=" << ggml_nbytes(src)
|
||||
<< " src_is_vk=" << ggml_backend_buffer_is_vk(src->buffer)
|
||||
<< " src_is_host=" << ggml_backend_buffer_is_host(src->buffer));
|
||||
|
||||
// Skip zero-size tensors
|
||||
if (ggml_nbytes(src) == 0) {
|
||||
@@ -13708,7 +13744,7 @@ static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend_src, ggml_ba
|
||||
return false;
|
||||
}
|
||||
|
||||
vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx);
|
||||
vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx, "cpy_tensor_async_vk2vk");
|
||||
|
||||
ggml_vk_buffer_copy_async(compute_ctx, dst_buf, vk_tensor_offset(dst) + dst->view_offs,
|
||||
src_buf_ctx->dev_buffer, vk_tensor_offset(src) + src->view_offs,
|
||||
@@ -13721,6 +13757,7 @@ static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend_src, ggml_ba
|
||||
size_t pinned_offset = 0;
|
||||
ggml_vk_host_get(ctx->device, src->data, pinned_buf, pinned_offset);
|
||||
if (pinned_buf == nullptr) {
|
||||
VK_SYNC_LOG(ctx->name, "cpy_tensor_async: host src not pinned, returning false");
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -13730,27 +13767,34 @@ static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend_src, ggml_ba
|
||||
cpy_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool);
|
||||
ctx->transfer_ctx = cpy_ctx;
|
||||
ggml_vk_ctx_begin(ctx->device, cpy_ctx);
|
||||
VK_SYNC_LOG(ctx->name, "cpy_tensor_async: created new transfer_ctx");
|
||||
} else {
|
||||
cpy_ctx = ctx->transfer_ctx.lock();
|
||||
}
|
||||
VK_SYNC_LOG(ctx->name, "cpy_tensor_async: using transfer queue for host->dev copy");
|
||||
} else {
|
||||
cpy_ctx = ggml_vk_get_compute_ctx(ctx);
|
||||
cpy_ctx = ggml_vk_get_compute_ctx(ctx, "cpy_tensor_async_host2dev");
|
||||
VK_SYNC_LOG(ctx->name, "cpy_tensor_async: using compute queue for host->dev copy");
|
||||
}
|
||||
|
||||
return ggml_vk_buffer_write_async(cpy_ctx, dst_buf,
|
||||
bool ret = ggml_vk_buffer_write_async(cpy_ctx, dst_buf,
|
||||
vk_tensor_offset(dst) + dst->view_offs,
|
||||
src->data, ggml_nbytes(src));
|
||||
VK_SYNC_LOG(ctx->name, "cpy_tensor_async: host->dev write_async returned " << ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
GGML_UNUSED(backend_src);
|
||||
return false;
|
||||
}
|
||||
|
||||
static void ggml_vk_synchronize(ggml_backend_vk_context * ctx) {
|
||||
static void ggml_vk_synchronize(ggml_backend_vk_context * ctx, const char * caller) {
|
||||
VK_LOG_DEBUG("ggml_vk_synchronize()");
|
||||
|
||||
bool do_transfer = !ctx->compute_ctx.expired();
|
||||
|
||||
VK_SYNC_LOG(ctx->name, "vk_synchronize: do_transfer=" << do_transfer << " submit_pending=" << ctx->submit_pending << " caller=" << caller);
|
||||
|
||||
if (ggml_vk_submit_transfer_ctx(ctx)) {
|
||||
ctx->submit_pending = true;
|
||||
}
|
||||
@@ -13774,6 +13818,7 @@ static void ggml_vk_synchronize(ggml_backend_vk_context * ctx) {
|
||||
}
|
||||
|
||||
if (ctx->submit_pending) {
|
||||
VK_SYNC_LOG(ctx->name, "vk_synchronize: waiting for fence...");
|
||||
if (ctx->device->async_use_transfer_queue && ctx->transfer_semaphore_last_submitted < ctx->transfer_semaphore.value) {
|
||||
vk::TimelineSemaphoreSubmitInfo tl_info{
|
||||
1, &ctx->transfer_semaphore.value,
|
||||
@@ -13797,6 +13842,7 @@ static void ggml_vk_synchronize(ggml_backend_vk_context * ctx) {
|
||||
ctx->submit_pending = false;
|
||||
if (cmd_buf) {
|
||||
cmd_buf->in_use = false;
|
||||
cmd_buf->buf.reset();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -13812,9 +13858,16 @@ static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
|
||||
VK_LOG_DEBUG("ggml_backend_vk_synchronize()");
|
||||
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
||||
|
||||
ggml_vk_synchronize(ctx);
|
||||
VK_SYNC_LOG(ctx->name, "synchronize: BEGIN submit_pending=" << ctx->submit_pending
|
||||
<< " compute_ctx_alive=" << !ctx->compute_ctx.expired());
|
||||
|
||||
ggml_vk_synchronize(ctx, "backend_synchronize");
|
||||
|
||||
VK_SYNC_LOG(ctx->name, "synchronize: fence done, calling graph_cleanup");
|
||||
|
||||
ggml_vk_graph_cleanup(ctx);
|
||||
|
||||
VK_SYNC_LOG(ctx->name, "synchronize: DONE");
|
||||
}
|
||||
|
||||
static bool ggml_vk_is_empty(ggml_tensor * node) {
|
||||
@@ -14233,6 +14286,10 @@ static int32_t find_first_set(uint32_t x) {
|
||||
static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
||||
VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)");
|
||||
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
||||
VK_SYNC_LOG(ctx->name, "graph_compute: BEGIN nodes=" << cgraph->n_nodes
|
||||
<< " compute_ctx_alive=" << !ctx->compute_ctx.expired()
|
||||
<< " transfer_ctx_alive=" << !ctx->transfer_ctx.expired()
|
||||
<< " submit_pending=" << ctx->submit_pending);
|
||||
|
||||
if (vk_instance.debug_utils_support) {
|
||||
vk::DebugUtilsLabelEXT dul = {};
|
||||
@@ -14285,7 +14342,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||
std::fill(ctx->query_node_idx.begin(), ctx->query_node_idx.end(), 0);
|
||||
|
||||
GGML_ASSERT(ctx->compute_ctx.expired());
|
||||
compute_ctx = ggml_vk_get_compute_ctx(ctx);
|
||||
compute_ctx = ggml_vk_get_compute_ctx(ctx, "graph_compute_perf");
|
||||
ctx->query_idx = 0;
|
||||
compute_ctx->s->buffer->buf.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
|
||||
}
|
||||
@@ -14295,7 +14352,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||
|
||||
if (ctx->prealloc_size_add_rms_partials) {
|
||||
ggml_vk_preallocate_buffers(ctx, nullptr);
|
||||
compute_ctx = ggml_vk_get_compute_ctx(ctx);
|
||||
compute_ctx = ggml_vk_get_compute_ctx(ctx, "graph_compute_rms_partials");
|
||||
// initialize partial sums to zero.
|
||||
ggml_vk_buffer_memset_async(compute_ctx, ctx->prealloc_add_rms_partials, 0, 0, ctx->prealloc_size_add_rms_partials);
|
||||
ggml_vk_sync_buffers(ctx, compute_ctx);
|
||||
@@ -14518,7 +14575,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||
bool enqueued = ggml_vk_build_graph(ctx, cgraph, i, cgraph->nodes[submit_node_idx], submit_node_idx, i + ctx->num_additional_fused_ops >= last_node, almost_ready, submit);
|
||||
|
||||
if (vk_perf_logger_enabled && enqueued) {
|
||||
compute_ctx = ggml_vk_get_compute_ctx(ctx);
|
||||
compute_ctx = ggml_vk_get_compute_ctx(ctx, "graph_compute_perf_timestamp");
|
||||
if (!vk_perf_logger_concurrent) {
|
||||
// track a single node/fusion for the current query
|
||||
ctx->query_nodes[ctx->query_idx] = cgraph->nodes[i];
|
||||
@@ -14601,9 +14658,13 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||
}
|
||||
|
||||
if (!ctx->device->support_async) {
|
||||
ggml_vk_synchronize(ctx);
|
||||
VK_SYNC_LOG(ctx->name, "graph_compute: sync (support_async=false)");
|
||||
ggml_vk_synchronize(ctx, "graph_compute_no_async");
|
||||
}
|
||||
|
||||
VK_SYNC_LOG(ctx->name, "graph_compute: DONE submit_pending=" << ctx->submit_pending
|
||||
<< " compute_ctx_alive=" << !ctx->compute_ctx.expired());
|
||||
|
||||
return GGML_STATUS_SUCCESS;
|
||||
|
||||
UNUSED(backend);
|
||||
@@ -14853,24 +14914,45 @@ static void ggml_backend_vk_event_record(ggml_backend_t backend, ggml_backend_ev
|
||||
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
||||
vk_event *vkev = (vk_event *)event->context;
|
||||
|
||||
VK_SYNC_LOG(ctx->name, "event_record: BEGIN event=" << event
|
||||
<< " compute_ctx_alive=" << !ctx->compute_ctx.expired()
|
||||
<< " submit_pending=" << ctx->submit_pending);
|
||||
|
||||
ggml_vk_submit_transfer_ctx(ctx);
|
||||
|
||||
vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx);
|
||||
vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx, "event_record");
|
||||
auto* cmd_buf = compute_ctx->s->buffer; // retrieve pointer before it gets reset
|
||||
|
||||
// the backend interface doesn't have an explicit reset, so reset it here
|
||||
// before we record the command to set it
|
||||
ctx->device->device.resetEvent(vkev->event);
|
||||
ctx->device->device.resetFences({ vkev->fence });
|
||||
if (vkev->has_event) {
|
||||
// Move existing event into submitted
|
||||
vkev->events_submitted.push_back(vkev->event);
|
||||
}
|
||||
|
||||
// Grab the next event and record it, create one if necessary
|
||||
if (vkev->events_free.empty()) {
|
||||
VK_SYNC_LOG(ctx->name, "event_record: create new event");
|
||||
vkev->event = ctx->device->device.createEvent({});
|
||||
} else {
|
||||
vkev->event = vkev->events_free.back();
|
||||
vkev->events_free.pop_back();
|
||||
}
|
||||
|
||||
vkev->has_event = true;
|
||||
|
||||
ggml_vk_set_event(compute_ctx, vkev->event);
|
||||
|
||||
vkev->tl_semaphore.value++;
|
||||
compute_ctx->s->signal_semaphores.push_back(vkev->tl_semaphore);
|
||||
ggml_vk_ctx_end(compute_ctx);
|
||||
|
||||
ggml_vk_submit(compute_ctx, {vkev->fence});
|
||||
VK_SYNC_LOG(ctx->name, "event_record: set event, submit cmd_buf=" << cmd_buf->buf << ", vk_event=" << (VkEvent)vkev->event);
|
||||
ggml_vk_submit(compute_ctx, {});
|
||||
ctx->submit_pending = true;
|
||||
vkev->cmd_buffer = cmd_buf;
|
||||
vkev->cmd_buffer_use_counter = cmd_buf->use_counter;
|
||||
ctx->compute_ctx.reset();
|
||||
|
||||
VK_SYNC_LOG(ctx->name, "event_record: DONE event=" << event);
|
||||
}
|
||||
|
||||
static void ggml_backend_vk_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
|
||||
@@ -14878,11 +14960,16 @@ static void ggml_backend_vk_event_wait(ggml_backend_t backend, ggml_backend_even
|
||||
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
||||
vk_event *vkev = (vk_event *)event->context;
|
||||
|
||||
vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx);
|
||||
vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx, "event_wait");
|
||||
|
||||
ggml_vk_wait_events(compute_ctx, {vkev->event});
|
||||
ggml_vk_ctx_end(compute_ctx);
|
||||
ctx->compute_ctx.reset();
|
||||
if (vkev->has_event) {
|
||||
// Wait for latest event
|
||||
VK_SYNC_LOG(ctx->name, "event_wait: recording vkCmdWaitEvents, event=" << event
|
||||
<< " vk_event=" << (VkEvent)vkev->event
|
||||
<< " compute_ctx_alive=" << !ctx->compute_ctx.expired());
|
||||
ggml_vk_wait_events(compute_ctx, { vkev->event });
|
||||
VK_SYNC_LOG(ctx->name, "event_wait: DONE (cmd recorded, not yet submitted)");
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: enable async and synchronize
|
||||
@@ -15672,10 +15759,13 @@ static ggml_backend_event_t ggml_backend_vk_device_event_new(ggml_backend_dev_t
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// The event/fence is expected to initially be in the signaled state.
|
||||
vkev->event = device->device.createEvent({});
|
||||
vkev->fence = device->device.createFence({vk::FenceCreateFlagBits::eSignaled});
|
||||
device->device.setEvent(vkev->event);
|
||||
// No events initially, they get created on demand
|
||||
vkev->has_event = false;
|
||||
|
||||
vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eTimeline, 0 };
|
||||
vk::SemaphoreCreateInfo ci{};
|
||||
ci.setPNext(&tci);
|
||||
vkev->tl_semaphore = { device->device.createSemaphore(ci), 0 };
|
||||
|
||||
return new ggml_backend_event {
|
||||
/* .device = */ dev,
|
||||
@@ -15689,8 +15779,16 @@ static void ggml_backend_vk_device_event_free(ggml_backend_dev_t dev, ggml_backe
|
||||
|
||||
vk_event *vkev = (vk_event *)event->context;
|
||||
|
||||
device->device.destroyFence(vkev->fence);
|
||||
device->device.destroyEvent(vkev->event);
|
||||
device->device.destroySemaphore(vkev->tl_semaphore.s);
|
||||
for (auto& event : vkev->events_free) {
|
||||
device->device.destroyEvent(event);
|
||||
}
|
||||
for (auto& event : vkev->events_submitted) {
|
||||
device->device.destroyEvent(event);
|
||||
}
|
||||
if (vkev->has_event) {
|
||||
device->device.destroyEvent(vkev->event);
|
||||
}
|
||||
delete vkev;
|
||||
delete event;
|
||||
}
|
||||
@@ -15701,10 +15799,33 @@ static void ggml_backend_vk_device_event_synchronize(ggml_backend_dev_t dev, ggm
|
||||
auto device = ggml_vk_get_device(ctx->device);
|
||||
vk_event *vkev = (vk_event *)event->context;
|
||||
|
||||
VK_CHECK(device->device.waitForFences({ vkev->fence }, true, UINT64_MAX), "event_synchronize");
|
||||
// Finished using current command buffer so we flag for reuse
|
||||
if (vkev->cmd_buffer) {
|
||||
vkev->cmd_buffer->in_use = false;
|
||||
// Only do something if the event has actually been used
|
||||
if (vkev->has_event) {
|
||||
VK_SYNC_LOG(device->name, "event_synchronize: BEGIN waiting on timeline semaphore, event=" << event);
|
||||
vk::Semaphore sem = vkev->tl_semaphore.s;
|
||||
uint64_t val = vkev->tl_semaphore.value;
|
||||
vk::SemaphoreWaitInfo swi{vk::SemaphoreWaitFlags{}, sem, val};
|
||||
VK_CHECK(device->device.waitSemaphores(swi, UINT64_MAX), "event_synchronize");
|
||||
|
||||
VK_SYNC_LOG(device->name, "event_synchronize: timeline semaphore wait done, event=" << event);
|
||||
|
||||
// Reset and move submitted events
|
||||
for (auto& event : vkev->events_submitted) {
|
||||
VK_SYNC_LOG(device->name, "event_synchronize: resetting vkevent=" << event);
|
||||
device->device.resetEvent(event);
|
||||
}
|
||||
vkev->events_free.insert(vkev->events_free.end(), vkev->events_submitted.begin(), vkev->events_submitted.end());
|
||||
vkev->events_submitted.clear();
|
||||
|
||||
// Finished using current command buffer so we flag for reuse
|
||||
if (vkev->cmd_buffer) {
|
||||
// Only flag for reuse if it hasn't been reused already
|
||||
if (vkev->cmd_buffer_use_counter == vkev->cmd_buffer->use_counter) {
|
||||
vkev->cmd_buffer->in_use = false;
|
||||
vkev->cmd_buffer->buf.reset();
|
||||
}
|
||||
vkev->cmd_buffer = nullptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user