Mirror of https://github.com/ggerganov/llama.cpp.git, synced 2026-04-30 16:47:31 +03:00

Compare commits: b8317...0cc4m/vulk (18 commits)
| SHA1 |
|---|
| 0776a6a039 |
| 937a425600 |
| 5c177a1036 |
| ccd8d4a6ce |
| 4374b5ab9a |
| eebf21c3e9 |
| 08a4ba6f03 |
| 2204bcedc8 |
| c0d100e0fc |
| 58deae173e |
| 73c9eb8ced |
| 983df142a9 |
| 57819b8d4b |
| 557fe2d913 |
| 0e810413bb |
| 128142fe7d |
| 6de1bc631d |
| 0a10c34dc1 |
@@ -732,23 +732,28 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
"llama-completion",
"llama-convert-llama2c-to-ggml",
"llama-cvector-generator",
"llama-debug",
"llama-diffusion-cli",
"llama-embedding",
"llama-eval-callback",
"llama-export-lora",
"llama-finetune",
"llama-fit-params",
"llama-gemma3-cli",
"llama-gen-docs",
"llama-gguf",
"llama-gguf-hash",
"llama-gguf-split",
"llama-gritlm",
"llama-idle",
"llama-imatrix",
"llama-infill",
"llama-mtmd-cli",
"llama-llava-clip-quantize-cli",
"llama-llava-cli",
"llama-lookahead",
"llama-lookup",
"llama-lookup-create",
"llama-lookup-merge",
"llama-lookup-stats",
"llama-minicpmv-cli",
"llama-mtmd-cli",
"llama-parallel",
"llama-passkey",
"llama-perplexity",

@@ -2666,7 +2671,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.out_file = value;
}
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_RESULTS}));
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE,
LLAMA_EXAMPLE_RESULTS, LLAMA_EXAMPLE_EXPORT_GRAPH_OPS}));
add_opt(common_arg(
{"-ofreq", "--output-frequency"}, "N",
string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
@@ -105,6 +105,7 @@ enum llama_example {
LLAMA_EXAMPLE_FINETUNE,
LLAMA_EXAMPLE_FIT_PARAMS,
LLAMA_EXAMPLE_RESULTS,
LLAMA_EXAMPLE_EXPORT_GRAPH_OPS,

LLAMA_EXAMPLE_COUNT,
};
@@ -2194,6 +2194,8 @@ class GPTNeoXModel(TextModel):
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
assert n_head is not None
assert n_embed is not None

if re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.weight", name):
# Map bloom-style qkv_linear to gpt-style qkv_linear

@@ -2231,6 +2233,8 @@ class BloomModel(TextModel):
def set_gguf_parameters(self):
n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
assert n_head is not None
assert n_embed is not None
self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
self.gguf_writer.add_embedding_length(n_embed)
self.gguf_writer.add_feed_forward_length(4 * n_embed)

@@ -2243,6 +2247,8 @@ class BloomModel(TextModel):
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
assert n_head is not None
assert n_embed is not None

name = re.sub(r'transformer\.', '', name)

@@ -3853,6 +3859,7 @@ class LLaDAModel(TextModel):

if (rope_dim := hparams.get("head_dim")) is None:
n_heads = hparams.get("num_attention_heads", hparams.get("n_heads"))
assert n_heads is not None
rope_dim = hparams.get("hidden_size", hparams.get("d_model")) // n_heads
self.gguf_writer.add_rope_dimension_count(rope_dim)

@@ -3884,6 +3891,7 @@ class LLaDAModel(TextModel):

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
n_head = self.hparams.get("num_attention_heads", self.hparams.get("n_heads"))
assert n_head is not None
n_kv_head = self.hparams.get("num_key_value_heads", self.hparams.get("n_kv_heads"))

if self.undo_permute:

@@ -9485,7 +9493,9 @@ class ChatGLMModel(TextModel):

def set_gguf_parameters(self):
n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
assert n_embed is not None
n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
assert n_head is not None
n_head_kv = self.hparams.get("multi_query_group_num", self.hparams.get("num_key_value_heads", n_head))
self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
self.gguf_writer.add_embedding_length(n_embed)
@@ -1450,13 +1450,17 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
std::vector<int32_t> ids;
std::vector<ggml_bitset_t> used_ids;

static bool vk_sched_sync_log = getenv("GGML_VK_SYNC_LOG") != nullptr;

for (int split_id = 0; split_id < sched->n_splits; split_id++) {
struct ggml_backend_sched_split * split = &splits[split_id];
int split_backend_id = split->backend_id;
ggml_backend_t split_backend = sched->backends[split_backend_id];

if (sched->events[split_backend_id][sched->cur_copy] == NULL) {
ggml_backend_synchronize(split_backend);
if (vk_sched_sync_log) {
fprintf(stderr, "[VK_SYNC sched] split %d/%d backend_id=%d name=%s n_inputs=%d n_nodes=%d\n",
split_id, sched->n_splits, split_backend_id,
ggml_backend_name(split_backend), split->n_inputs, split->graph.n_nodes);
}

// copy the input tensors to the split backend

@@ -1468,13 +1472,29 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
if (input->flags & GGML_TENSOR_FLAG_INPUT) {
// inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
if (vk_sched_sync_log) {
fprintf(stderr, "[VK_SYNC sched] input %s: event_synchronize (INPUT flag)\n", input->name);
}
ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
} else {
if (vk_sched_sync_log) {
fprintf(stderr, "[VK_SYNC sched] input %s: backend_synchronize (INPUT flag, no event)\n", input->name);
}
ggml_backend_synchronize(split_backend);
}
ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
ggml_backend_tensor_copy(input, input_cpy);
} else {
// wait for the split backend to finish using the input before overwriting it
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
if (vk_sched_sync_log) {
fprintf(stderr, "[VK_SYNC sched] input %s: event_wait\n", input->name);
}
ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
} else {
if (vk_sched_sync_log) {
fprintf(stderr, "[VK_SYNC sched] input %s: backend_synchronize (no event)\n", input->name);
}
ggml_backend_synchronize(split_backend);
}

// when offloading MoE weights, we can reduce the amount of data copied by copying only the experts that are used

@@ -1565,7 +1585,14 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
} else {
// try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
// TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
bool async_ok = split_backend->iface.cpy_tensor_async && split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy);
if (vk_sched_sync_log) {
fprintf(stderr, "[VK_SYNC sched] input %s: cpy_tensor_async=%s\n", input->name, async_ok ? "true" : "false");
}
if (!async_ok) {
if (vk_sched_sync_log) {
fprintf(stderr, "[VK_SYNC sched] input %s: fallback sync copy\n", input->name);
}
ggml_backend_synchronize(input_backend);
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);

@@ -1578,11 +1605,11 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
}
}

if (sched->events[split_backend_id][sched->cur_copy] == NULL) {
ggml_backend_synchronize(split_backend);
}

if (!sched->callback_eval) {
if (vk_sched_sync_log) {
fprintf(stderr, "[VK_SYNC sched] graph_compute_async on %s (%d nodes)\n",
ggml_backend_name(split_backend), split->graph.n_nodes);
}
enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
if (ec != GGML_STATUS_SUCCESS) {
return ec;

@@ -1624,6 +1651,9 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
// record the event of this copy
if (split->n_inputs > 0) {
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
if (vk_sched_sync_log) {
fprintf(stderr, "[VK_SYNC sched] event_record on %s\n", ggml_backend_name(split_backend));
}
ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy], split_backend);
}
}
@@ -2823,14 +2823,11 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;

// enables async copies from CPU to CUDA, instead of only CUDA-to-CUDA
bool copy_from_host = ggml_backend_buffer_is_host(buf_src) && ggml_backend_dev_type(backend_src->device) == GGML_BACKEND_DEVICE_TYPE_CPU;

if (!(copy_from_host || ggml_backend_is_cuda(backend_src)) || !ggml_backend_is_cuda(backend_dst)) {
if (!ggml_backend_is_cuda(backend_src) || !ggml_backend_is_cuda(backend_dst)) {
return false;
}

if (!(copy_from_host || ggml_backend_buffer_is_cuda(buf_src)) || !ggml_backend_buffer_is_cuda(dst->buffer)) {
if (!ggml_backend_buffer_is_cuda(src->buffer) || !ggml_backend_buffer_is_cuda(dst->buffer)) {
return false;
}

@@ -2841,17 +2838,14 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;

if ((copy_from_host && cuda_ctx_dst->device != buf_ctx_dst->device) ||
!copy_from_host && (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device)) {
if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: backend and buffer devices do not match\n", __func__);
#endif
return false;
}

if (copy_from_host) {
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyHostToDevice, cuda_ctx_dst->stream()));
} else if (backend_src != backend_dst) {
if (backend_src != backend_dst) {
// copy on src stream
if (cuda_ctx_src->device == cuda_ctx_dst->device) {
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
@@ -1156,7 +1156,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
case GGML_OP_RWKV_WKV7:
return true;
case GGML_OP_GATED_DELTA_NET:
return op->src[2]->ne[0] % 32 == 0;
return has_simdgroup_reduction && op->src[2]->ne[0] % 32 == 0;
case GGML_OP_SOLVE_TRI:
case GGML_OP_MUL_MAT:
case GGML_OP_MUL_MAT_ID:
@@ -3006,7 +3006,7 @@ kernel void kernel_l2_norm_impl(
sumf = shmem_f32[tiisg];
sumf = simd_sum(sumf);

const float scale = 1.0f/sqrt(max(sumf, args.eps));
const float scale = 1.0f/max(sqrt(sumf), args.eps);

for (int i00 = tpitg.x; i00 < args.ne00; i00 += ntg.x) {
y[i00] = x[i00] * scale;
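The one-line change above moves the epsilon clamp outside the square root. Written out side by side (a reconstruction for comparison, not part of the diff):

old: scale = 1 / sqrt(max(Σx², ε))
new: scale = 1 / max(sqrt(Σx²), ε)

In the new form, ε acts as a floor on the L2 norm itself rather than on the squared sum, so ε is interpreted in the same units as the norm; this appears to match how the eps parameter is applied in ggml's other l2_norm backends.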
@@ -115,6 +115,17 @@ static bool is_pow2(uint32_t x) { return x > 1 && (x & (x-1)) == 0; }
#define VK_LOG_DEBUG(msg) ((void) 0)
#endif // GGML_VULKAN_DEBUG

// Synchronization tracing for multi-GPU deadlock debugging.
// Enable with GGML_VK_SYNC_LOG=1 environment variable.
static bool vk_sync_log_enabled = false;
#define VK_SYNC_LOG(dev_name, msg) do { \
    if (vk_sync_log_enabled) { \
        fprintf(stderr, "[VK_SYNC %s tid=%lu] %s\n", \
            (dev_name).c_str(), (unsigned long)std::hash<std::thread::id>{}(std::this_thread::get_id()) % 10000, \
            ((std::ostringstream&)(std::ostringstream() << msg)).str().c_str()); \
    } \
} while (0)

struct ggml_backend_vk_context;

#define MAX_PARAMETER_COUNT 12
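A note on the macro body above: `std::ostringstream() << msg` yields a plain `std::ostream&`, and the cast back to `std::ostringstream&` is the usual trick that makes `.str()` reachable on the temporary, so call sites can stream arbitrary expressions. A call then looks like the ones appearing throughout this diff, e.g.:

VK_SYNC_LOG(ctx->name, "set_tensor_async: tensor=" << tensor->name << " size=" << size);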
@@ -191,6 +202,7 @@ struct vk_queue;

struct vk_command_buffer {
vk::CommandBuffer buf;
uint64_t use_counter = 0;
bool in_use = false;
};

@@ -254,7 +266,7 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
class vk_memory_logger;
class vk_perf_logger;
static void ggml_vk_destroy_buffer(vk_buffer& buf);
static void ggml_vk_synchronize(ggml_backend_vk_context * ctx);
static void ggml_vk_synchronize(ggml_backend_vk_context * ctx, const char * caller = "unknown");

static constexpr uint32_t mul_mat_vec_max_cols = 8;
static constexpr uint32_t p021_max_gqa_ratio = 8;

@@ -938,21 +950,26 @@ struct vk_subbuffer {
}
};

// vk_event is used for the event-related backend interfaces. It uses 'event' for
// event_wait and 'fence' for event_synchronize. Polling on an event for
// event_synchronize wouldn't be sufficient to wait for command buffers to complete,
// and would lead to validation errors.
struct vk_event {
vk::Event event;
vk::Fence fence;
vk_command_buffer* cmd_buffer = nullptr;
};

struct vk_semaphore {
vk::Semaphore s;
uint64_t value;
};

// vk_event is used for the event-related backend interfaces. It uses vk::Events for
// event_wait and a timeline semaphore for event_synchronize. Polling on an event for
// event_synchronize wouldn't be sufficient to wait for command buffers to complete,
// and would lead to validation errors.
struct vk_event {
std::vector<vk::Event> events_free;      // Events available for reuse
std::vector<vk::Event> events_submitted; // Events that are fully submitted and can be reused on next synchronize
vk::Event event;
bool has_event;

vk_semaphore tl_semaphore;
vk_command_buffer* cmd_buffer = nullptr;
uint64_t cmd_buffer_use_counter = 0;
};

struct vk_submission {
vk_command_buffer* buffer = nullptr;
std::vector<vk_semaphore> wait_semaphores;
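The rewrite above trades the single event/fence pair for an event pool plus a timeline semaphore. A fence has to be reset between submissions, which forced event_record and event_synchronize into strict alternation; a timeline semaphore only counts upward, so each event_record can bump tl_semaphore.value and event_synchronize waits for the latest value. A minimal sketch of the new wait path, mirroring the event_synchronize hunk later in this diff:

// wait until the device has signaled the most recently recorded value
vk::SemaphoreWaitInfo swi{vk::SemaphoreWaitFlags{}, vkev->tl_semaphore.s, vkev->tl_semaphore.value};
VK_CHECK(device->device.waitSemaphores(swi, UINT64_MAX), "event_synchronize");

Retired vk::Events collect in events_submitted and are reset and recycled into events_free on the next synchronize.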
@@ -2319,7 +2336,7 @@ static vk_command_buffer* ggml_vk_create_cmd_buffer(vk_device& device, vk_comman
vk::CommandBufferLevel::ePrimary,
1);
const std::vector<vk::CommandBuffer> cmd_buffers = device->device.allocateCommandBuffers(command_buffer_alloc_info);
p.cmd_buffers.push_back({ cmd_buffers.front(), true });
p.cmd_buffers.push_back({ cmd_buffers.front(), 0, true });
return &p.cmd_buffers[p.cmd_buffers.size()-1];
}
@@ -2788,6 +2805,15 @@ static void ggml_vk_sync_buffers(ggml_backend_vk_context* ctx, vk_context& subct
);
}

static void ggml_vk_reset_event(vk_context& ctx, vk::Event& event) {
VK_LOG_DEBUG("ggml_vk_reset_event()");

ctx->s->buffer->buf.resetEvent(
event,
ctx->p->q->stage_flags
);
}

static void ggml_vk_set_event(vk_context& ctx, vk::Event& event) {
VK_LOG_DEBUG("ggml_vk_set_event()");
@@ -5731,6 +5757,7 @@ static void ggml_vk_instance_init() {
vk_perf_logger_concurrent = getenv("GGML_VK_PERF_LOGGER_CONCURRENT") != nullptr;
vk_enable_sync_logger = getenv("GGML_VK_SYNC_LOGGER") != nullptr;
vk_memory_logger_enabled = getenv("GGML_VK_MEMORY_LOGGER") != nullptr;
vk_sync_log_enabled = getenv("GGML_VK_SYNC_LOG") != nullptr;
const char* GGML_VK_PIPELINE_STATS = getenv("GGML_VK_PIPELINE_STATS");
if (GGML_VK_PIPELINE_STATS != nullptr) {
vk_pipeline_stats_filter = GGML_VK_PIPELINE_STATS;

@@ -6392,6 +6419,7 @@ static vk_subbuffer ggml_vk_tensor_subbuffer(
static vk_command_buffer* ggml_vk_get_or_create_cmd_buffer(vk_device& device, vk_command_pool& pool) {
for (auto& cmd_buffer : pool.cmd_buffers) {
if (!cmd_buffer.in_use) {
cmd_buffer.use_counter++;
cmd_buffer.in_use = true;
return &cmd_buffer;
}

@@ -6495,11 +6523,13 @@ static void ggml_vk_ctx_begin(vk_device& device, vk_context& subctx) {
subctx->s = subctx->seqs[subctx->seqs.size() - 1].data();
}

static vk_context ggml_vk_get_compute_ctx(ggml_backend_vk_context * ctx) {
static vk_context ggml_vk_get_compute_ctx(ggml_backend_vk_context * ctx, const char * caller = "unknown") {
if (!ctx->compute_ctx.expired()) {
return ctx->compute_ctx.lock();
}

VK_SYNC_LOG(ctx->name, "get_compute_ctx: CREATING NEW compute_ctx, caller=" << caller);

vk_context result = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);

ctx->compute_ctx = result;

@@ -12652,7 +12682,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx, vk_contex
ggml_vk_ctx_end(subctx);
ggml_vk_submit(subctx, {});
ctx->submit_pending = true;
ggml_vk_synchronize(ctx);
ggml_vk_synchronize(ctx, "preallocate_buffers");
GGML_ASSERT(ctx->compute_ctx.expired());
ggml_vk_ctx_begin(ctx->device, subctx);
ctx->compute_ctx = subctx;
@@ -12729,7 +12759,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
}
}

vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx);
vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx, "build_graph");

{
// This logic detects dependencies between nodes in the graph and calls ggml_vk_sync_buffers
@@ -13228,7 +13258,7 @@ static void ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph *
ctx->submit_pending = true;

#ifdef GGML_VULKAN_CHECK_RESULTS
ggml_vk_synchronize(ctx);
ggml_vk_synchronize(ctx, "check_results");
ggml_vk_check_results_1(ctx, cgraph, tensor_idx);
#endif
}

@@ -13287,7 +13317,7 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
// discard any unsubmitted command buffers
ctx->compute_ctx.reset();
// wait for any pending command buffers to finish
ggml_vk_synchronize(ctx);
ggml_vk_synchronize(ctx, "cleanup");

ggml_vk_graph_cleanup(ctx);

@@ -13606,6 +13636,7 @@ static ggml_backend_buffer_type_t ggml_backend_vk_get_default_buffer_type(ggml_b
static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
VK_LOG_DEBUG("ggml_backend_vk_set_tensor_async(" << size << ")");
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
VK_SYNC_LOG(ctx->name, "set_tensor_async: tensor=" << tensor->name << " size=" << size);
GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");

if (size == 0) {

@@ -13626,7 +13657,7 @@ static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor
cpy_ctx = ctx->transfer_ctx.lock();
}
} else {
cpy_ctx = ggml_vk_get_compute_ctx(ctx);
cpy_ctx = ggml_vk_get_compute_ctx(ctx, "set_tensor_async");
}

vk_buffer buf = buf_ctx->dev_buffer;

@@ -13646,13 +13677,14 @@ static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor

cpy_ctx->s->buffer->buf.copyBuffer(ctx->sync_staging->buffer, buf->buffer, { buffer_cpy });
deferred_memcpy(ctx->sync_staging->ptr, data, size, &cpy_ctx->in_memcpys);
ggml_vk_synchronize(ctx);
ggml_vk_synchronize(ctx, "set_tensor_async_staging");
}
}

static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
VK_LOG_DEBUG("ggml_backend_vk_get_tensor_async(" << size << ")");
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
VK_SYNC_LOG(ctx->name, "get_tensor_async: tensor=" << tensor->name << " size=" << size);
GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");

if (size == 0) {

@@ -13661,7 +13693,7 @@ static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_

ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context;

vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx);
vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx, "get_tensor_async");

vk_buffer buf = buf_ctx->dev_buffer;

@@ -13680,13 +13712,17 @@ static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_

compute_ctx->s->buffer->buf.copyBuffer(buf->buffer, ctx->sync_staging->buffer, { buffer_cpy });
deferred_memcpy(data, ctx->sync_staging->ptr, size, &compute_ctx->out_memcpys);
ggml_vk_synchronize(ctx);
ggml_vk_synchronize(ctx, "get_tensor_async_staging");
}
}

static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
VK_LOG_DEBUG("ggml_backend_vk_cpy_tensor_async(" << src << " -> " << dst << ", size=" << ggml_nbytes(src) << ")");
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend_dst->context;
VK_SYNC_LOG(ctx->name, "cpy_tensor_async: BEGIN " << src->name << " -> " << dst->name
<< " size=" << ggml_nbytes(src)
<< " src_is_vk=" << ggml_backend_buffer_is_vk(src->buffer)
<< " src_is_host=" << ggml_backend_buffer_is_host(src->buffer));

// Skip zero-size tensors
if (ggml_nbytes(src) == 0) {

@@ -13708,7 +13744,7 @@ static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend_src, ggml_ba
return false;
}

vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx);
vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx, "cpy_tensor_async_vk2vk");

ggml_vk_buffer_copy_async(compute_ctx, dst_buf, vk_tensor_offset(dst) + dst->view_offs,
src_buf_ctx->dev_buffer, vk_tensor_offset(src) + src->view_offs,

@@ -13721,6 +13757,7 @@ static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend_src, ggml_ba
size_t pinned_offset = 0;
ggml_vk_host_get(ctx->device, src->data, pinned_buf, pinned_offset);
if (pinned_buf == nullptr) {
VK_SYNC_LOG(ctx->name, "cpy_tensor_async: host src not pinned, returning false");
return false;
}
@@ -13730,27 +13767,34 @@ static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend_src, ggml_ba
cpy_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool);
ctx->transfer_ctx = cpy_ctx;
ggml_vk_ctx_begin(ctx->device, cpy_ctx);
VK_SYNC_LOG(ctx->name, "cpy_tensor_async: created new transfer_ctx");
} else {
cpy_ctx = ctx->transfer_ctx.lock();
}
VK_SYNC_LOG(ctx->name, "cpy_tensor_async: using transfer queue for host->dev copy");
} else {
cpy_ctx = ggml_vk_get_compute_ctx(ctx);
cpy_ctx = ggml_vk_get_compute_ctx(ctx, "cpy_tensor_async_host2dev");
VK_SYNC_LOG(ctx->name, "cpy_tensor_async: using compute queue for host->dev copy");
}

return ggml_vk_buffer_write_async(cpy_ctx, dst_buf,
bool ret = ggml_vk_buffer_write_async(cpy_ctx, dst_buf,
vk_tensor_offset(dst) + dst->view_offs,
src->data, ggml_nbytes(src));
VK_SYNC_LOG(ctx->name, "cpy_tensor_async: host->dev write_async returned " << ret);
return ret;
}

GGML_UNUSED(backend_src);
return false;
}

static void ggml_vk_synchronize(ggml_backend_vk_context * ctx) {
static void ggml_vk_synchronize(ggml_backend_vk_context * ctx, const char * caller) {
VK_LOG_DEBUG("ggml_vk_synchronize()");

bool do_transfer = !ctx->compute_ctx.expired();

VK_SYNC_LOG(ctx->name, "vk_synchronize: do_transfer=" << do_transfer << " submit_pending=" << ctx->submit_pending << " caller=" << caller);

if (ggml_vk_submit_transfer_ctx(ctx)) {
ctx->submit_pending = true;
}
@@ -13774,6 +13818,7 @@ static void ggml_vk_synchronize(ggml_backend_vk_context * ctx) {
}

if (ctx->submit_pending) {
VK_SYNC_LOG(ctx->name, "vk_synchronize: waiting for fence...");
if (ctx->device->async_use_transfer_queue && ctx->transfer_semaphore_last_submitted < ctx->transfer_semaphore.value) {
vk::TimelineSemaphoreSubmitInfo tl_info{
1, &ctx->transfer_semaphore.value,

@@ -13797,6 +13842,7 @@ static void ggml_vk_synchronize(ggml_backend_vk_context * ctx) {
ctx->submit_pending = false;
if (cmd_buf) {
cmd_buf->in_use = false;
cmd_buf->buf.reset();
}
}

@@ -13812,9 +13858,16 @@ static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
VK_LOG_DEBUG("ggml_backend_vk_synchronize()");
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;

ggml_vk_synchronize(ctx);
VK_SYNC_LOG(ctx->name, "synchronize: BEGIN submit_pending=" << ctx->submit_pending
<< " compute_ctx_alive=" << !ctx->compute_ctx.expired());

ggml_vk_synchronize(ctx, "backend_synchronize");

VK_SYNC_LOG(ctx->name, "synchronize: fence done, calling graph_cleanup");

ggml_vk_graph_cleanup(ctx);

VK_SYNC_LOG(ctx->name, "synchronize: DONE");
}

static bool ggml_vk_is_empty(ggml_tensor * node) {

@@ -14233,6 +14286,10 @@ static int32_t find_first_set(uint32_t x) {
static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)");
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
VK_SYNC_LOG(ctx->name, "graph_compute: BEGIN nodes=" << cgraph->n_nodes
<< " compute_ctx_alive=" << !ctx->compute_ctx.expired()
<< " transfer_ctx_alive=" << !ctx->transfer_ctx.expired()
<< " submit_pending=" << ctx->submit_pending);

if (vk_instance.debug_utils_support) {
vk::DebugUtilsLabelEXT dul = {};

@@ -14285,7 +14342,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
std::fill(ctx->query_node_idx.begin(), ctx->query_node_idx.end(), 0);

GGML_ASSERT(ctx->compute_ctx.expired());
compute_ctx = ggml_vk_get_compute_ctx(ctx);
compute_ctx = ggml_vk_get_compute_ctx(ctx, "graph_compute_perf");
ctx->query_idx = 0;
compute_ctx->s->buffer->buf.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
}

@@ -14295,7 +14352,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg

if (ctx->prealloc_size_add_rms_partials) {
ggml_vk_preallocate_buffers(ctx, nullptr);
compute_ctx = ggml_vk_get_compute_ctx(ctx);
compute_ctx = ggml_vk_get_compute_ctx(ctx, "graph_compute_rms_partials");
// initialize partial sums to zero.
ggml_vk_buffer_memset_async(compute_ctx, ctx->prealloc_add_rms_partials, 0, 0, ctx->prealloc_size_add_rms_partials);
ggml_vk_sync_buffers(ctx, compute_ctx);

@@ -14518,7 +14575,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
bool enqueued = ggml_vk_build_graph(ctx, cgraph, i, cgraph->nodes[submit_node_idx], submit_node_idx, i + ctx->num_additional_fused_ops >= last_node, almost_ready, submit);

if (vk_perf_logger_enabled && enqueued) {
compute_ctx = ggml_vk_get_compute_ctx(ctx);
compute_ctx = ggml_vk_get_compute_ctx(ctx, "graph_compute_perf_timestamp");
if (!vk_perf_logger_concurrent) {
// track a single node/fusion for the current query
ctx->query_nodes[ctx->query_idx] = cgraph->nodes[i];

@@ -14601,9 +14658,13 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
}

if (!ctx->device->support_async) {
ggml_vk_synchronize(ctx);
VK_SYNC_LOG(ctx->name, "graph_compute: sync (support_async=false)");
ggml_vk_synchronize(ctx, "graph_compute_no_async");
}

VK_SYNC_LOG(ctx->name, "graph_compute: DONE submit_pending=" << ctx->submit_pending
<< " compute_ctx_alive=" << !ctx->compute_ctx.expired());

return GGML_STATUS_SUCCESS;

UNUSED(backend);
@@ -14853,24 +14914,45 @@ static void ggml_backend_vk_event_record(ggml_backend_t backend, ggml_backend_ev
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
vk_event *vkev = (vk_event *)event->context;

VK_SYNC_LOG(ctx->name, "event_record: BEGIN event=" << event
<< " compute_ctx_alive=" << !ctx->compute_ctx.expired()
<< " submit_pending=" << ctx->submit_pending);

ggml_vk_submit_transfer_ctx(ctx);

vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx);
vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx, "event_record");
auto* cmd_buf = compute_ctx->s->buffer; // retrieve pointer before it gets reset

// the backend interface doesn't have an explicit reset, so reset it here
// before we record the command to set it
ctx->device->device.resetEvent(vkev->event);
ctx->device->device.resetFences({ vkev->fence });
if (vkev->has_event) {
// Move existing event into submitted
vkev->events_submitted.push_back(vkev->event);
}

// Grab the next event and record it, create one if necessary
if (vkev->events_free.empty()) {
VK_SYNC_LOG(ctx->name, "event_record: create new event");
vkev->event = ctx->device->device.createEvent({});
} else {
vkev->event = vkev->events_free.back();
vkev->events_free.pop_back();
}

vkev->has_event = true;

ggml_vk_set_event(compute_ctx, vkev->event);

vkev->tl_semaphore.value++;
compute_ctx->s->signal_semaphores.push_back(vkev->tl_semaphore);
ggml_vk_ctx_end(compute_ctx);

ggml_vk_submit(compute_ctx, {vkev->fence});
VK_SYNC_LOG(ctx->name, "event_record: set event, submit cmd_buf=" << cmd_buf->buf << ", vk_event=" << (VkEvent)vkev->event);
ggml_vk_submit(compute_ctx, {});
ctx->submit_pending = true;
vkev->cmd_buffer = cmd_buf;
vkev->cmd_buffer_use_counter = cmd_buf->use_counter;
ctx->compute_ctx.reset();

VK_SYNC_LOG(ctx->name, "event_record: DONE event=" << event);
}

static void ggml_backend_vk_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {

@@ -14878,11 +14960,16 @@ static void ggml_backend_vk_event_wait(ggml_backend_t backend, ggml_backend_even
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
vk_event *vkev = (vk_event *)event->context;

vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx);
vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx, "event_wait");

ggml_vk_wait_events(compute_ctx, {vkev->event});
ggml_vk_ctx_end(compute_ctx);
ctx->compute_ctx.reset();
if (vkev->has_event) {
// Wait for latest event
VK_SYNC_LOG(ctx->name, "event_wait: recording vkCmdWaitEvents, event=" << event
<< " vk_event=" << (VkEvent)vkev->event
<< " compute_ctx_alive=" << !ctx->compute_ctx.expired());
ggml_vk_wait_events(compute_ctx, { vkev->event });
VK_SYNC_LOG(ctx->name, "event_wait: DONE (cmd recorded, not yet submitted)");
}
}

// TODO: enable async and synchronize

@@ -15672,10 +15759,13 @@ static ggml_backend_event_t ggml_backend_vk_device_event_new(ggml_backend_dev_t
return nullptr;
}

// The event/fence is expected to initially be in the signaled state.
vkev->event = device->device.createEvent({});
vkev->fence = device->device.createFence({vk::FenceCreateFlagBits::eSignaled});
device->device.setEvent(vkev->event);
// No events initially, they get created on demand
vkev->has_event = false;

vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eTimeline, 0 };
vk::SemaphoreCreateInfo ci{};
ci.setPNext(&tci);
vkev->tl_semaphore = { device->device.createSemaphore(ci), 0 };

return new ggml_backend_event {
/* .device = */ dev,

@@ -15689,8 +15779,16 @@ static void ggml_backend_vk_device_event_free(ggml_backend_dev_t dev, ggml_backe

vk_event *vkev = (vk_event *)event->context;

device->device.destroyFence(vkev->fence);
device->device.destroyEvent(vkev->event);
device->device.destroySemaphore(vkev->tl_semaphore.s);
for (auto& event : vkev->events_free) {
device->device.destroyEvent(event);
}
for (auto& event : vkev->events_submitted) {
device->device.destroyEvent(event);
}
if (vkev->has_event) {
device->device.destroyEvent(vkev->event);
}
delete vkev;
delete event;
}

@@ -15701,10 +15799,33 @@ static void ggml_backend_vk_device_event_synchronize(ggml_backend_dev_t dev, ggm
auto device = ggml_vk_get_device(ctx->device);
vk_event *vkev = (vk_event *)event->context;

VK_CHECK(device->device.waitForFences({ vkev->fence }, true, UINT64_MAX), "event_synchronize");
// Finished using current command buffer so we flag for reuse
if (vkev->cmd_buffer) {
vkev->cmd_buffer->in_use = false;
// Only do something if the event has actually been used
if (vkev->has_event) {
VK_SYNC_LOG(device->name, "event_synchronize: BEGIN waiting on timeline semaphore, event=" << event);
vk::Semaphore sem = vkev->tl_semaphore.s;
uint64_t val = vkev->tl_semaphore.value;
vk::SemaphoreWaitInfo swi{vk::SemaphoreWaitFlags{}, sem, val};
VK_CHECK(device->device.waitSemaphores(swi, UINT64_MAX), "event_synchronize");

VK_SYNC_LOG(device->name, "event_synchronize: timeline semaphore wait done, event=" << event);

// Reset and move submitted events
for (auto& event : vkev->events_submitted) {
VK_SYNC_LOG(device->name, "event_synchronize: resetting vkevent=" << event);
device->device.resetEvent(event);
}
vkev->events_free.insert(vkev->events_free.end(), vkev->events_submitted.begin(), vkev->events_submitted.end());
vkev->events_submitted.clear();

// Finished using current command buffer so we flag for reuse
if (vkev->cmd_buffer) {
// Only flag for reuse if it hasn't been reused already
if (vkev->cmd_buffer_use_counter == vkev->cmd_buffer->use_counter) {
vkev->cmd_buffer->in_use = false;
vkev->cmd_buffer->buf.reset();
}
vkev->cmd_buffer = nullptr;
}
}
}
@@ -5,7 +5,7 @@ import os
import sys
import subprocess

HTTPLIB_VERSION = "refs/tags/v0.37.0"
HTTPLIB_VERSION = "refs/tags/v0.37.1"

vendor = {
"https://github.com/nlohmann/json/releases/latest/download/json.hpp": "vendor/nlohmann/json.hpp",

@@ -7,6 +7,7 @@
#include "llama-memory.h"
#include "llama-mmap.h"
#include "llama-model.h"
#include "llama-ext.h"

#include <cinttypes>
#include <cmath>

@@ -341,6 +342,14 @@ llama_context::llama_context(

if (cparams.pipeline_parallel) {
LLAMA_LOG_INFO("%s: pipeline parallelism enabled\n", __func__);

if (!graph_reuse_disable) {
// TODO: figure out a way to make graph reuse work with pipeline parallelism
// ref: https://github.com/ggml-org/llama.cpp/pull/20463
LLAMA_LOG_WARN("%s: graph reuse is currently not compatible with pipeline parallelism - disabling\n", __func__);

graph_reuse_disable = true;
}
}

sched_reserve();

@@ -3129,6 +3138,19 @@ uint32_t llama_get_sampled_probs_count_ith(llama_context * ctx, int32_t i) {
return static_cast<uint32_t>(ctx->get_sampled_probs_count(i));
}

struct ggml_cgraph * llama_graph_reserve(
struct llama_context * ctx,
uint32_t n_tokens,
uint32_t n_seqs,
uint32_t n_outputs) {
auto * memory = ctx->get_memory();
llama_memory_context_ptr mctx;
if (memory) {
mctx = memory->init_full();
}
return ctx->graph_reserve(n_tokens, n_seqs, n_outputs, mctx.get());
}

// llama adapter API

int32_t llama_set_adapters_lora(
src/llama-ext.h (new file, 12 lines)
@@ -0,0 +1,12 @@
#pragma once

#include "llama-context.h"
#include "ggml.h"
#include "stdint.h"

// Reserve a new compute graph. It is valid until the next call to llama_graph_reserve.
LLAMA_API struct ggml_cgraph * llama_graph_reserve(
struct llama_context * ctx,
uint32_t n_tokens,
uint32_t n_seqs,
uint32_t n_outputs);
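A minimal sketch of how this internal extension API is driven, following the call pattern of tests/export-graph-ops.cpp later in this diff (the sizing choices are that test's, not requirements of the API):

// reserve a worst-case prompt-processing graph, inspect it, then a token-generation graph
const uint32_t n_seqs   = llama_n_seq_max(ctx);
const uint32_t n_tokens = std::min(llama_n_ctx(ctx), llama_n_ubatch(ctx));
ggml_cgraph * gf_pp = llama_graph_reserve(ctx, n_tokens, n_seqs, n_tokens);
// ... walk gf_pp here: the next call invalidates it ...
ggml_cgraph * gf_tg = llama_graph_reserve(ctx, n_seqs, n_seqs, n_seqs);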
@@ -1160,13 +1160,13 @@ struct llama_grammar * llama_grammar_init_impl(
// if there is a grammar, parse it
// rules will be empty (default) if there are parse errors
if (!parser.parse(grammar_str) || parser.rules.empty()) {
fprintf(stderr, "%s: failed to parse grammar\n", __func__);
LLAMA_LOG_ERROR("failed to parse grammar\n");
return nullptr;
}

// Ensure that there is a "root" node.
if (parser.symbol_ids.find("root") == parser.symbol_ids.end()) {
fprintf(stderr, "%s: grammar does not contain a 'root' symbol\n", __func__);
// Ensure that the grammar contains the start symbol
if (parser.symbol_ids.find(grammar_root) == parser.symbol_ids.end()) {
LLAMA_LOG_ERROR("grammar does not contain a '%s' symbol\n", grammar_root);
return nullptr;
}

@@ -1195,7 +1195,7 @@ struct llama_grammar * llama_grammar_init_impl(
continue;
}
if (llama_grammar_detect_left_recursion(vec_rules, i, &rules_visited, &rules_in_progress, &rules_may_be_empty)) {
LLAMA_LOG_ERROR("unsupported grammar, left recursion detected for nonterminal at index %zu", i);
LLAMA_LOG_ERROR("unsupported grammar, left recursion detected for nonterminal at index %zu\n", i);
return nullptr;
}
}
@@ -260,6 +260,7 @@ endif()
set(LLAMA_TEST_NAME test-mtmd-c-api)
llama_build_and_test(test-mtmd-c-api.c)
target_link_libraries(${LLAMA_TEST_NAME} PRIVATE mtmd)
unset(LLAMA_TEST_NAME)

# GGUF model data fetcher library for tests that need real model metadata
# Only compile when cpp-httplib has SSL support (CPPHTTPLIB_OPENSSL_SUPPORT)

@@ -284,4 +285,5 @@ target_link_libraries(${TEST_TARGET} PRIVATE llama)
llama_build_and_test(test-alloc.cpp)
target_include_directories(test-alloc PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)

llama_build(export-graph-ops.cpp)
target_include_directories(export-graph-ops PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)
tests/export-graph-ops.cpp (new file, 169 lines)
@@ -0,0 +1,169 @@
#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"
#include "../src/llama-ext.h"
#include "ggml.h"

#include <array>
#include <vector>
#include <set>
#include <fstream>
#include <iostream>

struct input_tensor {
ggml_type type;
std::array<int64_t, 4> ne;
std::array<size_t, 4> nb;

input_tensor(ggml_type type, int64_t * ne, size_t * nb): type(type) {
memcpy(this->ne.data(), ne, 4 * sizeof(int64_t));
memcpy(this->nb.data(), nb, 4 * sizeof(size_t));
}

bool operator<(const input_tensor &b) const {
return std::tie(type, ne, nb) <
std::tie(b.type, b.ne, b.nb);
}

void serialize(std::ostream& out) const {
out << type << ' ';
for (size_t i = 0; i < 4; i++) {
out << ne[i] << ' ';
}
for (size_t i = 0; i < 4; i++) {
out << nb[i] << ' ';
}
}
};

struct test_object {
ggml_op op;
ggml_type type;
std::array<int64_t, 4> ne;
std::vector<int32_t> op_params;
std::vector<input_tensor> sources;
std::string name;

void serialize(std::ostream& out) const {
out << op << ' ' << type << ' ';
for (size_t i = 0; i < 4; i++) {
out << ne[i] << ' ';
}

out << op_params.size() << ' ';
for (size_t i = 0; i < op_params.size(); i++) {
out << op_params[i] << ' ';
}

out << sources.size() << ' ';
for (size_t s = 0; s < sources.size(); s++) {
sources[s].serialize(out);
}

if (!name.empty()) {
out << name;
} else {
out << '-';
}

out << '\n';
}

bool operator<(const test_object &b) const {
return std::tie(op, type, ne, op_params, sources) <
std::tie(b.op, b.type, b.ne, b.op_params, b.sources);
}
};

static void extract_graph_ops(ggml_cgraph * cgraph, const char * label, std::set<test_object> & tests) {
int n_nodes = ggml_graph_n_nodes(cgraph);
int n_skipped = 0;
int n_before = (int) tests.size();
for (int i = 0; i < n_nodes; i++) {
ggml_tensor * node = ggml_graph_node(cgraph, i);

if (node->op == GGML_OP_NONE || node->op == GGML_OP_VIEW || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE) {
n_skipped++;
continue;
}

test_object test;

test.op = node->op;
test.type = node->type;
memcpy(&test.ne, node->ne, 4 * sizeof(int64_t));

test.op_params.resize(GGML_MAX_OP_PARAMS / sizeof(int32_t));
memcpy(test.op_params.data(), node->op_params, GGML_MAX_OP_PARAMS);

for (size_t s = 0; s < GGML_MAX_SRC; s++) {
if (node->src[s] == nullptr) {
break;
}

test.sources.emplace_back(node->src[s]->type, node->src[s]->ne, node->src[s]->nb);
}

test.name = node->name;
tests.insert(test);
}

int n_new = (int) tests.size() - n_before;
LOG_INF("%s: %d unique ops, %d total nodes, %d skipped (view ops)\n",
label, n_new, n_nodes, n_skipped);
}

int main(int argc, char ** argv) {
common_params params;
params.out_file = "tests.txt";

if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_GRAPH_OPS)) {
return 1;
}

common_init();

// Load CPU-only
ggml_backend_dev_t cpu_device = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
params.devices = { cpu_device, nullptr };
params.fit_params = false;
params.n_gpu_layers = 0;

params.warmup = false;

auto init_result = common_init_from_params(params);

llama_context * ctx = init_result->context();

const uint32_t n_seqs = llama_n_seq_max(ctx);
const uint32_t n_tokens = std::min(llama_n_ctx(ctx), llama_n_ubatch(ctx));

std::set<test_object> tests;

auto * gf_pp = llama_graph_reserve(ctx, n_tokens, n_seqs, n_tokens);
if (!gf_pp) {
throw std::runtime_error("failed to reserve prompt processing graph");
}
extract_graph_ops(gf_pp, "pp", tests);

auto * gf_tg = llama_graph_reserve(ctx, n_seqs, n_seqs, n_seqs);
if (!gf_tg) {
throw std::runtime_error("failed to reserve token generation graph");
}
extract_graph_ops(gf_tg, "tg", tests);

LOG_INF("%d unique ops total\n", (int) tests.size());

std::ofstream f(params.out_file);

if (!f.is_open()) {
throw std::runtime_error("Unable to open output file");
}

for (const auto& test : tests) {
test.serialize(f);
}

return 0;
}
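For reference, each record emitted by the two serialize methods above (and parsed back by make_test_cases_from_file in the test-backend-ops hunk below) is one whitespace-separated line of the form:

<op> <type> <ne0> <ne1> <ne2> <ne3> <n_params> <p0> ... <n_src> (<src_type> <src_ne0..ne3> <src_nb0..nb3>)* <name|->

where op and type are raw enum values (so a dump is only meaningful against the same ggml build), and '-' stands in for an empty node name.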
@@ -31,10 +31,12 @@
#include <cstring>
#include <ctime>
#include <future>
#include <fstream>
#include <memory>
#include <random>
#include <regex>
#include <set>
#include <sstream>
#include <string>
#include <string_view>
#include <thread>
@@ -6648,6 +6650,236 @@ struct test_diag : public test_case {
}
};

// Deserializable generic test case
struct input_tensor {
ggml_type type;
std::array<int64_t, 4> ne;
std::array<size_t, 4> nb; // strides (0 = use default contiguous strides)
};

static bool is_non_contiguous(const input_tensor & src) {
if (src.nb[0] == 0) {
return false;
}
const size_t default_nb0 = ggml_type_size(src.type);
const size_t default_nb1 = default_nb0 * (src.ne[0] / ggml_blck_size(src.type));
const size_t default_nb2 = default_nb1 * src.ne[1];
const size_t default_nb3 = default_nb2 * src.ne[2];
return src.nb[0] != default_nb0 ||
src.nb[1] != default_nb1 ||
src.nb[2] != default_nb2 ||
src.nb[3] != default_nb3;
}

static std::string var_to_str(const std::vector<input_tensor>& sources) {
std::ostringstream oss;
bool first = true;
for (const auto& src : sources) {
if (!first) oss << ",";
oss << ggml_type_name(src.type) << "[" << src.ne[0] << "," << src.ne[1] << "," << src.ne[2] << "," << src.ne[3] << "]";
if (is_non_contiguous(src)) {
oss << "nb[" << src.nb[0] << "," << src.nb[1] << "," << src.nb[2] << "," << src.nb[3] << "]";
}
first = false;
}
return oss.str();
}

static std::string var_to_str(const std::array<int32_t, GGML_MAX_OP_PARAMS / sizeof(int32_t)>& params) {
std::ostringstream oss;
oss << "[";
bool first = true;
for (size_t i = 0; i < params.size(); ++i) {
if (params[i] != 0) {
if (!first) oss << ",";
oss << i << ":" << params[i];
first = false;
}
}
oss << "]";
return oss.str();
}

struct test_generic_op : public test_case {
const ggml_op op;
const ggml_type type;
const std::array<int64_t, 4> ne;
const std::array<int32_t, GGML_MAX_OP_PARAMS / sizeof(int32_t)> op_params;

const std::vector<input_tensor> sources;
const std::string name;

std::string vars() override {
if (name.empty()) {
return VARS_TO_STR4(type, ne, op_params, sources);
}

return VARS_TO_STR5(name, type, ne, op_params, sources);
}

test_generic_op(ggml_op op, ggml_type type, std::array<int64_t, 4> ne,
std::array<int32_t, GGML_MAX_OP_PARAMS / sizeof(int32_t)> op_params,
std::vector<input_tensor> sources, std::string name = "")
: op(op), type(type), ne(ne), op_params(op_params), sources(sources), name(std::move(name)) {}

ggml_tensor * build_graph(ggml_context * ctx) override {
const size_t source_count = std::min(sources.size(), (size_t)GGML_MAX_SRC);

std::array<ggml_tensor *, GGML_MAX_SRC> source_tensors;
for (size_t i = 0; i < source_count; ++i) {
const input_tensor& src = sources[i];

if (is_non_contiguous(src)) {
size_t total_size;
const size_t blck_size = ggml_blck_size(src.type);
if (blck_size == 1) {
total_size = ggml_type_size(src.type);
for (int d = 0; d < 4; d++) {
total_size += (src.ne[d] - 1) * src.nb[d];
}
} else {
total_size = src.ne[0] * src.nb[0] / blck_size;
for (int d = 1; d < 4; d++) {
total_size += (src.ne[d] - 1) * src.nb[d];
}
}

// Convert bytes to elements, padded to block size for quantized types
const size_t type_size = ggml_type_size(src.type);
size_t backing_elements = (total_size * blck_size + type_size - 1) / type_size;
backing_elements = ((backing_elements + blck_size - 1) / blck_size) * blck_size;
ggml_tensor * backing = ggml_new_tensor_1d(ctx, src.type, backing_elements);
source_tensors[i] = ggml_view_4d(ctx, backing,
src.ne[0], src.ne[1], src.ne[2], src.ne[3],
src.nb[1], src.nb[2], src.nb[3], 0);
// nb[0] does not get set by view_4d, so set it manually
source_tensors[i]->nb[0] = src.nb[0];
} else {
source_tensors[i] = ggml_new_tensor_4d(ctx, src.type, src.ne[0], src.ne[1], src.ne[2], src.ne[3]);
}
}

// Ops with an inplace flag create a view of src[0] as their output.
bool inplace = false;
if (op == GGML_OP_SET || op == GGML_OP_ACC) {
inplace = op_params[4] != 0;
} else if (op == GGML_OP_ADD_REL_POS) {
inplace = op_params[0] != 0;
}

ggml_tensor * out;
if (inplace && source_count > 0) {
out = ggml_view_tensor(ctx, source_tensors[0]);
} else {
out = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
}
out->op = op;
for (size_t i = 0; i < source_count; ++i) {
out->src[i] = source_tensors[i];
}

memcpy(out->op_params, op_params.data(), GGML_MAX_OP_PARAMS);
ggml_set_name(out, "out");

return out;
}

double max_nmse_err() override {
switch (op) {
case GGML_OP_MUL_MAT:
case GGML_OP_MUL_MAT_ID:
case GGML_OP_OUT_PROD:
case GGML_OP_CONV_TRANSPOSE_2D:
case GGML_OP_IM2COL:
case GGML_OP_CONV_2D:
case GGML_OP_CONV_3D:
case GGML_OP_SET_ROWS:
case GGML_OP_CPY:
return 5e-4;
case GGML_OP_SOFT_MAX:
return 1e-6;
case GGML_OP_RWKV_WKV7:
return 5e-3;
case GGML_OP_FLASH_ATTN_EXT:
{
// Scale error with kv length to account for accumulating floating point error
const int64_t kv = sources[1].ne[1];
return 5e-4 * std::max(1.0, kv / 20000.0);
}
default:
return 1e-7;
}
}

void initialize_tensors(ggml_context * ctx) override {
ggml_tensor * out = ggml_get_tensor(ctx, "out");

std::random_device rd;
std::default_random_engine rng(rd());

for (size_t i = 0; i < sources.size() && i < GGML_MAX_SRC; i++) {
ggml_tensor * t = out->src[i];
if (!t) {
break;
}

// FLASH_ATTN_EXT: src[3] is the KQ mask
if (op == GGML_OP_FLASH_ATTN_EXT && i == 3) {
init_tensor_kq_mask(t);
continue;
}

if (t->type == GGML_TYPE_I32 || t->type == GGML_TYPE_I64) {
if (op == GGML_OP_GET_ROWS || op == GGML_OP_GET_ROWS_BACK) {
const int64_t num_rows = sources[0].ne[1];
const int64_t nels = ggml_nelements(t);
std::vector<int32_t> data(nels);
std::uniform_int_distribution<int32_t> dist(0, num_rows - 1);
for (int64_t i = 0; i < nels; i++) {
data[i] = dist(rng);
}
ggml_backend_tensor_set(t, data.data(), 0, nels * sizeof(int32_t));
} else if (op == GGML_OP_SET_ROWS) {
init_set_rows_row_ids(t, ne[1]);
} else if (op == GGML_OP_ROPE) {
const int mode = op_params[2];
const int64_t nels = (mode & GGML_ROPE_TYPE_MROPE) ? ne[2] * 4 : ne[2];
std::vector<int32_t> data(nels);
std::uniform_int_distribution<int32_t> dist(0, ne[2] - 1);
for (int64_t i = 0; i < nels; i++) {
data[i] = dist(rng);
}
ggml_backend_tensor_set(t, data.data(), 0, nels * sizeof(int32_t));
} else if (op == GGML_OP_MUL_MAT_ID || op == GGML_OP_ADD_ID) {
const int64_t n_expert = (op == GGML_OP_MUL_MAT_ID) ? sources[0].ne[2] : sources[1].ne[1];
for (int64_t r = 0; r < ggml_nrows(t); r++) {
std::vector<int32_t> data(t->ne[0]);
for (int32_t i = 0; i < t->ne[0]; i++) {
data[i] = i % n_expert;
}
std::shuffle(data.begin(), data.end(), rng);
ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(int32_t));
}
} else if (op == GGML_OP_SSM_SCAN) {
for (int64_t r = 0; r < ggml_nrows(t); r++) {
std::vector<int32_t> data(t->ne[0]);
for (int32_t i = 0; i < t->ne[0]; i++) {
data[i] = i;
}
std::shuffle(data.begin(), data.end(), rng);
ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(int32_t));
}
} else {
init_tensor_uniform(t);
}
} else {
init_tensor_uniform(t);
}
}
}
};

enum llm_norm_type {
LLM_NORM,
@@ -8751,8 +8983,72 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
|
||||
return test_cases;
|
||||
}
|
||||
|
||||
static std::vector<std::unique_ptr<test_case>> make_test_cases_from_file(const char * path) {
|
||||
std::ifstream f(path);
|
||||
|
||||
if (!f.is_open()) {
|
||||
throw std::runtime_error("Unable to read test file");
|
||||
}
|
||||
|
||||
std::vector<std::unique_ptr<test_case>> test_cases;
|
||||
|
||||
std::string line;
|
||||
|
||||
while (std::getline(f, line)) {
|
||||
std::istringstream iss(line);
|
||||
|
||||
ggml_op op;
|
||||
ggml_type type;
|
||||
std::array<int64_t, 4> ne;
|
||||
std::array<int32_t, GGML_MAX_OP_PARAMS / sizeof(int32_t)> op_params = {};
|
||||
std::string name;
|
||||
uint64_t tmp;
|
||||
|
||||
iss >> tmp;
|
||||
op = (ggml_op)tmp;
|
||||
iss >> tmp;
|
||||
type = (ggml_type)tmp;
|
||||
|
||||
for (size_t i = 0; i < 4; i++) {
|
||||
iss >> ne[i];
|
||||
}
|
||||
|
||||
iss >> tmp;
|
||||
for (size_t i = 0; i < tmp && i < op_params.size(); i++) {
|
||||
iss >> op_params[i];
|
||||
}
|
||||
|
||||
iss >> tmp;
|
||||
|
||||
size_t num_src = std::min((uint64_t)GGML_MAX_SRC, tmp);
|
||||
std::vector<input_tensor> sources(num_src);
|
||||
for (size_t i = 0; i < num_src; i++) {
|
||||
input_tensor& src = sources[i];
|
||||
iss >> tmp;
|
||||
src.type = (ggml_type)tmp;
|
||||
|
||||
for (size_t i = 0; i < 4; i++) {
|
||||
iss >> src.ne[i];
|
||||
}
|
||||
for (size_t i = 0; i < 4; i++) {
|
||||
iss >> src.nb[i];
|
||||
}
|
||||
}
|
||||
|
||||
iss >> name;
|
||||
|
||||
if (name.length() == 1 && name[0] == '-') {
|
||||
name = "";
|
||||
}
|
||||
|
||||
test_cases.emplace_back(new test_generic_op(op, type, ne, op_params, sources, std::move(name)));
|
||||
}
|
||||
|
||||
return test_cases;
|
||||
}
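Taken together, the reads above imply a simple whitespace-separated line format:

    <op> <type> <ne0> <ne1> <ne2> <ne3> <n_params> <params...> <n_src> [<src_type> <ne0..ne3> <nb0..nb3>]*n_src <name|->

A hypothetical F32 line (op id, dims, and strides purely illustrative, not taken from a real llama-export-graph-ops dump; enum ids depend on the ggml version):

    23 0 32 64 1 1 1 0 2 0 128 32 1 1 4 512 16384 16384 0 128 64 1 1 4 512 32768 32768 -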

static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_names_filter, const char * params_filter,
                         printer * output_printer) {
                         printer * output_printer, const char * test_file_path) {
    auto filter_test_cases = [](std::vector<std::unique_ptr<test_case>> & test_cases, const char * params_filter) {
        if (params_filter == nullptr) {
            return;
@@ -8770,9 +9066,26 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
        }
    };

    std::vector<std::unique_ptr<test_case>> test_cases;

    if (test_file_path == nullptr) {
        switch (mode) {
            case MODE_TEST:
            case MODE_GRAD:
            case MODE_SUPPORT:
                test_cases = make_test_cases_eval();
                break;
            case MODE_PERF:
                test_cases = make_test_cases_perf();
                break;
        }
    } else {
        test_cases = make_test_cases_from_file(test_file_path);
    }

    filter_test_cases(test_cases, params_filter);

    if (mode == MODE_TEST) {
        auto test_cases = make_test_cases_eval();
        filter_test_cases(test_cases, params_filter);
        ggml_backend_t backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, NULL);
        if (backend_cpu == NULL) {
            test_operation_info info("", "", "CPU");
@@ -8812,8 +9125,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
    }

    if (mode == MODE_GRAD) {
        auto test_cases = make_test_cases_eval();
        filter_test_cases(test_cases, params_filter);
        size_t n_ok = 0;
        for (auto & test : test_cases) {
            if (test->eval_grad(backend, op_names_filter, output_printer)) {
@@ -8826,8 +9137,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
    }

    if (mode == MODE_PERF) {
        auto test_cases = make_test_cases_perf();
        filter_test_cases(test_cases, params_filter);
        for (auto & test : test_cases) {
            test->eval_perf(backend, op_names_filter, output_printer);
        }
@@ -8835,9 +9144,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
    }

    if (mode == MODE_SUPPORT) {
        auto test_cases = make_test_cases_eval();
        filter_test_cases(test_cases, params_filter);

        // Filter out fusion cases
        test_cases.erase(
            std::remove_if(test_cases.begin(), test_cases.end(), [](const std::unique_ptr<test_case> & tc) {
@@ -8956,7 +9262,8 @@ static void show_test_coverage() {
}

static void usage(char ** argv) {
    printf("Usage: %s [mode] [-o <op,..>] [-b <backend>] [-p <params regex>] [--output <console|sql|csv>] [--list-ops] [--show-coverage]\n", argv[0]);
    printf("Usage: %s [mode] [-o <op,..>] [-b <backend>] [-p <params regex>] [--output <console|sql|csv>] [--list-ops]", argv[0]);
    printf(" [--show-coverage] [--test-file <path>]\n");
    printf(" valid modes:\n");
    printf(" - test (default, compare with CPU backend for correctness)\n");
    printf(" - grad (compare gradients from backpropagation with method of finite differences)\n");
@@ -8967,6 +9274,7 @@ static void usage(char ** argv) {
    printf(" --output specifies output format (default: console, options: console, sql, csv)\n");
    printf(" --list-ops lists all available GGML operations\n");
    printf(" --show-coverage shows test coverage\n");
    printf(" --test-file reads test operators from a test file generated by llama-export-graph-ops\n");
}
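With the flag wired into main() below, a replay run could look like this (backend and file names illustrative):

    ./test-backend-ops test -b Vulkan0 --test-file graph-ops.txt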

int main(int argc, char ** argv) {
@@ -8975,6 +9283,7 @@ int main(int argc, char ** argv) {
    const char * op_names_filter = nullptr;
    const char * backend_filter = nullptr;
    const char * params_filter = nullptr;
    const char * test_file_path = nullptr;

    for (int i = 1; i < argc; i++) {
        if (strcmp(argv[i], "test") == 0) {
@@ -9022,6 +9331,13 @@ int main(int argc, char ** argv) {
        } else if (strcmp(argv[i], "--show-coverage") == 0) {
            show_test_coverage();
            return 0;
        } else if (strcmp(argv[i], "--test-file") == 0) {
            if (i + 1 < argc) {
                test_file_path = argv[++i];
            } else {
                usage(argv);
                return 1;
            }
        } else {
            usage(argv);
            return 1;
@@ -9074,7 +9390,7 @@ int main(int argc, char ** argv) {
            false, "", ggml_backend_dev_description(dev),
            total / 1024 / 1024, free / 1024 / 1024, true));

        bool ok = test_backend(backend, mode, op_names_filter, params_filter, output_printer.get());
        bool ok = test_backend(backend, mode, op_names_filter, params_filter, output_printer.get(), test_file_path);

        if (ok) {
            n_ok++;

@@ -15,8 +15,12 @@

using json = nlohmann::ordered_json;

static llama_grammar * build_grammar_with_root(const std::string & grammar_str, const char * grammar_root) {
    return llama_grammar_init_impl(nullptr, grammar_str.c_str(), grammar_root, false, nullptr, 0, nullptr, 0);
}

static llama_grammar * build_grammar(const std::string & grammar_str) {
    return llama_grammar_init_impl(nullptr, grammar_str.c_str(), "root", false, nullptr, 0, nullptr, 0);
    return build_grammar_with_root(grammar_str, "root");
}

static bool test_build_grammar_fails(const std::string & grammar_str) {
@@ -860,6 +864,36 @@ static void test_failure_left_recursion() {
    fprintf(stderr, " ✅︎ Passed\n");
}

static void test_failure_missing_root_symbol() {
    fprintf(stderr, "⚫ Testing missing root symbol:\n");

    const std::string grammar_str = R"""(
        root ::= "foobar"
    )""";

    llama_grammar * failure_result = build_grammar_with_root(grammar_str, "nonexistent");
    assert(failure_result == nullptr);

    fprintf(stderr, " ✅︎ Passed\n");
}

static void test_custom_root_symbol_check() {
    fprintf(stderr, "⚫ Testing custom root symbol check:\n");

    const std::string custom_root_grammar_str = R"""(
        foobar ::= "foobar"
    )""";

    llama_grammar * failure_result = build_grammar_with_root(custom_root_grammar_str, "root");
    assert(failure_result == nullptr);

    llama_grammar * success_result = build_grammar_with_root(custom_root_grammar_str, "foobar");
    assert(success_result != nullptr);
    llama_grammar_free_impl(success_result);

    fprintf(stderr, " ✅︎ Passed\n");
}
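The contract these two tests pin down: llama_grammar_init_impl returns nullptr when the requested root symbol is not defined in the grammar, so a nullptr check is the entire error protocol. A minimal caller-side sketch (mirrors the test helpers above; not part of the patch):

    llama_grammar * g = build_grammar_with_root(R"""(start ::= "ok")""", "start");
    if (g == nullptr) {
        // parse failed or the root symbol "start" is missing from the grammar
        fprintf(stderr, "grammar rejected\n");
    } else {
        llama_grammar_free_impl(g);
    }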

static void test_json_schema() {
    // Note that this is similar to the regular grammar tests,
    // but we convert each json schema to a grammar before parsing.
@@ -1433,6 +1467,8 @@ int main() {
    test_failure_missing_root();
    test_failure_missing_reference();
    test_failure_left_recursion();
    test_failure_missing_root_symbol();
    test_custom_root_symbol_check();
    test_json_schema();
    fprintf(stdout, "All tests passed.\n");
    return 0;

@@ -11,6 +11,7 @@ sys.path.insert(0, str(path))

import datetime
from utils import *
from typing import Literal

server: ServerProcess

@@ -23,24 +24,24 @@ def create_server():


@pytest.mark.parametrize("tools", [None, [], [TEST_TOOL]])
@pytest.mark.parametrize("template_name,reasoning_budget,expected_end", [
    ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", None, "<think>\n"),
    ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", -1, "<think>\n"),
    ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", 0, "<think>\n</think>"),
@pytest.mark.parametrize("template_name,reasoning,expected_end", [
    ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", "on", "<think>\n"),
    ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", "auto", "<think>\n"),
    ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", "off", "<think>\n</think>"),

    ("Qwen-Qwen3-0.6B", -1, "<|im_start|>assistant\n"),
    ("Qwen-Qwen3-0.6B", 0, "<|im_start|>assistant\n<think>\n\n</think>\n\n"),
    ("Qwen-Qwen3-0.6B", "auto", "<|im_start|>assistant\n"),
    ("Qwen-Qwen3-0.6B", "off", "<|im_start|>assistant\n<think>\n\n</think>\n\n"),

    ("Qwen-QwQ-32B", -1, "<|im_start|>assistant\n<think>\n"),
    ("Qwen-QwQ-32B", 0, "<|im_start|>assistant\n<think>\n</think>"),
    ("Qwen-QwQ-32B", "auto", "<|im_start|>assistant\n<think>\n"),
    ("Qwen-QwQ-32B", "off", "<|im_start|>assistant\n<think>\n</think>"),

    ("CohereForAI-c4ai-command-r7b-12-2024-tool_use", -1, "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"),
    ("CohereForAI-c4ai-command-r7b-12-2024-tool_use", 0, "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|><|START_THINKING|><|END_THINKING|>"),
    ("CohereForAI-c4ai-command-r7b-12-2024-tool_use", "auto", "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"),
    ("CohereForAI-c4ai-command-r7b-12-2024-tool_use", "off", "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|><|START_THINKING|><|END_THINKING|>"),
])
def test_reasoning_budget(template_name: str, reasoning_budget: int | None, expected_end: str, tools: list[dict]):
def test_reasoning(template_name: str, reasoning: Literal['on', 'off', 'auto'] | None, expected_end: str, tools: list[dict]):
    global server
    server.jinja = True
    server.reasoning_budget = reasoning_budget
    server.reasoning = reasoning
    server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
    server.start()


@@ -95,7 +95,7 @@ class ServerProcess:
    no_webui: bool | None = None
    jinja: bool | None = None
    reasoning_format: Literal['deepseek', 'none', 'nothink'] | None = None
    reasoning_budget: int | None = None
    reasoning: Literal['on', 'off', 'auto'] | None = None
    chat_template: str | None = None
    chat_template_file: str | None = None
    server_path: str | None = None
@@ -225,8 +225,8 @@ class ServerProcess:
            server_args.append("--no-jinja")
        if self.reasoning_format is not None:
            server_args.extend(("--reasoning-format", self.reasoning_format))
        if self.reasoning_budget is not None:
            server_args.extend(("--reasoning-budget", self.reasoning_budget))
        if self.reasoning is not None:
            server_args.extend(("--reasoning", self.reasoning))
        if self.chat_template:
            server_args.extend(["--chat-template", self.chat_template])
        if self.chat_template_file:
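Note: the integer --reasoning-budget option gives way to the string-valued --reasoning option here; judging by the parametrize table above, -1 (unlimited budget) maps to "auto" and 0 (no budget) to "off", with "on" as a new explicit setting.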

vendor/cpp-httplib/httplib.cpp (vendored)

@@ -4424,7 +4424,8 @@ get_range_offset_and_length(Range r, size_t content_length) {
  assert(r.first <= r.second &&
         r.second < static_cast<ssize_t>(content_length));
  (void)(content_length);
  return std::make_pair(r.first, static_cast<size_t>(r.second - r.first) + 1);
  return std::make_pair(static_cast<size_t>(r.first),
                        static_cast<size_t>(r.second - r.first) + 1);
}

std::string make_content_range_header_field(
@@ -8616,11 +8617,17 @@ ClientImpl::open_stream(const std::string &method, const std::string &path,
  handle.body_reader_.stream = handle.stream_;
  handle.body_reader_.payload_max_length = payload_max_length_;

  auto content_length_str = handle.response->get_header_value("Content-Length");
  if (!content_length_str.empty()) {
  if (handle.response->has_header("Content-Length")) {
    bool is_invalid = false;
    auto content_length = detail::get_header_value_u64(
        handle.response->headers, "Content-Length", 0, 0, is_invalid);
    if (is_invalid) {
      handle.error = Error::Read;
      handle.response.reset();
      return handle;
    }
    handle.body_reader_.has_content_length = true;
    handle.body_reader_.content_length =
        static_cast<size_t>(std::stoull(content_length_str));
    handle.body_reader_.content_length = content_length;
  }

  auto transfer_encoding =
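The motivation for the hunk above: std::stoull throws std::invalid_argument or std::out_of_range, so a malformed Content-Length could previously surface as an exception from open_stream; detail::get_header_value_u64 instead reports failure through its is_invalid out-flag. A standalone sketch of the same non-throwing pattern (hypothetical helper, not from the library):

    #include <cerrno>
    #include <cstdint>
    #include <cstdlib>
    #include <string>

    // Parse a Content-Length value without throwing: reject anything that is
    // not a plain run of digits, then let strtoull flag overflow via errno.
    static bool parse_content_length(const std::string & v, uint64_t & out) {
        if (v.empty() || v.find_first_not_of("0123456789") != std::string::npos) {
            return false;
        }
        errno = 0;
        out = std::strtoull(v.c_str(), nullptr, 10);
        return errno == 0;
    }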

vendor/cpp-httplib/httplib.h (vendored)

@@ -8,28 +8,8 @@
#ifndef CPPHTTPLIB_HTTPLIB_H
#define CPPHTTPLIB_HTTPLIB_H

#define CPPHTTPLIB_VERSION "0.37.0"
#define CPPHTTPLIB_VERSION_NUM "0x002500"

/*
 * Platform compatibility check
 */

#if defined(_WIN32) && !defined(_WIN64)
#if defined(_MSC_VER)
#pragma message( \
    "cpp-httplib doesn't support 32-bit Windows. Please use a 64-bit compiler.")
#else
#warning \
    "cpp-httplib doesn't support 32-bit Windows. Please use a 64-bit compiler."
#endif
#elif defined(__SIZEOF_POINTER__) && __SIZEOF_POINTER__ < 8
#warning \
    "cpp-httplib doesn't support 32-bit platforms. Please use a 64-bit compiler."
#elif defined(__SIZEOF_SIZE_T__) && __SIZEOF_SIZE_T__ < 8
#warning \
    "cpp-httplib doesn't support platforms where size_t is less than 64 bits."
#endif
#define CPPHTTPLIB_VERSION "0.37.1"
#define CPPHTTPLIB_VERSION_NUM "0x002501"

#ifdef _WIN32
#if defined(_WIN32_WINNT) && _WIN32_WINNT < 0x0A00
@@ -2797,7 +2777,7 @@ inline size_t get_header_value_u64(const Headers &headers,
  std::advance(it, static_cast<ssize_t>(id));
  if (it != rng.second) {
    if (is_numeric(it->second)) {
      return std::strtoull(it->second.data(), nullptr, 10);
      return static_cast<size_t>(std::strtoull(it->second.data(), nullptr, 10));
    } else {
      is_invalid_value = true;
    }