Compare commits

..

8 Commits
b8180 ... b8188

Author SHA1 Message Date
Masashi Yoshimura
36a7a6589c ggml-webgpu: Support non-contiguous src0 and overlapping src0/src1 in binary ops (#19850)
* ggml-webgpu: Add binary op support for overlapping and non-contiguous.

* Add newline to binary.wgsl

* Append the test of binary op for src overlapping to test_bin_bcast.

* Remove unnecessary newline.
2026-03-02 07:59:53 -08:00
Ruben Ortlam
feefb92836 vulkan: tune MMVQ for Intel Windows (#19988) 2026-03-02 15:58:25 +01:00
Adrien Gallouët
ec88c3ceea scripts : improve get-wikitext-2.sh (#19952)
* scripts : improve get-wikitext-2.sh

Switch to sh, add curl fallback, and avoid redundant downloads

Signed-off-by: Adrien Gallouët <adrien@gallouet.fr>

* fix indent

Signed-off-by: Adrien Gallouët <angt@huggingface.co>

---------

Signed-off-by: Adrien Gallouët <adrien@gallouet.fr>
Signed-off-by: Adrien Gallouët <angt@huggingface.co>
2026-03-02 15:40:49 +01:00
Aaron Teo
2afcdb9777 ggml-cpu: optimise s390x multiply extend instructions (#20032) 2026-03-02 16:23:56 +08:00
Ruben Ortlam
319146247e vulkan: improve partial offloading performance on AMD (#19976)
* vulkan: fix and enable cpy_tensor_async function

* use transfer_queue for async transfers on AMD, synchronize with timeline semaphore

* update offload_op logic

* fix missing transfer submission

* disable async transfer queue on AMD GCN

* revert op batch size change

* fix cpy_tensor_async checks
2026-03-01 17:32:14 +01:00
oobabooga
66d65ec29b cuda: cap grid.y at 65535 in non-contiguous dequantize/convert kernels (#19999) 2026-03-01 13:40:22 +08:00
Dmitry Atamanov
05728db18e vendors : update miniaudio library to 0.11.24 (#19914) 2026-02-28 16:10:01 +01:00
Adrien Gallouët
4720819d45 vendor : update cpp-httplib to 0.35.0 (#19969)
Signed-off-by: Adrien Gallouët <adrien@gallouet.fr>
2026-02-28 13:53:56 +01:00
13 changed files with 889 additions and 503 deletions

View File

@@ -181,11 +181,11 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
const int8x16_t v_yh = vec_xl(QK8_0/2, y[ib].qs);
const int16x8_t v_xylso = vec_mulo(v_xls, v_yl);
const int16x8_t v_xylse = vec_mule(v_xls, v_yl);
const int16x8_t v_xyl = vec_meadd(v_xls, v_yl, v_xylso);
const int16x8_t v_xyhso = vec_mulo(v_xhs, v_yh);
const int16x8_t v_xyhse = vec_mule(v_xhs, v_yh);
const int16x8_t v_xyh = vec_meadd(v_xhs, v_yh, v_xyhso);
int16x8_t v_xy_ = v_xylso + v_xylse + v_xyhso + v_xyhse; v_xy_ += vec_reve(v_xy_);
int16x8_t v_xy_ = v_xyl + v_xyh; v_xy_ += vec_reve(v_xy_);
const float32x4_t v_xy = vec_float(vec_unpackh(v_xy_));
const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
@@ -890,8 +890,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
const int16x8_t v_minsh = (int16x8_t)vec_unpackh((uint8x16_t)v_mins8);
const int32x4_t v_minso = vec_mulo(v_ysums, v_minsh);
const int32x4_t v_minse = vec_mule(v_ysums, v_minsh);
const int32x4_t v_mins = v_minso + v_minse;
const int32x4_t v_mins = vec_meadd(v_ysums, v_minsh, v_minso);
sumf -= dmin * (v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3]);
const uint8_t * scales = (const uint8_t *)utmp;
@@ -1004,8 +1003,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
const int16x8_t v_minsh = (int16x8_t)vec_unpackh(v_mins8);
const int32x4_t v_minsho = vec_mulo(v_ysums, v_minsh);
const int32x4_t v_minshe = vec_mule(v_ysums, v_minsh);
const int32x4_t v_mins = vec_add(v_minsho, v_minshe);
const int32x4_t v_mins = vec_meadd(v_ysums, v_minsh, v_minsho);
const int32_t mins = vec_hsum_i32x4(v_mins);
const uint8_t * scales = (const uint8_t *)utmp;
@@ -1110,10 +1108,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
const int16x8_t v_scaleh = vec_unpackl(v_scale);
const int32x4_t v_minslo = vec_mulo(v_ysumsl, v_scalel);
const int32x4_t v_minsle = vec_mule(v_ysumsl, v_scalel);
const int32x4_t v_minsl = vec_meadd(v_ysumsl, v_scalel, v_minslo);
const int32x4_t v_minsho = vec_mulo(v_ysumsh, v_scaleh);
const int32x4_t v_minshe = vec_mule(v_ysumsh, v_scaleh);
const int32x4_t v_mins = v_minslo + v_minsle + v_minsho + v_minshe;
const int32x4_t v_minsh = vec_meadd(v_ysumsh, v_scaleh, v_minsho);
const int32x4_t v_mins = vec_add(v_minsl, v_minsh);
const int32_t mins = vec_hsum_i32x4(v_mins);

View File

@@ -16,27 +16,27 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __
return;
}
const int64_t i01 = blockIdx.y;
for (int64_t i01 = blockIdx.y; i01 < ne01; i01 += gridDim.y) {
for (int64_t i0203 = blockIdx.z; i0203 < ne0203; i0203 += gridDim.z) {
const uint2 dm = fast_div_modulo((uint32_t)i0203, ne02);
const int64_t i02 = dm.y;
const int64_t i03 = dm.x;
for (int64_t i0203 = blockIdx.z; i0203 < ne0203; i0203 += gridDim.z) {
const uint2 dm = fast_div_modulo((uint32_t)i0203, ne02);
const int64_t i02 = dm.y;
const int64_t i03 = dm.x;
const int64_t ibx0 = i03*s03 + i02*s02 + i01*s01;
const int64_t ibx0 = i03*s03 + i02*s02 + i01*s01;
const int64_t ib = ibx0 + i00/qk; // block index
const int64_t iqs = (i00%qk)/qr; // quant index
const int64_t iybs = i00 - i00%qk; // y block start index
const int64_t y_offset = qr == 1 ? 1 : qk/2;
const int64_t ib = ibx0 + i00/qk; // block index
const int64_t iqs = (i00%qk)/qr; // quant index
const int64_t iybs = i00 - i00%qk; // y block start index
const int64_t y_offset = qr == 1 ? 1 : qk/2;
// dequantize
float2 v;
dequantize_kernel(vx, ib, iqs, v);
// dequantize
float2 v;
dequantize_kernel(vx, ib, iqs, v);
const int64_t iy0 = (i0203*ne01 + i01)*ne00 + iybs + iqs;
y[iy0 + 0] = ggml_cuda_cast<dst_t>(v.x);
y[iy0 + y_offset] = ggml_cuda_cast<dst_t>(v.y);
const int64_t iy0 = (i0203*ne01 + i01)*ne00 + iybs + iqs;
y[iy0 + 0] = ggml_cuda_cast<dst_t>(v.x);
y[iy0 + y_offset] = ggml_cuda_cast<dst_t>(v.y);
}
}
}
@@ -492,7 +492,7 @@ static void dequantize_block_cuda(const void * vx, dst_t * y,
const int64_t s01, const int64_t s02, const int64_t s03, cudaStream_t stream) {
const int64_t ne0203 = ne02*ne03;
const uint3 ne02_fdv = init_fastdiv_values(ne02);
const dim3 num_blocks((ne00 + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE), ne01, (int)std::min(ne0203, (int64_t)65535));
const dim3 num_blocks((ne00 + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE), (int)std::min(ne01, (int64_t)65535), (int)std::min(ne0203, (int64_t)65535));
dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>
(vx, y, ne00, ne01, ne0203, ne02_fdv, s01, s02, s03);
}
@@ -628,18 +628,18 @@ static __global__ void convert_unary(
return;
}
const int64_t i01 = blockIdx.y;
const src_t * x = (const src_t *) vx;
for (int64_t i0203 = blockIdx.z; i0203 < ne0203; i0203 += gridDim.z) {
const uint2 dm = fast_div_modulo((uint32_t)i0203, ne02);
const int64_t i02 = dm.y;
const int64_t i03 = dm.x;
for (int64_t i01 = blockIdx.y; i01 < ne01; i01 += gridDim.y) {
for (int64_t i0203 = blockIdx.z; i0203 < ne0203; i0203 += gridDim.z) {
const uint2 dm = fast_div_modulo((uint32_t)i0203, ne02);
const int64_t i02 = dm.y;
const int64_t i03 = dm.x;
const int64_t ix = i03*s03 + i02*s02 + i01*s01 + i00;
const int64_t iy = (i0203*ne01 + i01)*ne00 + i00;
y[iy] = ggml_cuda_cast<dst_t>(x[ix]);
const int64_t ix = i03*s03 + i02*s02 + i01*s01 + i00;
const int64_t iy = (i0203*ne01 + i01)*ne00 + i00;
y[iy] = ggml_cuda_cast<dst_t>(x[ix]);
}
}
}
@@ -649,7 +649,7 @@ static void convert_unary_cuda(const void * vx, dst_t * y,
const int64_t s01, const int64_t s02, const int64_t s03, cudaStream_t stream) {
const int64_t ne0203 = ne02*ne03;
const uint3 ne02_fdv = init_fastdiv_values(ne02);
const dim3 num_blocks((ne00 + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE, ne01, (int)std::min(ne0203, (int64_t)65535));
const dim3 num_blocks((ne00 + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE, (int)std::min(ne01, (int64_t)65535), (int)std::min(ne0203, (int64_t)65535));
convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>
(vx, y, ne00, ne01, ne0203, ne02_fdv, s01, s02, s03);
}

View File

@@ -590,6 +590,7 @@ struct vk_device_struct {
vk_queue transfer_queue;
bool single_queue;
bool support_async;
bool async_use_transfer_queue;
uint32_t subgroup_size;
uint32_t subgroup_size_log2;
uint32_t shader_core_count;
@@ -1858,6 +1859,10 @@ struct ggml_backend_vk_context {
vk_context_ref compute_ctx;
vk_context_ref transfer_ctx;
vk_semaphore transfer_semaphore;
uint64_t transfer_semaphore_last_submitted {};
std::vector<vk_context_ref> tensor_ctxs;
std::vector<vk::DescriptorPool> descriptor_pools;
@@ -1866,6 +1871,7 @@ struct ggml_backend_vk_context {
uint32_t pipeline_descriptor_set_requirements {};
vk_command_pool compute_cmd_pool;
vk_command_pool transfer_cmd_pool;
// number of additional consecutive nodes that are being fused with the
// node currently being processed
@@ -5391,13 +5397,19 @@ static vk_device ggml_vk_get_device(size_t idx) {
ggml_vk_load_shaders(device);
const bool prefers_transfer_queue = device->vendor_id == VK_VENDOR_ID_AMD && device->architecture != AMD_GCN;
if (!device->single_queue) {
const uint32_t transfer_queue_index = compute_queue_family_index == transfer_queue_family_index ? 1 : 0;
ggml_vk_create_queue(device, device->transfer_queue, transfer_queue_family_index, transfer_queue_index, { vk::PipelineStageFlagBits::eTransfer }, true);
device->async_use_transfer_queue = prefers_transfer_queue || (getenv("GGML_VK_ASYNC_USE_TRANSFER_QUEUE") != nullptr);
} else {
// TODO: Use pointer or reference to avoid copy
device->transfer_queue.copyFrom(device->compute_queue);
device->transfer_queue.cmd_pool.init(device, &device->transfer_queue);
device->async_use_transfer_queue = false;
}
device->buffer_type = {
@@ -5871,6 +5883,15 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
ctx->almost_ready_fence = ctx->device->device.createFence({});
ctx->compute_cmd_pool.init(ctx->device, &ctx->device->compute_queue);
if (ctx->device->async_use_transfer_queue) {
vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eTimeline, 0 };
vk::SemaphoreCreateInfo ci{};
ci.setPNext(&tci);
ctx->transfer_semaphore.s = ctx->device->device.createSemaphore(ci);
ctx->transfer_semaphore.value = 0;
ctx->transfer_cmd_pool.init(ctx->device, &ctx->device->transfer_queue);
}
if (vk_perf_logger_enabled) {
ctx->perf_logger = std::unique_ptr<vk_perf_logger>(new vk_perf_logger());
@@ -6419,6 +6440,47 @@ static void ggml_vk_ctx_begin(vk_device& device, vk_context& subctx) {
subctx->s = subctx->seqs[subctx->seqs.size() - 1].data();
}
static vk_context ggml_vk_get_compute_ctx(ggml_backend_vk_context * ctx) {
if (!ctx->compute_ctx.expired()) {
return ctx->compute_ctx.lock();
}
vk_context result = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
ctx->compute_ctx = result;
ggml_vk_ctx_begin(ctx->device, result);
if (ctx->device->async_use_transfer_queue && ctx->transfer_semaphore_last_submitted < ctx->transfer_semaphore.value) {
result->s->wait_semaphores.push_back(ctx->transfer_semaphore);
ctx->transfer_semaphore_last_submitted = ctx->transfer_semaphore.value;
}
return result;
}
// Submit any pending transfer queue work and signal the transfer semaphore.
// The next compute context created via ggml_vk_get_compute_ctx will wait on this semaphore.
// Returns true if work was submitted.
static bool ggml_vk_submit_transfer_ctx(ggml_backend_vk_context * ctx) {
if (!ctx->device->async_use_transfer_queue || ctx->transfer_ctx.expired()) {
return false;
}
vk_context cpy_ctx = ctx->transfer_ctx.lock();
ggml_vk_ctx_end(cpy_ctx);
for (auto& cpy : cpy_ctx->in_memcpys) {
memcpy(cpy.dst, cpy.src, cpy.n);
}
ctx->transfer_semaphore.value++;
cpy_ctx->seqs.back().back().signal_semaphores.push_back(ctx->transfer_semaphore);
ggml_vk_submit(cpy_ctx, {});
ctx->transfer_ctx.reset();
return true;
}
static size_t ggml_vk_align_size(size_t width, size_t align) {
VK_LOG_DEBUG("ggml_vk_align_size(" << width << ", " << align << ")");
return CEIL_DIV(width, align) * align;
@@ -7512,6 +7574,18 @@ static bool ggml_vk_should_use_mmvq(const vk_device& device, uint32_t m, uint32_
return false;
}
if (device->driver_id == vk::DriverId::eIntelProprietaryWindows) {
// Intel Windows proprietary driver tuning
switch (src0_type) {
case GGML_TYPE_MXFP4:
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
return false;
default:
return true;
}
}
switch (src0_type) {
// From tests on A770 Linux, may need more tuning
case GGML_TYPE_Q4_0:
@@ -12529,15 +12603,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
}
}
vk_context compute_ctx;
if (ctx->compute_ctx.expired()) {
compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
ctx->compute_ctx = compute_ctx;
ggml_vk_ctx_begin(ctx->device, compute_ctx);
} else {
compute_ctx = ctx->compute_ctx.lock();
}
vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx);
{
// This logic detects dependencies between modes in the graph and calls ggml_vk_sync_buffers
@@ -13055,6 +13121,9 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
ctx->prealloc_x_need_sync = ctx->prealloc_y_need_sync = ctx->prealloc_split_k_need_sync = false;
ggml_vk_command_pool_cleanup(ctx->device, ctx->compute_cmd_pool);
if (ctx->device->async_use_transfer_queue) {
ggml_vk_command_pool_cleanup(ctx->device, ctx->transfer_cmd_pool);
}
for (size_t i = 0; i < ctx->gc.semaphores.size(); i++) {
ctx->device->device.destroySemaphore({ ctx->gc.semaphores[i].s });
@@ -13116,6 +13185,11 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
ctx->descriptor_sets.clear();
ctx->compute_cmd_pool.destroy(ctx->device->device);
if (ctx->device->async_use_transfer_queue) {
ctx->device->device.destroySemaphore(ctx->transfer_semaphore.s);
ctx->transfer_cmd_pool.destroy(ctx->device->device);
}
if (vk_perf_logger_enabled) {
ctx->perf_logger->print_timings(true);
}
@@ -13387,34 +13461,38 @@ static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor
ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context;
vk_context compute_ctx;
vk_context cpy_ctx;
if (ctx->compute_ctx.expired()) {
// Initialize new transfer context
compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
ctx->compute_ctx = compute_ctx;
ggml_vk_ctx_begin(ctx->device, compute_ctx);
if (ctx->device->async_use_transfer_queue) {
if (ctx->transfer_ctx.expired()) {
// Initialize new transfer context
cpy_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool);
ctx->transfer_ctx = cpy_ctx;
ggml_vk_ctx_begin(ctx->device, cpy_ctx);
} else {
cpy_ctx = ctx->transfer_ctx.lock();
}
} else {
compute_ctx = ctx->compute_ctx.lock();
cpy_ctx = ggml_vk_get_compute_ctx(ctx);
}
vk_buffer buf = buf_ctx->dev_buffer;
auto dst_offset = vk_tensor_offset(tensor) + tensor->view_offs + offset;
bool ret = ggml_vk_buffer_write_async(compute_ctx, buf, dst_offset, data, size);
bool ret = ggml_vk_buffer_write_async(cpy_ctx, buf, dst_offset, data, size);
if (!ret) {
ggml_vk_ensure_sync_staging_buffer(ctx, size);
ggml_vk_sync_buffers(nullptr, compute_ctx);
ggml_vk_sync_buffers(nullptr, cpy_ctx);
vk::BufferCopy buffer_cpy;
buffer_cpy.srcOffset = 0;
buffer_cpy.dstOffset = dst_offset;
buffer_cpy.size = size;
compute_ctx->s->buffer.copyBuffer(ctx->sync_staging->buffer, buf->buffer, { buffer_cpy });
deferred_memcpy(ctx->sync_staging->ptr, data, size, &compute_ctx->in_memcpys);
cpy_ctx->s->buffer.copyBuffer(ctx->sync_staging->buffer, buf->buffer, { buffer_cpy });
deferred_memcpy(ctx->sync_staging->ptr, data, size, &cpy_ctx->in_memcpys);
ggml_vk_synchronize(ctx);
}
}
@@ -13426,16 +13504,7 @@ static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_
ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context;
vk_context compute_ctx;
if (ctx->compute_ctx.expired()) {
// Initialize new transfer context
compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
ctx->compute_ctx = compute_ctx;
ggml_vk_ctx_begin(ctx->device, compute_ctx);
} else {
compute_ctx = ctx->compute_ctx.lock();
}
vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx);
vk_buffer buf = buf_ctx->dev_buffer;
@@ -13458,31 +13527,60 @@ static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_
}
}
static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
VK_LOG_DEBUG("ggml_backend_vk_cpy_tensor_async()");
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
if ((dst->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || dst->buffer->buft == ggml_backend_vk_host_buffer_type()) && ggml_backend_buffer_is_vk(src->buffer)) {
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend_dst->context;
if (dst->buffer->buft != ggml_backend_vk_get_default_buffer_type(backend_dst)) {
return false;
}
ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
vk_buffer dst_buf = dst_buf_ctx->dev_buffer;
if (ggml_backend_buffer_is_vk(src->buffer)) {
ggml_backend_vk_buffer_context * src_buf_ctx = (ggml_backend_vk_buffer_context *)src->buffer->context;
ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
vk_context compute_ctx;
if (ctx->compute_ctx.expired()) {
// Initialize new transfer context
compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
ctx->compute_ctx = compute_ctx;
ggml_vk_ctx_begin(ctx->device, compute_ctx);
} else {
compute_ctx = ctx->compute_ctx.lock();
// Async copy only works within the same device
if (src_buf_ctx->dev_buffer->device != dst_buf->device) {
return false;
}
vk_buffer src_buf = src_buf_ctx->dev_buffer;
vk_buffer dst_buf = dst_buf_ctx->dev_buffer;
vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx);
ggml_vk_buffer_copy_async(compute_ctx, dst_buf, vk_tensor_offset(dst) + dst->view_offs, src_buf, vk_tensor_offset(src) + src->view_offs, ggml_nbytes(src));
ggml_vk_buffer_copy_async(compute_ctx, dst_buf, vk_tensor_offset(dst) + dst->view_offs,
src_buf_ctx->dev_buffer, vk_tensor_offset(src) + src->view_offs,
ggml_nbytes(src));
return true;
}
if (ggml_backend_buffer_is_host(src->buffer)) {
vk_buffer pinned_buf = nullptr;
size_t pinned_offset = 0;
ggml_vk_host_get(ctx->device, src->data, pinned_buf, pinned_offset);
if (pinned_buf == nullptr) {
return false;
}
vk_context cpy_ctx;
if (ctx->device->async_use_transfer_queue) {
if (ctx->transfer_ctx.expired()) {
cpy_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool);
ctx->transfer_ctx = cpy_ctx;
ggml_vk_ctx_begin(ctx->device, cpy_ctx);
} else {
cpy_ctx = ctx->transfer_ctx.lock();
}
} else {
cpy_ctx = ggml_vk_get_compute_ctx(ctx);
}
return ggml_vk_buffer_write_async(cpy_ctx, dst_buf,
vk_tensor_offset(dst) + dst->view_offs,
src->data, ggml_nbytes(src));
}
GGML_UNUSED(backend_src);
return false;
}
@@ -13491,6 +13589,10 @@ static void ggml_vk_synchronize(ggml_backend_vk_context * ctx) {
bool do_transfer = !ctx->compute_ctx.expired();
if (ggml_vk_submit_transfer_ctx(ctx)) {
ctx->submit_pending = true;
}
vk_context compute_ctx;
if (do_transfer) {
compute_ctx = ctx->compute_ctx.lock();
@@ -13506,7 +13608,22 @@ static void ggml_vk_synchronize(ggml_backend_vk_context * ctx) {
}
if (ctx->submit_pending) {
{
if (ctx->device->async_use_transfer_queue && ctx->transfer_semaphore_last_submitted < ctx->transfer_semaphore.value) {
vk::TimelineSemaphoreSubmitInfo tl_info{
1, &ctx->transfer_semaphore.value,
0, nullptr,
};
vk::PipelineStageFlags stage = ctx->device->transfer_queue.stage_flags;
vk::SubmitInfo si{
1, &ctx->transfer_semaphore.s, &stage,
0, nullptr,
0, nullptr,
};
si.setPNext(&tl_info);
std::lock_guard<std::mutex> guard(queue_mutex);
ctx->device->compute_queue.queue.submit({ si }, ctx->fence);
ctx->transfer_semaphore_last_submitted = ctx->transfer_semaphore.value;
} else {
std::lock_guard<std::mutex> guard(queue_mutex);
ctx->device->compute_queue.queue.submit({}, ctx->fence);
}
@@ -13972,6 +14089,8 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
bool first_node_in_batch = true; // true if next node will be first node in a batch
int submit_node_idx = 0; // index to first node in a batch
ggml_vk_submit_transfer_ctx(ctx);
vk_context compute_ctx;
if (vk_perf_logger_enabled) {
// allocate/resize the query pool
@@ -13997,9 +14116,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
std::fill(ctx->query_node_idx.begin(), ctx->query_node_idx.end(), 0);
GGML_ASSERT(ctx->compute_ctx.expired());
compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
ctx->compute_ctx = compute_ctx;
ggml_vk_ctx_begin(ctx->device, compute_ctx);
compute_ctx = ggml_vk_get_compute_ctx(ctx);
ctx->query_idx = 0;
compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
}
@@ -14009,13 +14126,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
if (ctx->prealloc_size_add_rms_partials) {
ggml_vk_preallocate_buffers(ctx, nullptr);
if (ctx->compute_ctx.expired()) {
compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
ctx->compute_ctx = compute_ctx;
ggml_vk_ctx_begin(ctx->device, compute_ctx);
} else {
compute_ctx = ctx->compute_ctx.lock();
}
compute_ctx = ggml_vk_get_compute_ctx(ctx);
// initialize partial sums to zero.
ggml_vk_buffer_memset_async(compute_ctx, ctx->prealloc_add_rms_partials, 0, 0, ctx->prealloc_size_add_rms_partials);
ggml_vk_sync_buffers(ctx, compute_ctx);
@@ -14238,13 +14349,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
bool enqueued = ggml_vk_build_graph(ctx, cgraph, i, cgraph->nodes[submit_node_idx], submit_node_idx, i + ctx->num_additional_fused_ops >= last_node, almost_ready, submit);
if (vk_perf_logger_enabled && enqueued) {
if (ctx->compute_ctx.expired()) {
compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
ctx->compute_ctx = compute_ctx;
ggml_vk_ctx_begin(ctx->device, compute_ctx);
} else {
compute_ctx = ctx->compute_ctx.lock();
}
compute_ctx = ggml_vk_get_compute_ctx(ctx);
if (!vk_perf_logger_concurrent) {
// track a single node/fusion for the current query
ctx->query_nodes[ctx->query_idx] = cgraph->nodes[i];
@@ -14579,16 +14684,9 @@ static void ggml_backend_vk_event_record(ggml_backend_t backend, ggml_backend_ev
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
vk_event *vkev = (vk_event *)event->context;
vk_context compute_ctx;
ggml_vk_submit_transfer_ctx(ctx);
if (ctx->compute_ctx.expired()) {
// Initialize new transfer context
compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
ctx->compute_ctx = compute_ctx;
ggml_vk_ctx_begin(ctx->device, compute_ctx);
} else {
compute_ctx = ctx->compute_ctx.lock();
}
vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx);
// the backend interface doesn't have an explicit reset, so reset it here
// before we record the command to set it
@@ -14609,16 +14707,7 @@ static void ggml_backend_vk_event_wait(ggml_backend_t backend, ggml_backend_even
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
vk_event *vkev = (vk_event *)event->context;
vk_context compute_ctx;
if (ctx->compute_ctx.expired()) {
// Initialize new transfer context
compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
ctx->compute_ctx = compute_ctx;
ggml_vk_ctx_begin(ctx->device, compute_ctx);
} else {
compute_ctx = ctx->compute_ctx.lock();
}
vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx);
ggml_vk_wait_events(compute_ctx, {vkev->event});
ggml_vk_ctx_end(compute_ctx);
@@ -14631,7 +14720,7 @@ static ggml_backend_i ggml_backend_vk_interface = {
/* .free = */ ggml_backend_vk_free,
/* .set_tensor_async = */ ggml_backend_vk_set_tensor_async,
/* .get_tensor_async = */ ggml_backend_vk_get_tensor_async,
/* .cpy_tensor_async = */ NULL, // ggml_backend_vk_cpy_tensor_async,
/* .cpy_tensor_async = */ ggml_backend_vk_cpy_tensor_async,
/* .synchronize = */ ggml_backend_vk_synchronize,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,
@@ -15367,11 +15456,25 @@ static bool ggml_backend_vk_device_supports_buft(ggml_backend_dev_t dev, ggml_ba
return buft_ctx->device->idx == ctx->device;
}
static int64_t ggml_vk_get_op_batch_size(const ggml_tensor * op) {
switch (op->op) {
case GGML_OP_GET_ROWS:
return 0;
case GGML_OP_MUL_MAT:
return op->ne[1];
case GGML_OP_MUL_MAT_ID:
case GGML_OP_ROPE:
case GGML_OP_ROPE_BACK:
return op->ne[2];
default:
return ggml_nrows(op);
}
}
static bool ggml_backend_vk_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
ggml_backend_vk_device_context * dev_ctx = (ggml_backend_vk_device_context *)dev->context;
return (op->ne[1] >= dev_ctx->op_offload_min_batch_size && op->op != GGML_OP_GET_ROWS) ||
(op->ne[2] >= dev_ctx->op_offload_min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
return ggml_vk_get_op_batch_size(op) >= dev_ctx->op_offload_min_batch_size;
}
static ggml_backend_event_t ggml_backend_vk_device_event_new(ggml_backend_dev_t dev) {

View File

@@ -68,6 +68,7 @@ struct ggml_webgpu_shader_lib_context {
size_t wg_mem_limit_bytes = 0;
bool inplace = false;
bool overlap = false;
bool src_overlap = false;
bool supports_subgroup_matrix = false;
uint32_t sg_mat_m = 0;
uint32_t sg_mat_n = 0;
@@ -179,9 +180,10 @@ struct ggml_webgpu_binary_pipeline_key {
int op;
bool inplace;
bool overlap;
bool src_overlap;
bool operator==(const ggml_webgpu_binary_pipeline_key & other) const {
return type == other.type && op == other.op && inplace == other.inplace && overlap == other.overlap;
return type == other.type && op == other.op && inplace == other.inplace && overlap == other.overlap && src_overlap == other.src_overlap;
}
};
@@ -192,6 +194,7 @@ struct ggml_webgpu_binary_pipeline_key_hash {
ggml_webgpu_hash_combine(seed, key.op);
ggml_webgpu_hash_combine(seed, key.inplace);
ggml_webgpu_hash_combine(seed, key.overlap);
ggml_webgpu_hash_combine(seed, key.src_overlap);
return seed;
}
};
@@ -1044,6 +1047,7 @@ class ggml_webgpu_shader_lib {
.op = context.dst->op,
.inplace = context.inplace,
.overlap = context.overlap,
.src_overlap = context.src_overlap,
};
auto it = binary_pipelines.find(key);
@@ -1076,6 +1080,9 @@ class ggml_webgpu_shader_lib {
} else if (key.overlap) {
defines.push_back("OVERLAP");
variant += "_overlap";
} else if (key.src_overlap) {
defines.push_back("SRC_OVERLAP");
variant += "_src_overlap";
}
defines.push_back(std::string("WG_SIZE=") + std::to_string(context.max_wg_size));

View File

@@ -788,6 +788,7 @@ static bool ggml_webgpu_tensor_overlap(ggml_tensor * a, ggml_tensor * b) {
struct binary_overlap_flags {
bool inplace; // src0 == dst
bool overlap; // src1 == dst
bool src_overlap;
};
static binary_overlap_flags ggml_webgpu_detect_binary_overlap(ggml_tensor * src0,
@@ -796,6 +797,7 @@ static binary_overlap_flags ggml_webgpu_detect_binary_overlap(ggml_tensor * src0
binary_overlap_flags flags = {};
flags.inplace = ggml_webgpu_tensor_equal(src0, dst);
flags.overlap = ggml_webgpu_tensor_overlap(src1, dst);
flags.src_overlap = ggml_webgpu_tensor_overlap(src0, src1);
return flags;
}
@@ -1353,6 +1355,7 @@ static webgpu_command ggml_webgpu_binary_op(webgpu_context & ctx,
.max_wg_size = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup,
.inplace = flags.inplace,
.overlap = flags.overlap,
.src_overlap = flags.src_overlap,
};
webgpu_pipeline pipeline = ctx->shader_lib->get_binary_pipeline(shader_lib_ctx);
@@ -1361,11 +1364,28 @@ static webgpu_command ggml_webgpu_binary_op(webgpu_context & ctx,
uint32_t ne = (uint32_t) ggml_nelements(dst);
size_t src0_webgpu_tensor_align_offset = ggml_webgpu_tensor_align_offset(ctx, src0);
size_t src1_webgpu_tensor_align_offset = ggml_webgpu_tensor_align_offset(ctx, src1);
uint32_t offset_merged_src0 = 0;
uint32_t offset_merged_src1 = 0;
if (flags.src_overlap) {
size_t min_off = std::min(src0_webgpu_tensor_align_offset, src1_webgpu_tensor_align_offset);
offset_merged_src0 = (uint32_t) ((src0_webgpu_tensor_align_offset - min_off) / ggml_type_size(src0->type));
offset_merged_src1 = (uint32_t) ((src1_webgpu_tensor_align_offset - min_off) / ggml_type_size(src0->type));
}
std::vector<uint32_t> params = {
ne,
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src0) / ggml_type_size(src0->type)),
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)),
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
offset_merged_src0,
offset_merged_src1,
(uint32_t) (src0->nb[0] / ggml_type_size(src0->type)),
(uint32_t) (src0->nb[1] / ggml_type_size(src0->type)),
(uint32_t) (src0->nb[2] / ggml_type_size(src0->type)),
(uint32_t) (src0->nb[3] / ggml_type_size(src0->type)),
(uint32_t) (src1->nb[0] / ggml_type_size(src1->type)),
(uint32_t) (src1->nb[1] / ggml_type_size(src1->type)),
(uint32_t) (src1->nb[2] / ggml_type_size(src1->type)),
@@ -1381,25 +1401,43 @@ static webgpu_command ggml_webgpu_binary_op(webgpu_context & ctx,
std::vector<wgpu::BindGroupEntry> entries;
entries.push_back({
.binding = 0,
.buffer = ggml_webgpu_tensor_buf(src0),
.offset = ggml_webgpu_tensor_align_offset(ctx, src0),
.size = ggml_webgpu_tensor_binding_size(ctx, src0),
});
entries.push_back({
.binding = 1,
.buffer = ggml_webgpu_tensor_buf(src1),
.offset = ggml_webgpu_tensor_align_offset(ctx, src1),
.size = ggml_webgpu_tensor_binding_size(ctx, src1),
});
if (!flags.inplace && !flags.overlap) {
entries.push_back({ .binding = 2,
.buffer = ggml_webgpu_tensor_buf(dst),
.offset = ggml_webgpu_tensor_align_offset(ctx, dst),
.size = ggml_webgpu_tensor_binding_size(ctx, dst) });
if (flags.src_overlap) {
size_t merged_offset = std::min(src0_webgpu_tensor_align_offset, src1_webgpu_tensor_align_offset);
size_t merged_end = std::max(src0_webgpu_tensor_align_offset + ggml_webgpu_tensor_binding_size(ctx, src0),
src1_webgpu_tensor_align_offset + ggml_webgpu_tensor_binding_size(ctx, src1));
entries.push_back({
.binding = 0,
.buffer = ggml_webgpu_tensor_buf(src0),
.offset = merged_offset,
.size = merged_end - merged_offset,
});
entries.push_back({
.binding = 1,
.buffer = ggml_webgpu_tensor_buf(dst),
.offset = ggml_webgpu_tensor_align_offset(ctx, dst),
.size = ggml_webgpu_tensor_binding_size(ctx, dst),
});
} else {
entries.push_back({
.binding = 0,
.buffer = ggml_webgpu_tensor_buf(src0),
.offset = src0_webgpu_tensor_align_offset,
.size = ggml_webgpu_tensor_binding_size(ctx, src0),
});
entries.push_back({
.binding = 1,
.buffer = ggml_webgpu_tensor_buf(src1),
.offset = src1_webgpu_tensor_align_offset,
.size = ggml_webgpu_tensor_binding_size(ctx, src1),
});
if (!flags.inplace && !flags.overlap) {
entries.push_back({
.binding = 2,
.buffer = ggml_webgpu_tensor_buf(dst),
.offset = ggml_webgpu_tensor_align_offset(ctx, dst),
.size = ggml_webgpu_tensor_binding_size(ctx, dst),
});
}
}
uint32_t wg_x = CEIL_DIV(ne, decisions->wg_size);
@@ -2816,10 +2854,8 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
case GGML_OP_SUB:
case GGML_OP_MUL:
case GGML_OP_DIV:
// TODO: support non-contiguous tensors, e.g. for MOE_EXPERT_REDUCE
// see https://github.com/ggml-org/llama.cpp/pull/16857
supports_op = (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && (src0->type == op->type) &&
(src1->type == op->type) && ggml_is_contiguous(src0) && ggml_is_contiguous(src1);
(src1->type == op->type);
break;
case GGML_OP_CPY:
case GGML_OP_CONT:

View File

@@ -7,6 +7,13 @@ struct Params {
offset_src0: u32,
offset_src1: u32,
offset_dst: u32,
offset_merged_src0: u32,
offset_merged_src1: u32,
stride_src0_0: u32,
stride_src0_1: u32,
stride_src0_2: u32,
stride_src0_3: u32,
stride_src1_0: u32,
stride_src1_1: u32,
@@ -23,6 +30,21 @@ struct Params {
b_ne3: u32,
};
// Map a flat element index onto the (possibly non-contiguous) src0 tensor:
// unravel the index into 4D coordinates using the a_ne* extents, then apply
// the per-dimension src0 strides to get the flat storage-buffer index.
fn src0_index(_i: u32) -> u32 {
var i = _i;
// NOTE(review): the unraveling uses the a_ne* (src0) extents — assumes the
// dispatch covers exactly src0's logical shape; confirm against the host code.
let a_i3 = i / (params.a_ne2 * params.a_ne1 * params.a_ne0);
i = i % (params.a_ne2 * params.a_ne1 * params.a_ne0);
let a_i2 = i / (params.a_ne1 * params.a_ne0);
i = i % (params.a_ne1 * params.a_ne0);
let a_i1 = i / params.a_ne0;
let a_i0 = i % params.a_ne0;
// Combine coordinates with element strides (supports non-contiguous layouts).
return a_i0 * params.stride_src0_0 +
a_i1 * params.stride_src0_1 +
a_i2 * params.stride_src0_2 +
a_i3 * params.stride_src0_3;
}
fn src1_index(_i: u32) -> u32 {
var i = _i;
let a_i3 = i / (params.a_ne2 * params.a_ne1 * params.a_ne0);
@@ -53,17 +75,22 @@ fn src1_index(_i: u32) -> u32 {
#define DataType f16
#endif
#ifdef SRC_OVERLAP
@group(0) @binding(0)
var<storage, read_write> merged_src: array<DataType>;
@group(0) @binding(1)
var<storage, read_write> dst: array<DataType>;
@group(0) @binding(2)
var<uniform> params: Params;
#else
@group(0) @binding(0)
var<storage, read_write> src0: array<DataType>;
@group(0) @binding(1)
var<storage, read_write> src1 : array<DataType>;
#ifdef INPLACE
@group(0) @binding(2)
var<uniform> params: Params;
#elif defined(OVERLAP)
#if defined(INPLACE) || defined(OVERLAP)
@group(0) @binding(2)
var<uniform> params: Params;
@@ -74,6 +101,7 @@ var<storage, read_write> dst: array<DataType>;
@group(0) @binding(3)
var<uniform> params: Params;
#endif
#endif
fn op(a: DataType, b: DataType) -> DataType {
#ifdef OP_ADD
@@ -87,13 +115,17 @@ fn op(a: DataType, b: DataType) -> DataType {
#endif
}
fn update(dst_i: u32, src0_i: u32, src1_i: u32){
fn update(dst_i: u32, src0_i: u32, src1_i: u32) {
#ifdef SRC_OVERLAP
let result = op(merged_src[src0_i], merged_src[src1_i]);
#else
let result = op(src0[src0_i], src1[src1_i]);
#endif
#ifdef INPLACE
src0[dst_i] = result;
src0[src0_i] = result;
#elif defined(OVERLAP)
src1[dst_i] = result;
src1[src1_i] = result;
#else
dst[dst_i] = result;
#endif
@@ -102,6 +134,8 @@ fn update(dst_i: u32, src0_i: u32, src1_i: u32){
@compute @workgroup_size(WG_SIZE)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
if (gid.x < params.ne) {
update(params.offset_dst + gid.x, params.offset_src0 + gid.x, params.offset_src1 + src1_index(gid.x));
let src0_i = params.offset_src0 + params.offset_merged_src0 + src0_index(gid.x);
let src1_i = params.offset_src1 + params.offset_merged_src1 + src1_index(gid.x);
update(params.offset_dst + gid.x, src0_i, src1_i);
}
}

View File

@@ -1,11 +1,43 @@
#!/usr/bin/env bash
#!/bin/sh
# vim: set ts=4 sw=4 et:
wget https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
unzip wikitext-2-raw-v1.zip
ZIP="wikitext-2-raw-v1.zip"
FILE="wikitext-2-raw/wiki.test.raw"
URL="https://huggingface.co/datasets/ggml-org/ci/resolve/main/$ZIP"
echo "Usage:"
echo ""
echo " ./llama-perplexity -m model.gguf -f wikitext-2-raw/wiki.test.raw [other params]"
echo ""
# die MSG...: print each argument on its own line to stderr, then exit 1.
die() {
printf "%s\n" "$@" >&2
exit 1
}
exit 0
# have_cmd CMD...: succeed only if every named command is available in PATH.
have_cmd() {
# "for cmd" with no 'in' list iterates over the function's arguments ("$@").
for cmd; do
command -v "$cmd" >/dev/null || return
done
}
# dl URL DEST: download URL to DEST unless DEST already exists.
# Prefers wget, falls back to curl; aborts via die if neither is installed.
dl() {
# Skip the download when the target file is already present.
[ -f "$2" ] && return
if have_cmd wget; then
wget "$1" -O "$2"
elif have_cmd curl; then
# -L follows redirects (the Hugging Face resolve URL redirects to the CDN).
curl -L "$1" -o "$2"
else
die "Please install wget or curl"
fi
}
have_cmd unzip || die "Please install unzip"
if [ ! -f "$FILE" ]; then
dl "$URL" "$ZIP" || exit
unzip -o "$ZIP" || exit
rm -f -- "$ZIP"
fi
cat <<EOF
Usage:
llama-perplexity -m model.gguf -f $FILE [other params]
EOF

View File

@@ -5,7 +5,7 @@ import os
import sys
import subprocess
HTTPLIB_VERSION = "refs/tags/v0.34.0"
HTTPLIB_VERSION = "refs/tags/v0.35.0"
vendor = {
"https://github.com/nlohmann/json/releases/latest/download/json.hpp": "vendor/nlohmann/json.hpp",
@@ -14,8 +14,8 @@ vendor = {
"https://raw.githubusercontent.com/nothings/stb/refs/heads/master/stb_image.h": "vendor/stb/stb_image.h",
# not using latest tag to avoid this issue: https://github.com/ggml-org/llama.cpp/pull/17179#discussion_r2515877926
# "https://github.com/mackron/miniaudio/raw/refs/tags/0.11.23/miniaudio.h": "vendor/miniaudio/miniaudio.h",
"https://github.com/mackron/miniaudio/raw/669ed3e844524fcd883231b13095baee9f6de304/miniaudio.h": "vendor/miniaudio/miniaudio.h",
# "https://github.com/mackron/miniaudio/raw/refs/tags/0.11.24/miniaudio.h": "vendor/miniaudio/miniaudio.h",
"https://github.com/mackron/miniaudio/raw/13d161bc8d856ad61ae46b798bbeffc0f49808e8/miniaudio.h": "vendor/miniaudio/miniaudio.h",
f"https://raw.githubusercontent.com/yhirose/cpp-httplib/{HTTPLIB_VERSION}/httplib.h": "httplib.h",
f"https://raw.githubusercontent.com/yhirose/cpp-httplib/{HTTPLIB_VERSION}/split.py": "split.py",

View File

@@ -2977,6 +2977,7 @@ struct test_bin_bcast : public test_case {
const std::array<int, 4> nr;
int nf; // number of fused ops, nf == 1 -> single op (no fusion)
bool perm1; // permute src1?
bool src_overlap; // src0 and src1 are overlapping views of the same buffer
bool run_whole_graph() override { return nf > 1; }
@@ -2992,8 +2993,8 @@ struct test_bin_bcast : public test_case {
std::array<int64_t, 4> ne = {10, 10, 1, 1},
std::array<int, 4> nr = {1, 2, 1, 1},
int nf = 1,
bool perm1 = false)
: op(op), type(type), ne(ne), nr(nr), nf(nf), perm1(perm1) {}
bool perm1 = false, bool src_overlap = false)
: op(op), type(type), ne(ne), nr(nr), nf(nf), perm1(perm1), src_overlap(src_overlap) {}
ggml_tensor * build_graph(ggml_context * ctx) override {
GGML_ASSERT(nf <= 16);
@@ -3008,6 +3009,8 @@ struct test_bin_bcast : public test_case {
b[i] = ggml_new_tensor_4d(ctx, type, ne[p[0]], ne[p[1]], ne[p[2]], ne[p[3]]);
b[i] = ggml_permute(ctx, b[i], p[0], p[1], p[2], p[3]);
} else if (src_overlap) {
b[i] = ggml_view_4d(ctx, a, ne[0], ne[1], ne[2], 2 * (ne[3] / 3), a->nb[1], a->nb[2], a->nb[3], (ne[3] / 3) * a->nb[3]);
} else {
b[i] = ggml_new_tensor(ctx, type, 4, ne.data());
}
@@ -3021,7 +3024,13 @@ struct test_bin_bcast : public test_case {
ggml_set_param(b[0]);
}
ggml_tensor * out = a;
ggml_tensor *out;
if (src_overlap) {
out = ggml_view_4d(ctx, a, ne[0], ne[1], ne[2], 2 * (ne[3] / 3), a->nb[1], a->nb[2], a->nb[3], 0);
} else {
out = a;
}
for (int i = 0; i < nf; ++i) {
out = op(ctx, out, b[i]);
@@ -7527,9 +7536,9 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
}
}
auto add_test_bin_bcast = [&](ggml_type type, std::array<int64_t, 4> ne, std::array<int, 4> nr, bool perm1 = false) {
auto add_test_bin_bcast = [&](ggml_type type, std::array<int64_t, 4> ne, std::array<int, 4> nr, bool perm1 = false, bool src_overlap = false) {
for (auto op : {ggml_add, ggml_sub, ggml_mul, ggml_div}) {
test_cases.emplace_back(new test_bin_bcast(op, type, ne, nr, 1, perm1));
test_cases.emplace_back(new test_bin_bcast(op, type, ne, nr, 1, perm1, src_overlap));
}
};
for (ggml_type type : {GGML_TYPE_F16, GGML_TYPE_F32}) {
@@ -7549,6 +7558,12 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
add_test_bin_bcast(type, {10, 5, 4, 3}, {2, 2, 2, 2}, perm1);
}
// src_overlap
add_test_bin_bcast(type, {10, 5, 4, 6}, {1, 1, 1, 1}, false, true);
add_test_bin_bcast(type, {10, 5, 4, 5}, {1, 1, 1, 1}, false, true);
add_test_bin_bcast(type, {1, 1, 120, 120}, {1, 1, 1, 1}, false, true);
add_test_bin_bcast(type, {1, 1, 4, 320}, {1, 1, 1, 1}, false, true);
// test case for k_bin_bcast_unravel in CUDA backend
add_test_bin_bcast(type, {1, 1, 65536, 1}, {256, 1, 1, 1});

View File

@@ -171,7 +171,6 @@ endif()
if (CPPHTTPLIB_OPENSSL_SUPPORT)
target_compile_definitions(${TARGET} PUBLIC CPPHTTPLIB_OPENSSL_SUPPORT) # used in server.cpp
if (APPLE AND CMAKE_SYSTEM_NAME STREQUAL "Darwin")
target_compile_definitions(${TARGET} PRIVATE CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN)
find_library(CORE_FOUNDATION_FRAMEWORK CoreFoundation REQUIRED)
find_library(SECURITY_FRAMEWORK Security REQUIRED)
target_link_libraries(${TARGET} PUBLIC ${CORE_FOUNDATION_FRAMEWORK} ${SECURITY_FRAMEWORK})

View File

@@ -2571,10 +2571,46 @@ find_content_type(const std::string &path,
}
}
// Extract the media type ("type/subtype") from a Content-Type header value
// per RFC 2045, e.g. "application/json; charset=utf-8" -> "application/json".
// When `params` is non-null, the trailing "key=value" parameters (charset,
// boundary, ...) are also collected into it, with keys whitespace-trimmed and
// values stripped of surrounding double quotes.
std::string
extract_media_type(const std::string &content_type,
std::map<std::string, std::string> *params = nullptr) {
// Extract type/subtype from Content-Type value (RFC 2045)
// e.g. "application/json; charset=utf-8" -> "application/json"
auto media_type = content_type;
auto semicolon_pos = media_type.find(';');
if (semicolon_pos != std::string::npos) {
auto param_str = media_type.substr(semicolon_pos + 1);
media_type = media_type.substr(0, semicolon_pos);
if (params) {
// Parse parameters: key=value pairs separated by ';'
split(param_str.data(), param_str.data() + param_str.size(), ';',
[&](const char *b, const char *e) {
std::string key;
std::string val;
// NOTE(review): splitting on every '=' means a value that itself
// contains '=' keeps only its last segment — confirm parameter
// values never embed an unquoted '='.
split(b, e, '=', [&](const char *b2, const char *e2) {
if (key.empty()) {
key.assign(b2, e2);
} else {
val.assign(b2, e2);
}
});
// Parameters with an empty key are silently dropped.
if (!key.empty()) {
params->emplace(trim_copy(key), trim_double_quotes_copy(val));
}
});
}
}
// Trim whitespace from media type
return trim_copy(media_type);
}
bool can_compress_content_type(const std::string &content_type) {
using udl::operator""_t;
auto tag = str2tag(content_type);
auto mime_type = extract_media_type(content_type);
auto tag = str2tag(mime_type);
switch (tag) {
case "image/svg+xml"_t:
@@ -2586,7 +2622,7 @@ bool can_compress_content_type(const std::string &content_type) {
case "text/event-stream"_t: return false;
default: return !content_type.rfind("text/", 0);
default: return !mime_type.rfind("text/", 0);
}
}
@@ -3141,7 +3177,8 @@ bool is_chunked_transfer_encoding(const Headers &headers) {
template <typename T, typename U>
bool prepare_content_receiver(T &x, int &status,
ContentReceiverWithProgress receiver,
bool decompress, U callback) {
bool decompress, size_t payload_max_length,
bool &exceed_payload_max_length, U callback) {
if (decompress) {
std::string encoding = x.get_header_value("Content-Encoding");
std::unique_ptr<decompressor> decompressor;
@@ -3157,12 +3194,22 @@ bool prepare_content_receiver(T &x, int &status,
if (decompressor) {
if (decompressor->is_valid()) {
size_t decompressed_size = 0;
ContentReceiverWithProgress out = [&](const char *buf, size_t n,
size_t off, size_t len) {
return decompressor->decompress(buf, n,
[&](const char *buf2, size_t n2) {
return receiver(buf2, n2, off, len);
});
return decompressor->decompress(
buf, n, [&](const char *buf2, size_t n2) {
// Guard against zip-bomb: check
// decompressed size against limit.
if (payload_max_length > 0 &&
(decompressed_size >= payload_max_length ||
n2 > payload_max_length - decompressed_size)) {
exceed_payload_max_length = true;
return false;
}
decompressed_size += n2;
return receiver(buf2, n2, off, len);
});
};
return callback(std::move(out));
} else {
@@ -3183,11 +3230,14 @@ template <typename T>
bool read_content(Stream &strm, T &x, size_t payload_max_length, int &status,
DownloadProgress progress,
ContentReceiverWithProgress receiver, bool decompress) {
bool exceed_payload_max_length = false;
return prepare_content_receiver(
x, status, std::move(receiver), decompress,
[&](const ContentReceiverWithProgress &out) {
x, status, std::move(receiver), decompress, payload_max_length,
exceed_payload_max_length, [&](const ContentReceiverWithProgress &out) {
auto ret = true;
auto exceed_payload_max_length = false;
// Note: exceed_payload_max_length may also be set by the decompressor
// wrapper in prepare_content_receiver when the decompressed payload
// size exceeds the limit.
if (is_chunked_transfer_encoding(x.headers)) {
auto result = read_content_chunked(strm, x, payload_max_length, out);
@@ -3603,12 +3653,11 @@ std::string normalize_query_string(const std::string &query) {
bool parse_multipart_boundary(const std::string &content_type,
std::string &boundary) {
auto boundary_keyword = "boundary=";
auto pos = content_type.find(boundary_keyword);
if (pos == std::string::npos) { return false; }
auto end = content_type.find(';', pos);
auto beg = pos + strlen(boundary_keyword);
boundary = trim_double_quotes_copy(content_type.substr(beg, end - beg));
std::map<std::string, std::string> params;
extract_media_type(content_type, &params);
auto it = params.find("boundary");
if (it == params.end()) { return false; }
boundary = it->second;
return !boundary.empty();
}
@@ -3776,11 +3825,7 @@ bool parse_accept_header(const std::string &s,
}
// Remove additional parameters from media type
auto param_pos = accept_entry.media_type.find(';');
if (param_pos != std::string::npos) {
accept_entry.media_type =
trim_copy(accept_entry.media_type.substr(0, param_pos));
}
accept_entry.media_type = extract_media_type(accept_entry.media_type);
// Basic validation of media type format
if (accept_entry.media_type.empty()) {
@@ -5610,7 +5655,7 @@ size_t Request::get_param_value_count(const std::string &key) const {
bool Request::is_multipart_form_data() const {
const auto &content_type = get_header_value("Content-Type");
return !content_type.rfind("multipart/form-data", 0);
return detail::extract_media_type(content_type) == "multipart/form-data";
}
// Multipart FormData implementation
@@ -7092,7 +7137,8 @@ bool Server::read_content(Stream &strm, Request &req, Response &res) {
return true;
})) {
const auto &content_type = req.get_header_value("Content-Type");
if (!content_type.find("application/x-www-form-urlencoded")) {
if (detail::extract_media_type(content_type) ==
"application/x-www-form-urlencoded") {
if (req.body.size() > CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH) {
res.status = StatusCode::PayloadTooLarge_413; // NOTE: should be 414?
output_error_log(Error::ExceedMaxPayloadSize, &req);
@@ -7479,45 +7525,63 @@ bool Server::routing(Request &req, Response &res, Stream &strm) {
if (detail::expect_content(req)) {
// Content reader handler
{
// Track whether the ContentReader was aborted due to the decompressed
// payload exceeding `payload_max_length_`.
// The user handler runs after the lambda returns, so we must restore the
// 413 status if the handler overwrites it.
bool content_reader_payload_too_large = false;
ContentReader reader(
[&](ContentReceiver receiver) {
auto result = read_content_with_content_receiver(
strm, req, res, std::move(receiver), nullptr, nullptr);
if (!result) { output_error_log(Error::Read, &req); }
if (!result) {
output_error_log(Error::Read, &req);
if (res.status == StatusCode::PayloadTooLarge_413) {
content_reader_payload_too_large = true;
}
}
return result;
},
[&](FormDataHeader header, ContentReceiver receiver) {
auto result = read_content_with_content_receiver(
strm, req, res, nullptr, std::move(header),
std::move(receiver));
if (!result) { output_error_log(Error::Read, &req); }
if (!result) {
output_error_log(Error::Read, &req);
if (res.status == StatusCode::PayloadTooLarge_413) {
content_reader_payload_too_large = true;
}
}
return result;
});
bool dispatched = false;
if (req.method == "POST") {
if (dispatch_request_for_content_reader(
req, res, std::move(reader),
post_handlers_for_content_reader_)) {
return true;
}
dispatched = dispatch_request_for_content_reader(
req, res, std::move(reader), post_handlers_for_content_reader_);
} else if (req.method == "PUT") {
if (dispatch_request_for_content_reader(
req, res, std::move(reader),
put_handlers_for_content_reader_)) {
return true;
}
dispatched = dispatch_request_for_content_reader(
req, res, std::move(reader), put_handlers_for_content_reader_);
} else if (req.method == "PATCH") {
if (dispatch_request_for_content_reader(
req, res, std::move(reader),
patch_handlers_for_content_reader_)) {
return true;
}
dispatched = dispatch_request_for_content_reader(
req, res, std::move(reader), patch_handlers_for_content_reader_);
} else if (req.method == "DELETE") {
if (dispatch_request_for_content_reader(
req, res, std::move(reader),
delete_handlers_for_content_reader_)) {
return true;
dispatched = dispatch_request_for_content_reader(
req, res, std::move(reader), delete_handlers_for_content_reader_);
}
if (dispatched) {
if (content_reader_payload_too_large) {
// Enforce the limit: override any status the handler may have set
// and return false so the error path sends a plain 413 response.
res.status = StatusCode::PayloadTooLarge_413;
res.body.clear();
res.content_length_ = 0;
res.content_provider_ = nullptr;
return false;
}
return true;
}
}
@@ -7930,16 +7994,6 @@ Server::process_request(Stream &strm, const std::string &remote_addr,
routed = true;
} else {
res.status = StatusCode::InternalServerError_500;
std::string val;
auto s = e.what();
for (size_t i = 0; s[i]; i++) {
switch (s[i]) {
case '\r': val += "\\r"; break;
case '\n': val += "\\n"; break;
default: val += s[i]; break;
}
}
res.set_header("EXCEPTION_WHAT", val);
}
} catch (...) {
if (exception_handler_) {
@@ -7948,7 +8002,6 @@ Server::process_request(Stream &strm, const std::string &remote_addr,
routed = true;
} else {
res.status = StatusCode::InternalServerError_500;
res.set_header("EXCEPTION_WHAT", "UNKNOWN");
}
}
#endif
@@ -11629,8 +11682,7 @@ void SSLClient::set_session_verifier(
session_verifier_ = std::move(verifier);
}
#if defined(_WIN32) && \
!defined(CPPHTTPLIB_DISABLE_WINDOWS_AUTOMATIC_ROOT_CERTIFICATES_UPDATE)
#ifdef CPPHTTPLIB_WINDOWS_AUTOMATIC_ROOT_CERTIFICATES_UPDATE
void SSLClient::enable_windows_certificate_verification(bool enabled) {
enable_windows_cert_verification_ = enabled;
}
@@ -11788,8 +11840,7 @@ bool SSLClient::initialize_ssl(Socket &socket, Error &error) {
}
}
#if defined(_WIN32) && \
!defined(CPPHTTPLIB_DISABLE_WINDOWS_AUTOMATIC_ROOT_CERTIFICATES_UPDATE)
#ifdef CPPHTTPLIB_WINDOWS_AUTOMATIC_ROOT_CERTIFICATES_UPDATE
// Additional Windows Schannel verification.
// This provides real-time certificate validation with Windows Update
// integration, working with both OpenSSL and MbedTLS backends.
@@ -11835,8 +11886,7 @@ void Client::enable_server_hostname_verification(bool enabled) {
cli_->enable_server_hostname_verification(enabled);
}
#if defined(_WIN32) && \
!defined(CPPHTTPLIB_DISABLE_WINDOWS_AUTOMATIC_ROOT_CERTIFICATES_UPDATE)
#ifdef CPPHTTPLIB_WINDOWS_AUTOMATIC_ROOT_CERTIFICATES_UPDATE
void Client::enable_windows_certificate_verification(bool enabled) {
if (is_ssl_) {
static_cast<SSLClient &>(*cli_).enable_windows_certificate_verification(
@@ -11959,7 +12009,7 @@ bool enumerate_windows_system_certs(Callback cb) {
}
#endif
#if defined(__APPLE__) && defined(CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN)
#ifdef CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN
// Enumerate macOS Keychain certificates and call callback with DER data
template <typename Callback>
bool enumerate_macos_keychain_certs(Callback cb) {

View File

@@ -8,8 +8,8 @@
#ifndef CPPHTTPLIB_HTTPLIB_H
#define CPPHTTPLIB_HTTPLIB_H
#define CPPHTTPLIB_VERSION "0.34.0"
#define CPPHTTPLIB_VERSION_NUM "0x002200"
#define CPPHTTPLIB_VERSION "0.35.0"
#define CPPHTTPLIB_VERSION_NUM "0x002300"
/*
* Platform compatibility check
@@ -357,14 +357,32 @@ using socket_t = int;
#include <any>
#endif
// On macOS with a TLS backend, enable Keychain root certificates by default
// unless the user explicitly opts out.
#if defined(__APPLE__) && \
!defined(CPPHTTPLIB_DISABLE_MACOSX_AUTOMATIC_ROOT_CERTIFICATES) && \
(defined(CPPHTTPLIB_OPENSSL_SUPPORT) || \
defined(CPPHTTPLIB_MBEDTLS_SUPPORT) || \
defined(CPPHTTPLIB_WOLFSSL_SUPPORT))
#ifndef CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN
#define CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN
#endif
#endif
// On Windows, enable Schannel certificate verification by default
// unless the user explicitly opts out.
#if defined(_WIN32) && \
!defined(CPPHTTPLIB_DISABLE_WINDOWS_AUTOMATIC_ROOT_CERTIFICATES_UPDATE)
#define CPPHTTPLIB_WINDOWS_AUTOMATIC_ROOT_CERTIFICATES_UPDATE
#endif
#if defined(CPPHTTPLIB_USE_NON_BLOCKING_GETADDRINFO) || \
defined(CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN)
#if TARGET_OS_MAC
#include <CFNetwork/CFHost.h>
#include <CoreFoundation/CoreFoundation.h>
#endif
#endif // CPPHTTPLIB_USE_NON_BLOCKING_GETADDRINFO or
// CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN
#endif
#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
#ifdef _WIN32
@@ -382,11 +400,11 @@ using socket_t = int;
#endif
#endif // _WIN32
#if defined(CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN)
#ifdef CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN
#if TARGET_OS_MAC
#include <Security/Security.h>
#endif
#endif // CPPHTTPLIB_USE_NON_BLOCKING_GETADDRINFO
#endif
#include <openssl/err.h>
#include <openssl/evp.h>
@@ -430,11 +448,11 @@ using socket_t = int;
#pragma comment(lib, "crypt32.lib")
#endif
#endif // _WIN32
#if defined(CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN)
#ifdef CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN
#if TARGET_OS_MAC
#include <Security/Security.h>
#endif
#endif // CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN
#endif
// Mbed TLS 3.x API compatibility
#if MBEDTLS_VERSION_MAJOR >= 3
@@ -473,11 +491,11 @@ using socket_t = int;
#pragma comment(lib, "crypt32.lib")
#endif
#endif // _WIN32
#if defined(CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN)
#ifdef CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN
#if TARGET_OS_MAC
#include <Security/Security.h>
#endif
#endif // CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN
#endif
#endif // CPPHTTPLIB_WOLFSSL_SUPPORT
// Define CPPHTTPLIB_SSL_ENABLED if any SSL backend is available
@@ -2557,8 +2575,7 @@ public:
tls::ctx_t tls_context() const;
#if defined(_WIN32) && \
!defined(CPPHTTPLIB_DISABLE_WINDOWS_AUTOMATIC_ROOT_CERTIFICATES_UPDATE)
#ifdef CPPHTTPLIB_WINDOWS_AUTOMATIC_ROOT_CERTIFICATES_UPDATE
void enable_windows_certificate_verification(bool enabled);
#endif
@@ -2679,8 +2696,7 @@ public:
tls::ctx_t tls_context() const { return ctx_; }
#if defined(_WIN32) && \
!defined(CPPHTTPLIB_DISABLE_WINDOWS_AUTOMATIC_ROOT_CERTIFICATES_UPDATE)
#ifdef CPPHTTPLIB_WINDOWS_AUTOMATIC_ROOT_CERTIFICATES_UPDATE
void enable_windows_certificate_verification(bool enabled);
#endif
@@ -2712,8 +2728,7 @@ private:
std::function<SSLVerifierResponse(tls::session_t)> session_verifier_;
#if defined(_WIN32) && \
!defined(CPPHTTPLIB_DISABLE_WINDOWS_AUTOMATIC_ROOT_CERTIFICATES_UPDATE)
#ifdef CPPHTTPLIB_WINDOWS_AUTOMATIC_ROOT_CERTIFICATES_UPDATE
bool enable_windows_cert_verification_ = true;
#endif

File diff suppressed because it is too large Load Diff