mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-04-30 16:47:31 +03:00
Compare commits
4 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5866e3bbc8 | ||
|
|
0516e04bf9 | ||
|
|
3d9ab225e7 | ||
|
|
d63aa398de |
@@ -11,6 +11,10 @@ endif()
|
||||
list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH})
|
||||
list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH}/lib64/cmake")
|
||||
|
||||
if (NOT DEFINED CMAKE_HIP_FLAGS_DEBUG)
|
||||
set(CMAKE_HIP_FLAGS_DEBUG "-g -O2")
|
||||
endif()
|
||||
|
||||
# CMake on Windows doesn't support the HIP language yet
|
||||
if (WIN32)
|
||||
set(CXX_IS_HIPCC TRUE)
|
||||
|
||||
@@ -132,6 +132,7 @@ set(GGML_OPENCL_KERNELS
|
||||
ssm_conv
|
||||
sub
|
||||
sum_rows
|
||||
cumsum
|
||||
transpose
|
||||
concat
|
||||
tsembd
|
||||
|
||||
@@ -547,6 +547,7 @@ struct ggml_backend_opencl_context {
|
||||
cl_kernel kernel_im2col_f32, kernel_im2col_f16;
|
||||
cl_kernel kernel_argsort_f32_i32;
|
||||
cl_kernel kernel_sum_rows_f32, kernel_sum_rows_f32_4;
|
||||
cl_kernel kernel_cumsum_blk, kernel_cumsum_add;
|
||||
cl_kernel kernel_repeat_f32;
|
||||
cl_kernel kernel_pad;
|
||||
cl_kernel kernel_tanh_f32, kernel_tanh_f32_4, kernel_tanh_f32_nc;
|
||||
@@ -1927,6 +1928,24 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// cumsum
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "cumsum.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("cumsum.cl");
|
||||
#endif
|
||||
cl_program prog;
|
||||
prog = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_cumsum_blk = clCreateKernel(prog, "kernel_cumsum_blk", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_cumsum_add = clCreateKernel(prog, "kernel_cumsum_add", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
CL_CHECK(clReleaseProgram(prog));
|
||||
}
|
||||
|
||||
// sigmoid
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
@@ -3803,6 +3822,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
||||
return cols <= max_workgroup_size && op->src[0]->type == GGML_TYPE_F32;
|
||||
}
|
||||
case GGML_OP_SUM_ROWS:
|
||||
case GGML_OP_CUMSUM:
|
||||
return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]);
|
||||
case GGML_OP_MEAN:
|
||||
return op->src[0]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
@@ -5775,19 +5796,12 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
|
||||
GGML_ASSERT(dst);
|
||||
GGML_ASSERT(dst->extra);
|
||||
|
||||
const int ne00 = src0->ne[0];
|
||||
const cl_ulong nb01 = src0->nb[1];
|
||||
const cl_ulong nb02 = src0->nb[2];
|
||||
const cl_ulong nb03 = src0->nb[3];
|
||||
const int ne10 = src1->ne[0];
|
||||
const cl_ulong nb10 = src1->nb[0];
|
||||
const int ne11 = src1->ne[1];
|
||||
const int ne12 = src1->ne[2];
|
||||
const cl_ulong nb11 = src1->nb[1];
|
||||
const cl_ulong nb12 = src1->nb[2];
|
||||
const cl_ulong nb1 = dst->nb[1];
|
||||
const cl_ulong nb2 = dst->nb[2];
|
||||
const cl_ulong nb3 = dst->nb[3];
|
||||
GGML_TENSOR_LOCALS(int, ne0, src0, ne);
|
||||
GGML_TENSOR_LOCALS(cl_ulong, nb0, src0, nb);
|
||||
GGML_TENSOR_LOCALS(int, ne1, src1, ne);
|
||||
GGML_TENSOR_LOCALS(cl_ulong, nb1, src1, nb);
|
||||
GGML_TENSOR_LOCALS(int, ne, dst, ne);
|
||||
GGML_TENSOR_LOCALS(cl_ulong, nb, dst, nb);
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
|
||||
@@ -5833,8 +5847,14 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
|
||||
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb3));
|
||||
|
||||
size_t global_work_size[] = {(size_t)ne10*64, (size_t)ne11, (size_t)ne12};
|
||||
size_t local_work_size[] = {64, 1, 1};
|
||||
int max_workgroup_size = backend_ctx->get_kernel_workgroup_size(kernel);
|
||||
int nth = 1;
|
||||
while (nth < ne00 && 2*nth <= max_workgroup_size) {
|
||||
nth *= 2;
|
||||
}
|
||||
|
||||
size_t global_work_size[] = {(size_t)ne10*nth, (size_t)ne11, (size_t)ne12};
|
||||
size_t local_work_size[] = {(size_t)nth, 1, 1};
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
}
|
||||
@@ -11949,6 +11969,118 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
}
|
||||
|
||||
// Inclusive cumulative sum of src0 along dimension 0, written to dst (GGML_OP_CUMSUM).
//
// The work is done in up to three kernel launches:
//   1. kernel_cumsum_blk scans each block of `nth` consecutive elements of a row and
//      writes the per-block scan to dst. When a row spans more than one block, the
//      total of every block is also stored in tmp_buffer.
//   2. kernel_cumsum_blk is run again, this time over tmp_buffer, to scan the block totals.
//   3. kernel_cumsum_add adds the scanned total of the preceding blocks to every
//      element of dst that does not belong to the first block of its row.
//
// backend - OpenCL backend whose context holds the compiled kernels
// src0    - input tensor; asserted to be contiguous
// src1    - unused, present only to match the common op signature
// dst     - output tensor
static void ggml_cl_cumsum(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);
    GGML_UNUSED(src1);

    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
    GGML_ASSERT(ggml_is_contiguous(src0));

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    cl_ulong offset0 = extra0->offset + src0->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

    GGML_TENSOR_LOCALS(int,      ne0, src0, ne);
    GGML_TENSOR_LOCALS(cl_ulong, nb0, src0, nb);

    cl_kernel kernel = backend_ctx->kernel_cumsum_blk;

    // workgroup size: smallest power of two covering a row, capped by what the kernel supports
    int max_workgroup_size = backend_ctx->get_kernel_workgroup_size(kernel);
    int nth = 1;
    while (nth < ne00 && 2*nth <= max_workgroup_size) {
        nth *= 2;
    }

    // the second pass scans the block totals of a row with a single workgroup,
    // so a row may be split into at most nth blocks
    GGML_ASSERT(ne00 <= nth*nth);

    // shape of the temporary buffer: one float per block of every row
    const int net0 = CEIL_DIV(ne00, nth);
    const int net1 = ne01;
    const int net2 = ne02;
    const int net3 = ne03;

    // byte strides of the temporary buffer
    const cl_ulong nbt0 = sizeof(float);
    const cl_ulong nbt1 = net0*nbt0;
    const cl_ulong nbt2 = net1*nbt1;
    const cl_ulong nbt3 = net2*nbt2;

    // NOTE(review): function-local static, i.e. one buffer shared by every call of this
    // function - confirm this is safe when several backend instances/contexts are in use
    static ggml_cl_buffer tmp_buffer;
    // widen before multiplying so the element count cannot overflow int
    tmp_buffer.allocate(backend_ctx->context, (size_t)net0*ne01*ne02*ne03*sizeof(float));

    // pass 1: per-block scan of src0 into dst, block totals into tmp_buffer
    CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
    CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &tmp_buffer.buffer));
    CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_mem),   &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_ulong), &offsetd));
    CL_CHECK(clSetKernelArg(kernel,  5, sizeof(int),      &ne00));
    CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne01));
    CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne02));
    CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne03));
    CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb00));
    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb01));
    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb02));
    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb03));
    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &net0));
    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &net1));
    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &net2));

    // one workgroup per (block, row) pair along dimension 0; widen before multiplying
    size_t global_work_size[] = { (size_t)nth*net0*ne01, (size_t)ne02, (size_t)ne03};
    size_t local_work_size[]  = { (size_t)nth, 1, 1};

    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);

    if (ne00 > nth) {
        // if a single workgroup cannot handle an entire row, each workgroup
        // computes a partial sum and stores it to dst; tmp_buffer contains the sum
        // of each workgroup; cumsum this buffer and add it to the partial sums in dst
        cl_ulong offsett = 0;

        // pass 2: scan the block totals in place (tmp_buffer is both input and output)
        kernel = backend_ctx->kernel_cumsum_blk;
        CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &tmp_buffer.buffer));
        CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offsett));
        CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &tmp_buffer.buffer));
        CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_mem),   &tmp_buffer.buffer));
        CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_ulong), &offsett));
        CL_CHECK(clSetKernelArg(kernel,  5, sizeof(int),      &net0));
        CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne01));
        CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne02));
        CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne03));
        CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nbt0));
        CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nbt1));
        CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nbt2));
        CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nbt3));
        CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &net0));
        CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &net1));
        CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &net2));

        size_t global_work_size_1[] = { (size_t)net1*nth, (size_t)net2, (size_t)net3};
        size_t local_work_size_1[]  = { (size_t)nth, 1, 1};
        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size_1, local_work_size_1, dst);

        // kernel_cumsum_add declares the temporary-buffer strides as 32-bit uint, so pass
        // 32-bit values instead of the first four bytes of the 64-bit host variables
        const cl_uint nbt0_u = (cl_uint)nbt0;
        const cl_uint nbt1_u = (cl_uint)nbt1;
        const cl_uint nbt2_u = (cl_uint)nbt2;
        const cl_uint nbt3_u = (cl_uint)nbt3;

        // pass 3: add the scanned block totals to the per-block scans in dst
        kernel = backend_ctx->kernel_cumsum_add;
        CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &tmp_buffer.buffer));
        CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_mem),   &extrad->data_device));
        CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_ulong), &offsetd));
        CL_CHECK(clSetKernelArg(kernel,  3, sizeof(int),      &ne00));
        CL_CHECK(clSetKernelArg(kernel,  4, sizeof(int),      &ne01));
        CL_CHECK(clSetKernelArg(kernel,  5, sizeof(int),      &ne02));
        CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne03));
        CL_CHECK(clSetKernelArg(kernel,  7, sizeof(cl_uint),  &nbt0_u));
        CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_uint),  &nbt1_u));
        CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_uint),  &nbt2_u));
        CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_uint),  &nbt3_u));

        size_t global_work_size_2[] = { (size_t)nth*net0*ne01, (size_t)ne02, (size_t)ne03};
        size_t local_work_size_2[]  = { (size_t)nth, 1, 1};
        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size_2, local_work_size_2, dst);
    }
}
|
||||
|
||||
static void ggml_cl_glu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
@@ -12391,6 +12523,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
|
||||
}
|
||||
func = ggml_cl_sum_rows;
|
||||
break;
|
||||
case GGML_OP_CUMSUM:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
}
|
||||
func = ggml_cl_cumsum;
|
||||
break;
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
|
||||
139
ggml/src/ggml-opencl/kernels/cumsum.cl
Normal file
139
ggml/src/ggml-opencl/kernels/cumsum.cl
Normal file
@@ -0,0 +1,139 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
#ifdef cl_intel_required_subgroup_size
|
||||
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
|
||||
#define INTEL_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
|
||||
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
|
||||
#elif defined(cl_qcom_reqd_sub_group_size)
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
#define ADRENO_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
||||
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
||||
#endif
|
||||
|
||||
// max workgroup size is usually 1024; this covers various subgroup sizes
|
||||
#define MAX_SUBGROUPS 128
|
||||
|
||||
#ifdef INTEL_GPU
|
||||
REQD_SUBGROUP_SIZE_32
|
||||
#elif defined (ADRENO_GPU)
|
||||
REQD_SUBGROUP_SIZE_64
|
||||
#endif
|
||||
kernel void kernel_cumsum_blk(
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global char * tmp,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne03,
|
||||
ulong nb00,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
uint net0,
|
||||
uint net1,
|
||||
uint net2
|
||||
) {
|
||||
src0 = src0 + offset0;
|
||||
dst = dst + offsetd;
|
||||
|
||||
const int i3 = get_group_id(2);
|
||||
const int i2 = get_group_id(1);
|
||||
const int i1 = get_group_id(0);
|
||||
|
||||
const int nth = get_local_size(0);
|
||||
const int tid = get_local_id(0);
|
||||
|
||||
const uint sg_size = get_sub_group_size();
|
||||
const uint sg_id = get_sub_group_id();
|
||||
const uint sg_lid = get_sub_group_local_id();
|
||||
|
||||
const int ib = i1 / ne01;
|
||||
const int i00 = ib * nth;
|
||||
const int i01 = i1 % ne01;
|
||||
const int i02 = i2;
|
||||
const int i03 = i3;
|
||||
|
||||
global const float * src0_row = (global const float *)(src0 + i03*nb03 + i02*nb02 + i01*nb01);
|
||||
global float * tmp_row = (global float *)tmp + net0 * i01 + net0 * net1 * i02 + net0 * net1 * net2 * i03;
|
||||
global float * dst_row = (global float *)dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
|
||||
|
||||
__local float partial[MAX_SUBGROUPS];
|
||||
|
||||
float v = 0.0f;
|
||||
if (i00 + tid < ne00) {
|
||||
v = src0_row[i00 + tid];
|
||||
}
|
||||
|
||||
float s = sub_group_scan_inclusive_add(v);
|
||||
if (sg_lid == sg_size - 1) {
|
||||
partial[sg_id] = s;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// NB: subgroup size should be larger than number of subgroups
|
||||
// assuming max workgroup size of 1024, subgroup size should be >= 32
|
||||
if (sg_id == 0) {
|
||||
float x = 0.0f;
|
||||
if (sg_lid < get_num_sub_groups()) {
|
||||
x = partial[sg_lid];
|
||||
}
|
||||
float ex = sub_group_scan_exclusive_add(x);
|
||||
if (sg_lid < get_num_sub_groups()) {
|
||||
partial[sg_lid] = ex;
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
s += partial[sg_id];
|
||||
|
||||
if (i00 + tid < ne00) {
|
||||
dst_row[i00 + tid] = s;
|
||||
}
|
||||
if (ne00 > nth && tid == nth - 1) {
|
||||
tmp_row[ib] = s;
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_cumsum_add(
|
||||
global char * tmp,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne03,
|
||||
uint nbt0,
|
||||
uint nbt1,
|
||||
uint nbt2,
|
||||
uint nbt3
|
||||
) {
|
||||
dst = dst + offsetd;
|
||||
|
||||
const int i3 = get_group_id(2);
|
||||
const int i2 = get_group_id(1);
|
||||
const int i1 = get_group_id(0);
|
||||
|
||||
const int nth = get_local_size(0);
|
||||
const int tid = get_local_id(0);
|
||||
|
||||
const int ib = i1 / ne01;
|
||||
if (ib == 0) {
|
||||
return;
|
||||
}
|
||||
const int i00 = ib * nth;
|
||||
const int i01 = i1 % ne01;
|
||||
const int i02 = i2;
|
||||
const int i03 = i3;
|
||||
|
||||
global float * tmp_row = (global float *)(tmp + nbt1 * i01 + nbt2 * i02 + nbt3 * i03);
|
||||
global float * dst_row = (global float *)dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
|
||||
|
||||
if (i00 + tid < ne00) {
|
||||
dst_row[i00 + tid] += tmp_row[ib - 1];
|
||||
}
|
||||
}
|
||||
@@ -27,6 +27,7 @@ DispatchLoaderDynamic & ggml_vk_default_dispatcher();
|
||||
#include <iostream>
|
||||
#include <tuple>
|
||||
#include <vector>
|
||||
#include <deque>
|
||||
#include <sstream>
|
||||
#include <utility>
|
||||
#include <memory>
|
||||
@@ -188,6 +189,11 @@ struct ggml_backend_vk_buffer_type_context {
|
||||
|
||||
struct vk_queue;
|
||||
|
||||
struct vk_command_buffer {
|
||||
vk::CommandBuffer buf;
|
||||
bool in_use = false;
|
||||
};
|
||||
|
||||
// Stores command pool/buffers. There's an instance of this
|
||||
// for each (context,queue) pair and for each (device,queue) pair.
|
||||
struct vk_command_pool {
|
||||
@@ -195,10 +201,16 @@ struct vk_command_pool {
|
||||
void destroy(vk::Device& device);
|
||||
|
||||
vk::CommandPool pool;
|
||||
uint32_t cmd_buffer_idx;
|
||||
std::vector<vk::CommandBuffer> cmd_buffers;
|
||||
// Using deque so the pointers to command buffers
|
||||
// remain valid even if we add more
|
||||
std::deque<vk_command_buffer> cmd_buffers;
|
||||
|
||||
vk_queue *q;
|
||||
|
||||
size_t buffers_in_use() const {
|
||||
return std::count_if(cmd_buffers.begin(), cmd_buffers.end(),
|
||||
[](const auto& cb) { return cb.in_use; });
|
||||
}
|
||||
};
|
||||
|
||||
// Prevent simultaneous submissions to the same queue.
|
||||
@@ -878,10 +890,12 @@ struct vk_device_struct {
|
||||
};
|
||||
|
||||
void vk_command_pool::init(vk_device& device, vk_queue *q_) {
|
||||
cmd_buffer_idx = 0;
|
||||
cmd_buffers.clear();
|
||||
q = q_;
|
||||
|
||||
vk::CommandPoolCreateInfo command_pool_create_info(vk::CommandPoolCreateFlags(VK_COMMAND_POOL_CREATE_TRANSIENT_BIT), q->queue_family_index);
|
||||
vk::CommandPoolCreateInfo command_pool_create_info(
|
||||
vk::CommandPoolCreateFlags(VK_COMMAND_POOL_CREATE_TRANSIENT_BIT | VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT),
|
||||
q->queue_family_index);
|
||||
pool = device->device.createCommandPool(command_pool_create_info);
|
||||
}
|
||||
|
||||
@@ -929,6 +943,7 @@ struct vk_subbuffer {
|
||||
struct vk_event {
|
||||
vk::Event event;
|
||||
vk::Fence fence;
|
||||
vk_command_buffer* cmd_buffer = nullptr;
|
||||
};
|
||||
|
||||
struct vk_semaphore {
|
||||
@@ -937,7 +952,7 @@ struct vk_semaphore {
|
||||
};
|
||||
|
||||
struct vk_submission {
|
||||
vk::CommandBuffer buffer;
|
||||
vk_command_buffer* buffer = nullptr;
|
||||
std::vector<vk_semaphore> wait_semaphores;
|
||||
std::vector<vk_semaphore> signal_semaphores;
|
||||
};
|
||||
@@ -2283,25 +2298,15 @@ static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx
|
||||
}
|
||||
}
|
||||
|
||||
static vk::CommandBuffer ggml_vk_create_cmd_buffer(vk_device& device, vk_command_pool& p) {
|
||||
static vk_command_buffer* ggml_vk_create_cmd_buffer(vk_device& device, vk_command_pool& p) {
|
||||
VK_LOG_DEBUG("ggml_vk_create_cmd_buffer()");
|
||||
|
||||
if (p.cmd_buffers.size() > p.cmd_buffer_idx) {
|
||||
// Reuse command buffer
|
||||
return p.cmd_buffers[p.cmd_buffer_idx++];
|
||||
}
|
||||
|
||||
vk::CommandBufferAllocateInfo command_buffer_alloc_info(
|
||||
p.pool,
|
||||
vk::CommandBufferLevel::ePrimary,
|
||||
1);
|
||||
const std::vector<vk::CommandBuffer> cmd_buffers = device->device.allocateCommandBuffers(command_buffer_alloc_info);
|
||||
auto buf = cmd_buffers.front();
|
||||
|
||||
p.cmd_buffers.push_back(buf);
|
||||
p.cmd_buffer_idx++;
|
||||
|
||||
return buf;
|
||||
p.cmd_buffers.push_back({ cmd_buffers.front(), true });
|
||||
return &p.cmd_buffers[p.cmd_buffers.size()-1];
|
||||
}
|
||||
|
||||
static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) {
|
||||
@@ -2368,7 +2373,7 @@ static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) {
|
||||
tl_wait_semaphores[idx].data(),
|
||||
stage_flags[idx].data(),
|
||||
1,
|
||||
&submission.buffer,
|
||||
&submission.buffer->buf,
|
||||
(uint32_t) submission.signal_semaphores.size(),
|
||||
tl_signal_semaphores[idx].data(),
|
||||
};
|
||||
@@ -2492,7 +2497,11 @@ static void ggml_vk_command_pool_cleanup(vk_device& device, vk_command_pool& p)
|
||||
|
||||
// Requires command buffers to be done
|
||||
device->device.resetCommandPool(p.pool);
|
||||
p.cmd_buffer_idx = 0;
|
||||
// Don't clear the command buffers and mark them as not in use.
|
||||
// This allows us to reuse them
|
||||
for (auto& cmd_buffer : p.cmd_buffers) {
|
||||
cmd_buffer.in_use = false;
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_vk_queue_command_pools_cleanup(vk_device& device) {
|
||||
@@ -2501,10 +2510,10 @@ static void ggml_vk_queue_command_pools_cleanup(vk_device& device) {
|
||||
// Arbitrary frequency to cleanup/reuse command buffers
|
||||
static constexpr uint32_t cleanup_frequency = 10;
|
||||
|
||||
if (device->compute_queue.cmd_pool.cmd_buffer_idx >= cleanup_frequency) {
|
||||
if (device->compute_queue.cmd_pool.buffers_in_use() >= cleanup_frequency) {
|
||||
ggml_vk_command_pool_cleanup(device, device->compute_queue.cmd_pool);
|
||||
}
|
||||
if (device->transfer_queue.cmd_pool.cmd_buffer_idx >= cleanup_frequency) {
|
||||
if (device->transfer_queue.cmd_pool.buffers_in_use() >= cleanup_frequency) {
|
||||
ggml_vk_command_pool_cleanup(device, device->transfer_queue.cmd_pool);
|
||||
}
|
||||
}
|
||||
@@ -2752,7 +2761,7 @@ static void ggml_vk_sync_buffers(ggml_backend_vk_context* ctx, vk_context& subct
|
||||
ctx->prealloc_x_need_sync = ctx->prealloc_y_need_sync = ctx->prealloc_split_k_need_sync = false;
|
||||
}
|
||||
|
||||
subctx->s->buffer.pipelineBarrier(
|
||||
subctx->s->buffer->buf.pipelineBarrier(
|
||||
subctx->p->q->stage_flags,
|
||||
subctx->p->q->stage_flags,
|
||||
{},
|
||||
@@ -2768,7 +2777,7 @@ static void ggml_vk_sync_buffers(ggml_backend_vk_context* ctx, vk_context& subct
|
||||
static void ggml_vk_set_event(vk_context& ctx, vk::Event& event) {
|
||||
VK_LOG_DEBUG("ggml_vk_set_event()");
|
||||
|
||||
ctx->s->buffer.setEvent(
|
||||
ctx->s->buffer->buf.setEvent(
|
||||
event,
|
||||
ctx->p->q->stage_flags
|
||||
);
|
||||
@@ -2780,7 +2789,7 @@ static void ggml_vk_wait_events(vk_context& ctx, std::vector<vk::Event>&& events
|
||||
return;
|
||||
}
|
||||
|
||||
ctx->s->buffer.waitEvents(
|
||||
ctx->s->buffer->buf.waitEvents(
|
||||
events,
|
||||
ctx->p->q->stage_flags,
|
||||
ctx->p->q->stage_flags,
|
||||
@@ -6348,13 +6357,24 @@ static vk_subbuffer ggml_vk_tensor_subbuffer(
|
||||
return vk_subbuffer{buffer, offset, size};
|
||||
}
|
||||
|
||||
// Get a command buffer from pool. Create a new one if no reusable buffer is available
|
||||
static vk_command_buffer* ggml_vk_get_or_create_cmd_buffer(vk_device& device, vk_command_pool& pool) {
|
||||
for (auto& cmd_buffer : pool.cmd_buffers) {
|
||||
if (!cmd_buffer.in_use) {
|
||||
cmd_buffer.in_use = true;
|
||||
return &cmd_buffer;
|
||||
}
|
||||
}
|
||||
return ggml_vk_create_cmd_buffer(device, pool);
|
||||
}
|
||||
|
||||
static vk_submission ggml_vk_begin_submission(vk_device& device, vk_command_pool& p, bool one_time = true) {
|
||||
vk_submission s;
|
||||
s.buffer = ggml_vk_create_cmd_buffer(device, p);
|
||||
s.buffer = ggml_vk_get_or_create_cmd_buffer(device, p);
|
||||
if (one_time) {
|
||||
s.buffer.begin({ vk::CommandBufferUsageFlagBits::eOneTimeSubmit });
|
||||
s.buffer->buf.begin({ vk::CommandBufferUsageFlagBits::eOneTimeSubmit });
|
||||
} else {
|
||||
s.buffer.begin({ vk::CommandBufferUsageFlags{} });
|
||||
s.buffer->buf.begin({ vk::CommandBufferUsageFlags{} });
|
||||
}
|
||||
|
||||
return s;
|
||||
@@ -6407,18 +6427,18 @@ static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context&
|
||||
vk::WriteDescriptorSet write_descriptor_set{ descriptor_set, 0, 0, pipeline->parameter_count, vk::DescriptorType::eStorageBuffer, nullptr, descriptor_buffer_infos.begin() };
|
||||
ctx->device->device.updateDescriptorSets({ write_descriptor_set }, {});
|
||||
|
||||
subctx->s->buffer.pushConstants(pipeline->layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size(push_constants), push_constant_data(push_constants));
|
||||
subctx->s->buffer.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->pipeline);
|
||||
subctx->s->buffer.bindDescriptorSets(vk::PipelineBindPoint::eCompute,
|
||||
subctx->s->buffer->buf.pushConstants(pipeline->layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size(push_constants), push_constant_data(push_constants));
|
||||
subctx->s->buffer->buf.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->pipeline);
|
||||
subctx->s->buffer->buf.bindDescriptorSets(vk::PipelineBindPoint::eCompute,
|
||||
pipeline->layout,
|
||||
0,
|
||||
{ descriptor_set },
|
||||
{});
|
||||
subctx->s->buffer.dispatch(wg0, wg1, wg2);
|
||||
subctx->s->buffer->buf.dispatch(wg0, wg1, wg2);
|
||||
}
|
||||
|
||||
static void ggml_vk_end_submission(vk_submission& s, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores) {
|
||||
s.buffer.end();
|
||||
s.buffer->buf.end();
|
||||
|
||||
s.wait_semaphores = std::move(wait_semaphores);
|
||||
s.signal_semaphores = std::move(signal_semaphores);
|
||||
@@ -6430,7 +6450,7 @@ static void ggml_vk_ctx_end(vk_context& ctx) {
|
||||
return;
|
||||
}
|
||||
|
||||
ctx->s->buffer.end();
|
||||
ctx->s->buffer->buf.end();
|
||||
ctx->s = nullptr;
|
||||
}
|
||||
|
||||
@@ -6584,7 +6604,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
|
||||
}
|
||||
|
||||
ggml_vk_sync_buffers(ctx, subctx);
|
||||
subctx->s->buffer.copyBuffer(buf->buffer, dst->buffer, slices);
|
||||
subctx->s->buffer->buf.copyBuffer(buf->buffer, dst->buffer, slices);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -6599,7 +6619,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
|
||||
VkBufferCopy buf_copy{ 0, offset, copy_size };
|
||||
|
||||
ggml_vk_sync_buffers(ctx, subctx);
|
||||
vkCmdCopyBuffer(subctx->s->buffer, (VkBuffer)staging->buffer, (VkBuffer)dst->buffer, 1, &buf_copy);
|
||||
vkCmdCopyBuffer(subctx->s->buffer->buf, (VkBuffer)staging->buffer, (VkBuffer)dst->buffer, 1, &buf_copy);
|
||||
|
||||
for (uint64_t i3 = 0; i3 < ne3; i3++) {
|
||||
for (uint64_t i2 = 0; i2 < ne2; i2++) {
|
||||
@@ -6648,7 +6668,7 @@ static bool ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, siz
|
||||
}
|
||||
|
||||
ggml_vk_sync_buffers(nullptr, subctx);
|
||||
subctx->s->buffer.copyBuffer(buf->buffer, dst->buffer, slices);
|
||||
subctx->s->buffer->buf.copyBuffer(buf->buffer, dst->buffer, slices);
|
||||
return true;
|
||||
}
|
||||
VK_LOG_DEBUG("STAGING");
|
||||
@@ -6670,7 +6690,7 @@ static bool ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, siz
|
||||
copy_size};
|
||||
|
||||
ggml_vk_sync_buffers(nullptr, subctx);
|
||||
vkCmdCopyBuffer(subctx->s->buffer, (VkBuffer)staging_buffer->buffer, (VkBuffer)dst->buffer, 1, &buf_copy);
|
||||
vkCmdCopyBuffer(subctx->s->buffer->buf, (VkBuffer)staging_buffer->buffer, (VkBuffer)dst->buffer, 1, &buf_copy);
|
||||
|
||||
if (width == spitch) {
|
||||
deferred_memcpy((uint8_t *)staging_buffer->ptr, src, width * height, &subctx->in_memcpys);
|
||||
@@ -6756,7 +6776,7 @@ static bool ggml_vk_buffer_read_2d_async(vk_context subctx, vk_buffer& src, size
|
||||
if (buf != nullptr) {
|
||||
// Memory is pinned, use as staging buffer
|
||||
ggml_vk_sync_buffers(nullptr, subctx);
|
||||
subctx->s->buffer.copyBuffer(src->buffer, buf->buffer, slices);
|
||||
subctx->s->buffer->buf.copyBuffer(src->buffer, buf->buffer, slices);
|
||||
|
||||
return true;
|
||||
}
|
||||
@@ -6774,7 +6794,7 @@ static bool ggml_vk_buffer_read_2d_async(vk_context subctx, vk_buffer& src, size
|
||||
vk_buffer& staging_buffer = src->device->sync_staging;
|
||||
|
||||
ggml_vk_sync_buffers(nullptr, subctx);
|
||||
subctx->s->buffer.copyBuffer(src->buffer, staging_buffer->buffer, slices);
|
||||
subctx->s->buffer->buf.copyBuffer(src->buffer, staging_buffer->buffer, slices);
|
||||
|
||||
deferred_memcpy(dst, staging_buffer->ptr, copy_size, &subctx->out_memcpys);
|
||||
return true;
|
||||
@@ -6821,7 +6841,7 @@ static void ggml_vk_buffer_copy_async(vk_context& ctx, vk_buffer& dst, size_t ds
|
||||
|
||||
VkBufferCopy bc{ src_offset, dst_offset, size };
|
||||
|
||||
vkCmdCopyBuffer(ctx->s->buffer, (VkBuffer)src->buffer, (VkBuffer)dst->buffer, 1, &bc);
|
||||
vkCmdCopyBuffer(ctx->s->buffer->buf, (VkBuffer)src->buffer, (VkBuffer)dst->buffer, 1, &bc);
|
||||
}
|
||||
|
||||
static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
|
||||
@@ -6859,7 +6879,7 @@ static void ggml_vk_buffer_memset_async(vk_context& ctx, vk_buffer& dst, size_t
|
||||
}
|
||||
|
||||
// Fall back to GPU fillBuffer for non-UMA or non-host-visible buffers
|
||||
ctx->s->buffer.fillBuffer(dst->buffer, offset, size, c);
|
||||
ctx->s->buffer->buf.fillBuffer(dst->buffer, offset, size, c);
|
||||
}
|
||||
|
||||
static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, size_t size) {
|
||||
@@ -6874,7 +6894,7 @@ static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, siz
|
||||
std::lock_guard<std::recursive_mutex> guard(dst->device->mutex);
|
||||
vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue.cmd_pool);
|
||||
ggml_vk_ctx_begin(dst->device, subctx);
|
||||
subctx->s->buffer.fillBuffer(dst->buffer, offset, size, c);
|
||||
subctx->s->buffer->buf.fillBuffer(dst->buffer, offset, size, c);
|
||||
ggml_vk_ctx_end(subctx);
|
||||
|
||||
ggml_vk_submit(subctx, dst->device->fence);
|
||||
@@ -12682,7 +12702,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
|
||||
|
||||
if (vk_perf_logger_enabled && vk_perf_logger_concurrent) {
|
||||
ctx->query_node_idx[ctx->query_idx] = node_idx;
|
||||
compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
|
||||
compute_ctx->s->buffer->buf.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
|
||||
}
|
||||
}
|
||||
// Add all fused nodes to the unsynchronized lists.
|
||||
@@ -13521,7 +13541,7 @@ static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor
|
||||
buffer_cpy.dstOffset = dst_offset;
|
||||
buffer_cpy.size = size;
|
||||
|
||||
cpy_ctx->s->buffer.copyBuffer(ctx->sync_staging->buffer, buf->buffer, { buffer_cpy });
|
||||
cpy_ctx->s->buffer->buf.copyBuffer(ctx->sync_staging->buffer, buf->buffer, { buffer_cpy });
|
||||
deferred_memcpy(ctx->sync_staging->ptr, data, size, &cpy_ctx->in_memcpys);
|
||||
ggml_vk_synchronize(ctx);
|
||||
}
|
||||
@@ -13555,7 +13575,7 @@ static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_
|
||||
buffer_cpy.dstOffset = 0;
|
||||
buffer_cpy.size = size;
|
||||
|
||||
compute_ctx->s->buffer.copyBuffer(buf->buffer, ctx->sync_staging->buffer, { buffer_cpy });
|
||||
compute_ctx->s->buffer->buf.copyBuffer(buf->buffer, ctx->sync_staging->buffer, { buffer_cpy });
|
||||
deferred_memcpy(data, ctx->sync_staging->ptr, size, &compute_ctx->out_memcpys);
|
||||
ggml_vk_synchronize(ctx);
|
||||
}
|
||||
@@ -13633,8 +13653,12 @@ static void ggml_vk_synchronize(ggml_backend_vk_context * ctx) {
|
||||
}
|
||||
|
||||
vk_context compute_ctx;
|
||||
vk_command_buffer* cmd_buf = nullptr;
|
||||
if (do_transfer) {
|
||||
compute_ctx = ctx->compute_ctx.lock();
|
||||
if (compute_ctx->s) {
|
||||
cmd_buf = compute_ctx->s->buffer;
|
||||
}
|
||||
|
||||
ggml_vk_ctx_end(compute_ctx);
|
||||
|
||||
@@ -13668,6 +13692,9 @@ static void ggml_vk_synchronize(ggml_backend_vk_context * ctx) {
|
||||
}
|
||||
ggml_vk_wait_for_fence(ctx);
|
||||
ctx->submit_pending = false;
|
||||
if (cmd_buf) {
|
||||
cmd_buf->in_use = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (do_transfer) {
|
||||
@@ -14157,7 +14184,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||
GGML_ASSERT(ctx->compute_ctx.expired());
|
||||
compute_ctx = ggml_vk_get_compute_ctx(ctx);
|
||||
ctx->query_idx = 0;
|
||||
compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
|
||||
compute_ctx->s->buffer->buf.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
|
||||
}
|
||||
|
||||
ctx->prealloc_y_last_pipeline_used = nullptr;
|
||||
@@ -14393,7 +14420,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||
// track a single node/fusion for the current query
|
||||
ctx->query_nodes[ctx->query_idx] = cgraph->nodes[i];
|
||||
ctx->query_fusion_names[ctx->query_idx] = fusion_string;
|
||||
compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
|
||||
compute_ctx->s->buffer->buf.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
|
||||
} else {
|
||||
// track a fusion string and number of fused ops for the current node_idx
|
||||
ctx->query_fusion_names[i] = fusion_string;
|
||||
@@ -14726,6 +14753,7 @@ static void ggml_backend_vk_event_record(ggml_backend_t backend, ggml_backend_ev
|
||||
ggml_vk_submit_transfer_ctx(ctx);
|
||||
|
||||
vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx);
|
||||
auto* cmd_buf = compute_ctx->s->buffer; // retrieve pointer before it gets reset
|
||||
|
||||
// the backend interface doesn't have an explicit reset, so reset it here
|
||||
// before we record the command to set it
|
||||
@@ -14738,6 +14766,7 @@ static void ggml_backend_vk_event_record(ggml_backend_t backend, ggml_backend_ev
|
||||
|
||||
ggml_vk_submit(compute_ctx, {vkev->fence});
|
||||
ctx->submit_pending = true;
|
||||
vkev->cmd_buffer = cmd_buf;
|
||||
ctx->compute_ctx.reset();
|
||||
}
|
||||
|
||||
@@ -15557,6 +15586,10 @@ static void ggml_backend_vk_device_event_synchronize(ggml_backend_dev_t dev, ggm
|
||||
vk_event *vkev = (vk_event *)event->context;
|
||||
|
||||
VK_CHECK(device->device.waitForFences({ vkev->fence }, true, UINT64_MAX), "event_synchronize");
|
||||
// Finished using current command buffer so we flag for reuse
|
||||
if (vkev->cmd_buffer) {
|
||||
vkev->cmd_buffer->in_use = false;
|
||||
}
|
||||
}
|
||||
|
||||
static vk_buffer ggml_vk_buffer_from_host_ptr(vk_device & device, void * ptr, size_t size) {
|
||||
|
||||
Reference in New Issue
Block a user