@@ -48,6 +48,8 @@ static struct ggml_backend_metal_device_context {
     int mtl_device_ref_count;
     id<MTLLibrary> mtl_library;
 
+    id<MTLCommandQueue> mtl_queue;
+
     NSLock * mtl_lock;
 
     bool has_simdgroup_reduction;
@@ -69,6 +71,7 @@ static struct ggml_backend_metal_device_context {
     /*.mtl_device              =*/ nil,
     /*.mtl_device_ref_count    =*/ 0,
     /*.mtl_library             =*/ nil,
+    /*.mtl_queue               =*/ nil,
     /*.mtl_lock                =*/ nil,
     /*.has_simdgroup_reduction =*/ false,
     /*.has_simdgroup_mm        =*/ false,
@@ -94,6 +97,8 @@ static id<MTLDevice> ggml_backend_metal_device_acq(struct ggml_backend_metal_dev
         ctx->mtl_device = MTLCreateSystemDefaultDevice();
 
         if (ctx->mtl_device) {
+            ctx->mtl_queue = [ctx->mtl_device newCommandQueue];
+
             ctx->has_simdgroup_reduction  = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7];
             ctx->has_simdgroup_reduction |= [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
@@ -161,6 +166,11 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
         ctx->mtl_library = nil;
     }
 
+    if (ctx->mtl_queue) {
+        [ctx->mtl_queue release];
+        ctx->mtl_queue = nil;
+    }
+
     if (ctx->mtl_device) {
         [ctx->mtl_device release];
         ctx->mtl_device = nil;
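The queue now lives on the device context and follows its ref-counted lifetime: created in `ggml_backend_metal_device_acq`, torn down in `ggml_backend_metal_device_rel`. A minimal sketch of that lifecycle (illustrative only; the real functions also manage the library, the lock and the feature flags):

```objc
#import <Metal/Metal.h>

// sketch: one MTLCommandQueue per device, shared by all backend instances
static id<MTLDevice>       g_device = nil;
static id<MTLCommandQueue> g_queue  = nil;
static int                 g_refs   = 0;

static id<MTLDevice> device_acq(void) {
    if (g_refs++ == 0) {
        g_device = MTLCreateSystemDefaultDevice();
        g_queue  = [g_device newCommandQueue]; // every command buffer below comes from this queue
    }
    return g_device;
}

static void device_rel(void) {
    if (--g_refs == 0) {
        [g_queue  release]; g_queue  = nil;   // MRC: ggml-metal.m is built without ARC
        [g_device release]; g_device = nil;
    }
}
```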
@@ -467,8 +477,6 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
     GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC,
     GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32,
-    GGML_METAL_KERNEL_TYPE_SET_I32,
-    GGML_METAL_KERNEL_TYPE_SET_F32,
     GGML_METAL_KERNEL_TYPE_CPY_F32_F32,
     GGML_METAL_KERNEL_TYPE_CPY_F32_F16,
     GGML_METAL_KERNEL_TYPE_CPY_F32_BF16,
@@ -803,6 +811,12 @@ struct ggml_backend_metal_context {
     // n_cb command buffers + 1 used by the main thread
     struct ggml_metal_command_buffer cmd_bufs[GGML_METAL_MAX_COMMAND_BUFFERS + 1];
 
+    // extra command buffers for things like getting, setting and copying tensors
+    NSMutableArray * cmd_bufs_ext;
+
+    id<MTLCommandBuffer> cmd_buf_last;
+    id<MTLCommandBuffer> cmd_buf_ext_last;
+
     // abort ggml_metal_graph_compute if callback returns true
     ggml_abort_callback abort_callback;
     void * abort_callback_data;
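These fields carry the new synchronization state: `cmd_buf_last` tracks the last enqueued graph command buffer, `cmd_buf_ext_last` the last enqueued transfer, and `cmd_bufs_ext` keeps transfer buffers alive until they are drained. A sketch of the contract they support (it mirrors `ggml_backend_metal_synchronize`, added further down):

```objc
// sketch: both handles come from the same MTLCommandQueue, so waiting on each
// of them guarantees that all previously enqueued GPU work has finished
if (ctx->cmd_buf_last) {
    [ctx->cmd_buf_last waitUntilCompleted];      // graph compute
    ctx->cmd_buf_last = nil;
}
if (ctx->cmd_buf_ext_last) {
    [ctx->cmd_buf_ext_last waitUntilCompleted];  // async set/get/cpy transfers
    ctx->cmd_buf_ext_last = nil;
}
```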
@@ -999,7 +1013,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
     GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
 
     ctx->device = device;
-    ctx->queue = [device newCommandQueue];
+    ctx->queue = ctx_dev->mtl_queue;
     if (ctx->queue == nil) {
         GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__);
         return NULL;
@@ -1073,6 +1087,11 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
         ctx->cmd_bufs[i].mem_pool->device = device;
     }
 
+    ctx->cmd_bufs_ext = [[NSMutableArray alloc] init];
+
+    ctx->cmd_buf_last     = nil;
+    ctx->cmd_buf_ext_last = nil;
+
 #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
     if (@available(macOS 10.12, iOS 16.0, *)) {
         GGML_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, device.recommendedMaxWorkingSetSize / 1e6);
@@ -1390,8 +1409,6 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,  argsort_f32_i32_asc,  true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC, argsort_f32_i32_desc, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32,       leaky_relu_f32,       true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_F32,              set_f32,              true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_I32,              set_i32,              true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32,          cpy_f32_f32,          true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16,          cpy_f32_f16,          true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_BF16,         cpy_f32_bf16,         use_bfloat);
@@ -1663,14 +1680,17 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) {
     Block_release(ctx->encode_async);
 
-    [ctx->queue release];
-
     for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) {
-        // ctx->cmd_bufs[i].obj is auto released
+        if (ctx->cmd_bufs[i].obj) {
+            [ctx->cmd_bufs[i].obj release];
+        }
 
         ggml_metal_mem_pool_free(ctx->cmd_bufs[i].mem_pool);
     }
 
+    [ctx->cmd_bufs_ext removeAllObjects];
+    [ctx->cmd_bufs_ext release];
+
     dispatch_release(ctx->d_queue);
 
     free(ctx);
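The explicit releases here are the counterpart of the switch from `commandBufferWithUnretainedReferences` to `commandBuffer` + `retain` in `ggml_metal_graph_compute` below. Under manual reference counting the pattern is:

```objc
// sketch: command buffer lifetime under the new scheme (MRC)
id<MTLCommandBuffer> cmd_buf = [queue commandBuffer]; // returned autoreleased by Metal
[cmd_buf retain];  // must outlive the enclosing autorelease pool - the graph
                   // may still be executing long after this scope unwinds
// ... encode, enqueue, commit ...
[cmd_buf release]; // balanced here in ggml_metal_free, or when the slot is reused
```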
@@ -1688,7 +1708,6 @@ struct ggml_backend_metal_buffer {
 struct ggml_backend_metal_buffer_context {
     void * all_data;
     size_t all_size;
-    bool owned;
 
     // multiple buffers are used only to avoid the maximum buffer size limitation when using mmap
     int n_buffers;
@@ -1696,6 +1715,9 @@ struct ggml_backend_metal_buffer_context {
     // optional MTLResidencySet
     id rset;
 
+    id device;
+    id queue;
+
 };
 
 // rset init
@@ -1761,7 +1783,7 @@ static void ggml_backend_metal_buffer_rset_free(struct ggml_backend_metal_buffer
 // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
 // Metal buffer based on the host memory pointer
 //
-static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs) {
+static id<MTLBuffer> ggml_metal_get_buffer(const struct ggml_tensor * t, size_t * offs) {
     //GGML_LOG_INFO("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
 
     const int64_t tsize = ggml_nbytes(t);
@@ -1984,16 +2006,6 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
                         return false;
                 };
             }
-        case GGML_OP_SET:
-            {
-                switch (op->src[0]->type) {
-                    case GGML_TYPE_F32:
-                    case GGML_TYPE_I32:
-                        return true;
-                    default:
-                        return false;
-                };
-            }
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_GET_ROWS:
             {
@@ -5569,68 +5581,6 @@ static int ggml_metal_encode_node(
 
                 [encoder dispatchThreadgroups:MTLSizeMake((ne01 + nrptg - 1)/nrptg, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, nrptg, 1)];
             } break;
-        case GGML_OP_SET:
-            {
-                GGML_ASSERT(ggml_are_same_shape(src0, dst));
-                GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
-
-                // src0 and dst as viewed during set
-                const size_t dst_nb0 = ggml_element_size(src0);
-
-                const size_t dst_nb1 = ((int32_t *) dst->op_params)[0];
-                const size_t dst_nb2 = ((int32_t *) dst->op_params)[1];
-                const size_t dst_nb3 = ((int32_t *) dst->op_params)[2];
-                const size_t offset  = ((int32_t *) dst->op_params)[3];
-                const bool   inplace = (bool) ((int32_t *) dst->op_params)[4];
-
-                if (!inplace) {
-                    memcpy(((char *) dst->data), ((char *) src0->data), ggml_nbytes(dst));
-                }
-
-                const int im0 = (ne10 == 0 ? 0 : ne10-1);
-                const int im1 = (ne11 == 0 ? 0 : ne11-1);
-                const int im2 = (ne12 == 0 ? 0 : ne12-1);
-                const int im3 = (ne13 == 0 ? 0 : ne13-1);
-
-                GGML_ASSERT(offset + im0*dst_nb0 + im1*dst_nb1 + im2*dst_nb2 + im3*dst_nb3 <= ggml_nbytes(dst));
-
-                id<MTLComputePipelineState> pipeline = nil;
-
-                switch (src0t) {
-                    case GGML_TYPE_F32:
-                        GGML_ASSERT(nb10 == sizeof(float));
-                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SET_F32].pipeline; break;
-                    case GGML_TYPE_I32:
-                        GGML_ASSERT(nb10 == sizeof(int32_t));
-                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SET_I32].pipeline; break;
-                    default: GGML_ABORT("fatal error");
-                }
-
-                ggml_metal_kargs_set args = {
-                    /*.ne10    =*/ ne10,
-                    /*.ne11    =*/ ne11,
-                    /*.ne12    =*/ ne12,
-                    /*.nb10    =*/ nb10,
-                    /*.nb11    =*/ nb11,
-                    /*.nb12    =*/ nb12,
-                    /*.nb13    =*/ nb13,
-                    /*.nb1     =*/ dst_nb1,
-                    /*.nb2     =*/ dst_nb2,
-                    /*.nb3     =*/ dst_nb3,
-                    /*.offs    =*/ offset,
-                    /*.inplace =*/ inplace,
-                };
-
-                const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne10);
-
-                [encoder setComputePipelineState:pipeline];
-                [encoder setBytes:&args length:sizeof(args) atIndex:0];
-                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
-                [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2];
-                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:3];
-
-                [encoder dispatchThreadgroups:MTLSizeMake(ne11, ne12, ne13) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-            } break;
         case GGML_OP_POOL_2D:
             {
                 GGML_ASSERT(ggml_is_contiguous(src0));
@@ -5781,78 +5731,122 @@ static enum ggml_status ggml_metal_graph_compute(
     // the main thread commits the first few commands immediately
     // cmd_buf[n_cb]
     {
-        id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
+        // first wait for any previous command buffer to be completed
+        // note: this checks only that the first part of the previous graph has been computed
+        // the rest of the graph might still be computing, but it is OK to start queuing the beginning of the
+        // new graph
+        if (ctx->cmd_bufs[n_cb].obj) {
+            [ctx->cmd_bufs[n_cb].obj waitUntilCompleted];
+            [ctx->cmd_bufs[n_cb].obj release];
+        }
+
+        id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
+        [cmd_buf retain];
+
         ctx->cmd_bufs[n_cb].obj = cmd_buf;
 
         [cmd_buf enqueue];
 
         ctx->encode_async(n_cb);
     }
 
-    // prepare the rest of the command buffers asynchronously
+    // here we guarantee that the full previous graph has finished computing
+    // but note that we have already enqueued the first part of the new graph, so it can start processing
+    // while we continue to encode the rest of the graph
+    if (ctx->cmd_buf_last) {
+        [ctx->cmd_buf_last waitUntilCompleted];
+        ctx->cmd_buf_last = nil;
+    }
+
+    // remember the command buffer for the next iteration
+    ctx->cmd_buf_last = ctx->cmd_bufs[n_cb].obj;
+
+    // prepare the rest of the command buffers asynchronously (optional)
     // cmd_buf[0.. n_cb)
     for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
-        id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
+        id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
+        [cmd_buf retain];
+
+        if (ctx->cmd_bufs[cb_idx].obj) {
+            [ctx->cmd_bufs[cb_idx].obj release];
+        }
         ctx->cmd_bufs[cb_idx].obj = cmd_buf;
 
         // always enqueue the first two command buffers
         // enqueue all of the command buffers if we don't need to abort
         if (cb_idx < 2 || ctx->abort_callback == NULL) {
             [cmd_buf enqueue];
+
+            // update the pointer to the last queued command buffer
+            // this is needed to implement synchronize()
+            ctx->cmd_buf_last = cmd_buf;
         }
     }
 
     dispatch_apply(n_cb, ctx->d_queue, ctx->encode_async);
 
-    // wait for completion and check status of each command buffer
-    // needed to detect if the device ran out-of-memory for example (#1881)
-    {
-        id<MTLCommandBuffer> cmd_buf = ctx->cmd_bufs[n_cb].obj;
-        [cmd_buf waitUntilCompleted];
+    [ctx->cmd_buf_last waitUntilScheduled];
 
-        MTLCommandBufferStatus status = [cmd_buf status];
-        if (status != MTLCommandBufferStatusCompleted) {
-            GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, n_cb, status);
-            if (status == MTLCommandBufferStatusError) {
-                GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]);
-            }
-
-            return GGML_STATUS_FAILED;
-        }
-    }
-
-    for (int i = 0; i < n_cb; ++i) {
-        id<MTLCommandBuffer> cmd_buf = ctx->cmd_bufs[i].obj;
-        [cmd_buf waitUntilCompleted];
-
-        MTLCommandBufferStatus status = [cmd_buf status];
-        if (status != MTLCommandBufferStatusCompleted) {
-            GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
-            if (status == MTLCommandBufferStatusError) {
-                GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]);
-            }
-
-            return GGML_STATUS_FAILED;
-        }
-
-        id<MTLCommandBuffer> next_buffer = (i + 1 < n_cb ? ctx->cmd_bufs[i + 1].obj : nil);
-        if (!next_buffer) {
-            continue;
-        }
-
-        const bool next_queued = ([next_buffer status] != MTLCommandBufferStatusNotEnqueued);
-        if (next_queued) {
-            continue;
-        }
-
-        if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) {
-            GGML_LOG_INFO("%s: command buffer %d aborted", __func__, i);
-            return GGML_STATUS_ABORTED;
-        }
-
-        [next_buffer commit];
-    }
+    // for debugging: block until graph is computed
+    //[ctx->cmd_buf_last waitUntilCompleted];
 
+    // enter here only when capturing in order to wait for all computation to finish
+    // otherwise, we leave the graph to compute asynchronously
     if (!should_capture && ctx->capture_started) {
+        // wait for completion and check status of each command buffer
+        // needed to detect if the device ran out-of-memory for example (#1881)
+        {
+            id<MTLCommandBuffer> cmd_buf = ctx->cmd_bufs[n_cb].obj;
+            [cmd_buf waitUntilCompleted];
+
+            MTLCommandBufferStatus status = [cmd_buf status];
+            if (status != MTLCommandBufferStatusCompleted) {
+                GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, n_cb, status);
+                if (status == MTLCommandBufferStatusError) {
+                    GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]);
+                }
+
+                return GGML_STATUS_FAILED;
+            }
+        }
+
+        for (int i = 0; i < n_cb; ++i) {
+            id<MTLCommandBuffer> cmd_buf = ctx->cmd_bufs[i].obj;
+            [cmd_buf waitUntilCompleted];
+
+            MTLCommandBufferStatus status = [cmd_buf status];
+            if (status != MTLCommandBufferStatusCompleted) {
+                GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
+                if (status == MTLCommandBufferStatusError) {
+                    GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]);
+                }
+
+                return GGML_STATUS_FAILED;
+            }
+
+            id<MTLCommandBuffer> next_buffer = (i + 1 < n_cb ? ctx->cmd_bufs[i + 1].obj : nil);
+            if (!next_buffer) {
+                continue;
+            }
+
+            const bool next_queued = ([next_buffer status] != MTLCommandBufferStatusNotEnqueued);
+            if (next_queued) {
+                continue;
+            }
+
+            if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) {
+                GGML_LOG_INFO("%s: command buffer %d aborted", __func__, i);
+                return GGML_STATUS_ABORTED;
+            }
+
+            [next_buffer commit];
+        }
+
+        if (ctx->cmd_buf_last) {
+            [ctx->cmd_buf_last waitUntilCompleted];
+            ctx->cmd_buf_last = nil;
+        }
+
         [ctx->capture_scope endScope];
         [[MTLCaptureManager sharedCaptureManager] stopCapture];
     }
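The result is a two-stage pipeline: the head of graph N+1 is enqueued as soon as the head of graph N has completed, and only the re-encoding of the remaining command buffers waits for the tail of graph N. A condensed sketch, reusing the patch's own names:

```objc
// sketch of the two waits that make the pipelining safe
//
//   graph N   : [b0 ........ bK]            bK == cmd_buf_last
//   graph N+1 :       [b0' ........ bK']    b0' is enqueued while graph N still runs
//
if (ctx->cmd_bufs[n_cb].obj) {
    [ctx->cmd_bufs[n_cb].obj waitUntilCompleted]; // 1: only the head of graph N
}
// ... create + enqueue the head of graph N+1, start encoding it ...
if (ctx->cmd_buf_last) {
    [ctx->cmd_buf_last waitUntilCompleted];       // 2: tail of graph N, before reusing cmd_bufs[0..n_cb)
}
// graph_compute returns without waiting for graph N+1; synchronize() waits on cmd_buf_last
```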
@@ -5874,14 +5868,6 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
     ggml_backend_metal_buffer_rset_free(ctx);
 
-    if (ctx->owned) {
-#if TARGET_OS_OSX
-        vm_deallocate((vm_map_t)mach_task_self(), (vm_address_t)ctx->all_data, ctx->all_size);
-#else
-        free(ctx->all_data);
-#endif
-    }
-
     free(ctx);
 }
@@ -5892,25 +5878,117 @@ static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
 }
 
 static void ggml_backend_metal_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
+#if 1
+    struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
+
+    @autoreleasepool {
+        id<MTLCommandQueue>       queue   = ctx->queue;
+        id<MTLCommandBuffer>      cmd_buf = [queue commandBuffer];
+        id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
+        [cmd_buf enqueue];
+
+        size_t buf_dst_offset = 0;
+        id<MTLBuffer> buf_dst = ggml_metal_get_buffer(tensor, &buf_dst_offset);
+
+        buf_dst_offset += offset;
+
+        [encoder fillBuffer:buf_dst
+                      range:NSMakeRange(buf_dst_offset, size)
+                      value:value];
+
+        [encoder endEncoding];
+
+        [cmd_buf commit];
+        [cmd_buf waitUntilScheduled];
+    }
+#else
     memset((char *)tensor->data + offset, value, size);
+#endif
 
     GGML_UNUSED(buffer);
 }
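Note that `NSMakeRange` takes a location and a *length*, not an end offset, hence `NSMakeRange(buf_dst_offset, size)` above to fill exactly `size` bytes:

```objc
// sketch: NSRange semantics used by fillBuffer:range:value:
NSRange range = NSMakeRange(buf_dst_offset, size); // covers bytes [offset, offset + size)
assert(NSMaxRange(range) == buf_dst_offset + size);
```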
 static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+#if 1
+    struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
+
+    @autoreleasepool {
+        id<MTLCommandQueue>       queue   = ctx->queue;
+        id<MTLCommandBuffer>      cmd_buf = [queue commandBuffer];
+        id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
+        [cmd_buf enqueue];
+
+        // TODO: is this an extra copy? can we avoid it?
+        id<MTLBuffer> buf_src = [ctx->device newBufferWithBytesNoCopy:data
+                                                               length:size
+                                                              options:MTLResourceStorageModeShared
+                                                          deallocator:nil];
+
+        size_t buf_dst_offset = 0;
+        id<MTLBuffer> buf_dst = ggml_metal_get_buffer(tensor, &buf_dst_offset);
+
+        buf_dst_offset += offset;
+
+        [encoder copyFromBuffer:buf_src
+                   sourceOffset:0
+                       toBuffer:buf_dst
+              destinationOffset:buf_dst_offset
+                           size:size];
+
+        [encoder endEncoding];
+
+        // note: no need to wait for completion here
+        [cmd_buf commit];
+        [cmd_buf waitUntilScheduled];
+    }
+#else
     memcpy((char *)tensor->data + offset, data, size);
+#endif
 
     GGML_UNUSED(buffer);
 }
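One caveat on the `newBufferWithBytesNoCopy:` used for `buf_src`: Metal requires the wrapped pointer to be page-aligned, which an arbitrary caller-supplied `data` generally is not. The async path added later in this patch uses the copying variant instead, which carries no alignment requirement (sketch):

```objc
// sketch: copying variant - safe for unaligned caller pointers, at the cost of one memcpy
id<MTLBuffer> buf_src = [device newBufferWithBytes:data
                                            length:size
                                           options:MTLResourceStorageModeShared];
```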
 static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+#if 1
+    struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
+
+    @autoreleasepool {
+        id<MTLCommandQueue>       queue   = ctx->queue;
+        id<MTLCommandBuffer>      cmd_buf = [queue commandBuffer];
+        id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
+        [cmd_buf enqueue];
+
+        size_t buf_src_offset = 0;
+        id<MTLBuffer> buf_src = ggml_metal_get_buffer(tensor, &buf_src_offset);
+
+        buf_src_offset += offset;
+
+        id<MTLBuffer> buf_dst = [ctx->device newBufferWithBytesNoCopy:data
+                                                               length:size
+                                                              options:MTLResourceStorageModeShared
+                                                          deallocator:nil];
+
+        [encoder copyFromBuffer:buf_src
+                   sourceOffset:buf_src_offset
+                       toBuffer:buf_dst
+              destinationOffset:0
+                           size:size];
+
+        [encoder endEncoding];
+
+        [cmd_buf commit];
+        [cmd_buf waitUntilCompleted];
+    }
+#else
     memcpy(data, (const char *)tensor->data + offset, size);
+#endif
 
     GGML_UNUSED(buffer);
 }
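Note the asymmetry in the waits: `set`/`memset` only `waitUntilScheduled`, because any later GPU work submitted to the same queue is ordered after them anyway, while `get` must `waitUntilCompleted` since the caller dereferences `data` immediately after return. In sketch form:

```objc
// write path: ordering on the shared queue is sufficient
[cmd_buf commit];
[cmd_buf waitUntilScheduled];

// read path: the destination must actually contain the bytes on return
[cmd_buf commit];
[cmd_buf waitUntilCompleted];
```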
 static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
     if (ggml_backend_buffer_is_host(src->buffer)) {
+        GGML_ASSERT(false && "TODO");
         memcpy(dst->data, src->data, ggml_nbytes(src));
         return true;
     }
@@ -5920,9 +5998,27 @@ static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, c
 }
 
 static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+#if 1
     struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
 
+    @autoreleasepool {
+        id<MTLCommandQueue>       queue   = ctx->queue;
+        id<MTLCommandBuffer>      cmd_buf = [queue commandBuffer];
+        id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
+        [cmd_buf enqueue];
+
+        [encoder fillBuffer:ctx->buffers[0].metal
+                      range:NSMakeRange(0, ctx->buffers[0].size)
+                      value:value];
+
+        [encoder endEncoding];
+
+        [cmd_buf commit];
+        [cmd_buf waitUntilScheduled];
+    }
+#else
     memset(ctx->all_data, value, ctx->all_size);
+#endif
 }
 
 static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
@@ -5986,22 +6082,37 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
     id<MTLDevice> device = ctx_dev->mtl_device;
 
+#if 1
+    // we'll populate this after creating the Metal buffer below
+    ctx->all_data = (void *) 0x000000400ULL;
+#else
     ctx->all_data = ggml_metal_host_malloc(size_aligned);
+#endif
     ctx->all_size = size_aligned;
-    ctx->owned = true;
+
+    ctx->device = device;
+    ctx->queue  = ctx_dev->mtl_queue;
+
     ctx->n_buffers = 1;
 
     if (ctx->all_data != NULL) {
         ctx->buffers[0].data  = ctx->all_data;
         ctx->buffers[0].size  = size;
         ctx->buffers[0].metal = nil;
 
         if (size_aligned > 0) {
+#if 1
+            ctx->buffers[0].metal = [device newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate];
+
+            ctx->all_data = (void *) (ctx->buffers[0].metal.gpuAddress);
+#else
             ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data
                                                               length:size_aligned
                                                              options:MTLResourceStorageModeShared
                                                          deallocator:nil];
+#endif
         }
+
+        ctx->buffers[0].data = ctx->all_data;
     }
 
     if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) {
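With `MTLResourceStorageModePrivate` there is no host pointer at all, so `all_data` is first seeded with a dummy non-NULL value and then replaced by the buffer's `gpuAddress` (a Metal 3 property), which serves purely as a stable 64-bit key for `ggml_metal_get_buffer` - it is never dereferenced on the CPU. Sketch:

```objc
// sketch: private storage allocation and its host-side "address"
id<MTLBuffer> buf = [device newBufferWithLength:size_aligned
                                        options:MTLResourceStorageModePrivate];

void * base = (void *) buf.gpuAddress; // unique per buffer, not a CPU pointer
// tensor->data ends up as base + offset and is mapped back via ggml_metal_get_buffer()
```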
@@ -6047,7 +6158,7 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
             /* .get_alignment   = */ ggml_backend_metal_buffer_type_get_alignment,
             /* .get_max_size    = */ ggml_backend_metal_buffer_type_get_max_size,
             /* .get_alloc_size  = */ NULL, // defaults to ggml_nbytes
-            /* .is_host         = */ ggml_backend_metal_buffer_type_is_host,
+            /* .is_host         = */ NULL,
         },
         /* .device  = */ &g_ggml_backend_metal_device,
         /* .context = */ NULL,
@@ -6063,6 +6174,8 @@ static const char * ggml_backend_metal_buffer_from_ptr_type_get_name(ggml_backen
 }
 
 static ggml_backend_buffer_type_t ggml_backend_metal_buffer_from_ptr_type(void) {
+    // note: not obvious, but this buffer type still needs to implement .alloc_buffer:
+    //       https://github.com/ggml-org/llama.cpp/pull/15832#discussion_r2333177099
     static struct ggml_backend_buffer_type ggml_backend_buffer_from_ptr_type_metal = {
         /* .iface = */ {
             /* .get_name         = */ ggml_backend_metal_buffer_from_ptr_type_get_name,
@@ -6079,95 +6192,6 @@ static ggml_backend_buffer_type_t ggml_backend_metal_buffer_from_ptr_type(void)
     return &ggml_backend_buffer_from_ptr_type_metal;
 }
 
-// TODO: obsoleted by ggml_backend_metal_device_buffer_from_ptr
-ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) {
-    struct ggml_backend_metal_buffer_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_buffer_context));
-
-    ctx->all_data  = data;
-    ctx->all_size  = size;
-    ctx->owned     = false;
-    ctx->n_buffers = 0;
-
-    const size_t size_page = sysconf(_SC_PAGESIZE);
-
-    // page-align the data ptr
-    {
-        const uintptr_t offs = (uintptr_t) data % size_page;
-        data = (void *) ((char *) data - offs);
-        size += offs;
-    }
-
-    size_t size_aligned = size;
-    if ((size_aligned % size_page) != 0) {
-        size_aligned += (size_page - (size_aligned % size_page));
-    }
-
-    struct ggml_backend_metal_device_context * ctx_dev = &g_ggml_ctx_dev_main;
-
-    GGML_ASSERT(ctx_dev->mtl_device != nil);
-
-    id<MTLDevice> device = ctx_dev->mtl_device;
-
-    // the buffer fits into the max buffer size allowed by the device
-    if (size_aligned <= device.maxBufferLength) {
-        ctx->buffers[ctx->n_buffers].data  = data;
-        ctx->buffers[ctx->n_buffers].size  = size;
-        ctx->buffers[ctx->n_buffers].metal = nil;
-
-        if (size_aligned > 0) {
-            ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
-
-            if (ctx->buffers[ctx->n_buffers].metal == nil) {
-                GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
-                return false;
-            }
-        }
-
-        ggml_backend_metal_log_allocated_size(device, size_aligned);
-
-        ++ctx->n_buffers;
-    } else {
-        // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into
-        // one of the views
-        const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case
-        const size_t size_step = device.maxBufferLength - size_ovlp;
-        const size_t size_view = device.maxBufferLength;
-
-        for (size_t i = 0; i < size; i += size_step) {
-            const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);
-
-            ctx->buffers[ctx->n_buffers].data  = (void *) ((uint8_t *) data + i);
-            ctx->buffers[ctx->n_buffers].size  = size_step_aligned;
-            ctx->buffers[ctx->n_buffers].metal = nil;
-
-            if (size_step_aligned > 0) {
-                ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
-
-                if (ctx->buffers[ctx->n_buffers].metal == nil) {
-                    GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0);
-                    return false;
-                }
-            }
-
-            ggml_backend_metal_log_allocated_size(device, size_step_aligned);
-
-            if (i + size_step < size) {
-                GGML_LOG_INFO("\n");
-            }
-
-            ++ctx->n_buffers;
-        }
-    }
-
-    if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
-        GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
-        free(ctx);
-        return NULL;
-    }
-
-    return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size);
-}
-
 // backend
 
 static const char * ggml_backend_metal_name(ggml_backend_t backend) {
@@ -6184,6 +6208,147 @@ static void ggml_backend_metal_free(ggml_backend_t backend) {
     free(backend);
 }
 
+static void ggml_backend_metal_synchronize(ggml_backend_t backend) {
+    struct ggml_backend_metal_context * ctx = backend->context;
+
+    // wait for the computation of the graph to finish
+    if (ctx->cmd_buf_last) {
+        [ctx->cmd_buf_last waitUntilCompleted];
+        ctx->cmd_buf_last = nil;
+    }
+
+    // wait for any pending async get/set operations
+    if (ctx->cmd_buf_ext_last) {
+        [ctx->cmd_buf_ext_last waitUntilCompleted];
+        ctx->cmd_buf_ext_last = nil;
+    }
+
+    // release any completed command buffers
+    if (ctx->cmd_bufs_ext.count > 0) {
+        for (size_t i = 0; i < ctx->cmd_bufs_ext.count; ++i) {
+            id<MTLCommandBuffer> cmd_buf = ctx->cmd_bufs_ext[i];
+
+            MTLCommandBufferStatus status = [cmd_buf status];
+            if (status != MTLCommandBufferStatusCompleted) {
+                GGML_LOG_ERROR("%s: error: command buffer %d failed with status %d\n", __func__, (int) i, (int) status);
+                if (status == MTLCommandBufferStatusError) {
+                    GGML_LOG_ERROR("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]);
+                }
+                GGML_ABORT("fatal error");
+            }
+
+            [cmd_buf release];
+        }
+
+        [ctx->cmd_bufs_ext removeAllObjects];
+    }
+}
+
+static void ggml_backend_metal_set_tensor_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    struct ggml_backend_metal_context        * ctx     = backend->context;
+    struct ggml_backend_metal_device_context * ctx_dev = backend->device->context;
+
+    @autoreleasepool {
+        id<MTLDevice> device = ctx_dev->mtl_device;
+
+        // wrap the source data into a Metal buffer
+        id<MTLBuffer> buf_src = [device newBufferWithBytes:data
+                                                    length:size
+                                                   options:MTLResourceStorageModeShared];
+
+        size_t buf_dst_offset = 0;
+        id<MTLBuffer> buf_dst = ggml_metal_get_buffer(tensor, &buf_dst_offset);
+
+        if (buf_dst == nil) {
+            GGML_ABORT("%s: failed to find buffer for tensor '%s'\n", __func__, tensor->name);
+        }
+
+        buf_dst_offset += offset;
+
+        // queue the copy operation into the queue of the Metal context
+        // this will be queued at the end, after any currently ongoing GPU operations
+        id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
+        [cmd_buf enqueue];
+
+        id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
+
+        [encoder copyFromBuffer:buf_src
+                   sourceOffset:0
+                       toBuffer:buf_dst
+              destinationOffset:buf_dst_offset
+                           size:size];
+
+        [encoder endEncoding];
+        [cmd_buf commit];
+
+        // do not wait here for completion
+        //[cmd_buf waitUntilCompleted];
+
+        // instead, remember a reference to the command buffer and wait for it later if needed
+        [ctx->cmd_bufs_ext addObject:cmd_buf];
+        ctx->cmd_buf_ext_last = cmd_buf;
+
+        [cmd_buf retain];
+    }
+}
+
+static void ggml_backend_metal_get_tensor_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    struct ggml_backend_metal_context        * ctx     = backend->context;
+    struct ggml_backend_metal_device_context * ctx_dev = backend->device->context;
+
+    @autoreleasepool {
+        id<MTLDevice> device = ctx_dev->mtl_device;
+
+        id<MTLBuffer> buf_dst = [device newBufferWithBytesNoCopy:data
+                                                          length:size
+                                                         options:MTLResourceStorageModeShared
+                                                     deallocator:nil];
+
+        size_t buf_src_offset = 0;
+        id<MTLBuffer> buf_src = ggml_metal_get_buffer(tensor, &buf_src_offset);
+
+        if (buf_src == nil) {
+            GGML_ABORT("%s: failed to find buffer for tensor '%s'\n", __func__, tensor->name);
+        }
+
+        buf_src_offset += offset;
+
+        // queue the copy operation into the queue of the Metal context
+        // this will be queued at the end, after any currently ongoing GPU operations
+        id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
+        [cmd_buf enqueue];
+
+        id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
+
+        [encoder copyFromBuffer:buf_src
+                   sourceOffset:buf_src_offset
+                       toBuffer:buf_dst
+              destinationOffset:0
+                           size:size];
+
+        [encoder endEncoding];
+        [cmd_buf commit];
+
+        // do not wait here for completion
+        //[cmd_buf waitUntilCompleted];
+
+        // instead, remember a reference to the command buffer and wait for it later if needed
+        [ctx->cmd_bufs_ext addObject:cmd_buf];
+        ctx->cmd_buf_ext_last = cmd_buf;
+
+        [cmd_buf retain];
+    }
+}
+
+static bool ggml_backend_metal_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst) {
+    return false;
+
+    GGML_UNUSED(backend_src);
+    GGML_UNUSED(backend_dst);
+    GGML_UNUSED(src);
+    GGML_UNUSED(dst);
+}
+
 static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     return ggml_metal_graph_compute(backend, cgraph);
 }
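Once these hooks are wired into `ggml_backend_metal_i` (a later hunk), a client can overlap host-device transfers with compute through the generic ggml-backend API. A hedged usage sketch (tensor and buffer setup omitted):

```objc
// sketch: driving the Metal backend asynchronously from user code
ggml_backend_tensor_set_async(backend, inp, src_data, 0, ggml_nbytes(inp)); // H2D, queued
ggml_backend_graph_compute_async(backend, gf);                              // compute, queued
ggml_backend_tensor_get_async(backend, out, dst_data, 0, ggml_nbytes(out)); // D2H, queued
ggml_backend_synchronize(backend); // one blocking wait on cmd_buf_last + cmd_buf_ext_last
```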
@@ -6214,7 +6379,10 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
         const int n_nodes_per_cb = ctx->n_nodes_per_cb;
 
         id<MTLCommandBuffer> cmd_buf = ctx->cmd_bufs[cb_idx].obj;
 
+        struct ggml_metal_mem_pool * mem_pool = ctx->cmd_bufs[cb_idx].mem_pool;
+
+        ggml_metal_mem_pool_reset(mem_pool);
 
         id<MTLComputeCommandEncoder> encoder = [cmd_buf computeCommandEncoder];
@@ -6228,9 +6396,6 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
         const bool should_capture = ctx->capture_next_compute;
 
-        struct ggml_metal_mem_pool * mem_pool = ctx->cmd_bufs[cb_idx].mem_pool;
-        ggml_metal_mem_pool_reset(mem_pool);
-
         for (int idx = node_start; idx < node_end;) {
             if (should_capture) {
                 [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]];
@@ -6264,15 +6429,19 @@
 static struct ggml_backend_i ggml_backend_metal_i = {
     /* .get_name                = */ ggml_backend_metal_name,
     /* .free                    = */ ggml_backend_metal_free,
-    /* .set_tensor_async        = */ NULL,
-    /* .get_tensor_async        = */ NULL,
-    /* .cpy_tensor_async        = */ NULL,
-    /* .synchronize             = */ NULL,
+    /* .set_tensor_async        = */ ggml_backend_metal_set_tensor_async,
+    /* .get_tensor_async        = */ ggml_backend_metal_get_tensor_async,
+    /* .cpy_tensor_async        = */ ggml_backend_metal_cpy_tensor_async, // only needed for multi-GPU setups
+    /* .synchronize             = */ ggml_backend_metal_synchronize,
     /* .graph_plan_create       = */ NULL,
     /* .graph_plan_free         = */ NULL,
     /* .graph_plan_update       = */ NULL,
     /* .graph_plan_compute      = */ NULL,
     /* .graph_compute           = */ ggml_backend_metal_graph_compute,
 
+    // the events API is needed only for multi-GPU setups, so likely no need to implement it for Metal
+    // in any case, these docs seem relevant if we ever decide to implement it:
+    // https://developer.apple.com/documentation/metal/mtlcommandbuffer#Synchronizing-Passes-with-Events
     /* .event_record            = */ NULL,
     /* .event_wait              = */ NULL,
     /* .optimize_graph          = */ NULL,
@@ -6376,8 +6545,8 @@ static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct g
     props->type = ggml_backend_metal_device_get_type(dev);
     ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
     props->caps = (struct ggml_backend_dev_caps) {
-        /* .async                 = */ false,
-        /* .host_buffer           = */ false,
+        /* .async                 = */ true,
+        /* .host_buffer           = */ true,
        /* .buffer_from_host_ptr  = */ true,
        /* .events                = */ false,
    };
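Flipping `caps.async` is what lets `ggml_backend_sched` route tensor traffic through the new `*_async` hooks. A quick check of the advertised capabilities (sketch):

```objc
// sketch: verifying the device now reports async support
struct ggml_backend_dev_props props;
ggml_backend_dev_get_props(dev, &props);
assert(props.caps.async && props.caps.buffer_from_host_ptr);
```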
@@ -6417,7 +6586,7 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back
     ctx->all_data = ptr;
     ctx->all_size = size;
-    ctx->owned = false;
 
     ctx->n_buffers = 0;
 
     const size_t size_page = sysconf(_SC_PAGESIZE);
@@ -6440,6 +6609,9 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back
     id<MTLDevice> device = ctx_dev->mtl_device;
 
+    ctx->device = device;
+    ctx->queue  = ctx_dev->mtl_queue;
+
     // the buffer fits into the max buffer size allowed by the device
     if (size_aligned <= device.maxBufferLength) {
         ctx->buffers[ctx->n_buffers].data = ptr;
@@ -6514,8 +6686,23 @@ static bool ggml_backend_metal_device_supports_buft(ggml_backend_dev_t dev, ggml
     GGML_UNUSED(dev);
 }
 
 static int64_t get_op_batch_size(const struct ggml_tensor * op) {
     switch (op->op) {
         case GGML_OP_MUL_MAT:
             return op->ne[1];
         case GGML_OP_MUL_MAT_ID:
             return op->ne[2];
         default:
             return ggml_nrows(op);
     }
 }
 
 static bool ggml_backend_metal_device_offload_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
+    return false;
+
     const int min_batch_size = 32;
 
     return (op->op == GGML_OP_MUL_MAT ||
             op->op == GGML_OP_MUL_MAT_ID) &&
            get_op_batch_size(op) >= min_batch_size;
 
     GGML_UNUSED(dev);
+    GGML_UNUSED(op);
 }