From 2976dd80f25006f69d593134fe10dfe601dfec2a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 31 Jan 2026 14:04:34 +0200 Subject: [PATCH] cont : implement cpy_tensor_async --- ggml/src/ggml-metal/ggml-metal-context.h | 3 ++ ggml/src/ggml-metal/ggml-metal-context.m | 53 ++++++++++++++++++++++++ ggml/src/ggml-metal/ggml-metal-device.h | 2 +- ggml/src/ggml-metal/ggml-metal-device.m | 2 +- ggml/src/ggml-metal/ggml-metal.cpp | 29 ++++++++++--- 5 files changed, 81 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal-context.h b/ggml/src/ggml-metal/ggml-metal-context.h index f93a61cfea..abf4b06ed2 100644 --- a/ggml/src/ggml-metal/ggml-metal-context.h +++ b/ggml/src/ggml-metal/ggml-metal-context.h @@ -21,6 +21,7 @@ void ggml_metal_synchronize(ggml_metal_t ctx); void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); +bool ggml_metal_cpy_tensor_async(ggml_metal_t ctx_src, ggml_metal_t ctx_dst, const struct ggml_tensor * src, struct ggml_tensor * dst); enum ggml_status ggml_metal_graph_compute (ggml_metal_t ctx, struct ggml_cgraph * gf); void ggml_metal_graph_optimize(ggml_metal_t ctx, struct ggml_cgraph * gf); @@ -28,6 +29,8 @@ void ggml_metal_graph_optimize(ggml_metal_t ctx, struct ggml_cgraph void ggml_metal_event_record(ggml_metal_t ctx, ggml_metal_event_t ev); void ggml_metal_event_wait (ggml_metal_t ctx, ggml_metal_event_t ev); +ggml_metal_event_t ggml_metal_get_ev_cpy(ggml_metal_t ctx); + void ggml_metal_set_n_cb (ggml_metal_t ctx, int n_cb); void ggml_metal_set_abort_callback (ggml_metal_t ctx, ggml_abort_callback abort_callback, void * user_data); bool ggml_metal_supports_family (ggml_metal_t ctx, int family); diff --git a/ggml/src/ggml-metal/ggml-metal-context.m b/ggml/src/ggml-metal/ggml-metal-context.m index 0108471698..a412d70aed 100644 --- a/ggml/src/ggml-metal/ggml-metal-context.m +++ b/ggml/src/ggml-metal/ggml-metal-context.m @@ -29,6 +29,8 @@ struct ggml_metal { ggml_metal_device_t dev; ggml_metal_library_t lib; + ggml_metal_event_t ev_cpy; // for async copies + dispatch_queue_t d_queue; // additional, inference-time compiled pipelines @@ -119,6 +121,8 @@ ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) { } } + res->ev_cpy = ggml_metal_device_event_init(dev); + const struct ggml_metal_device_props * props_dev = ggml_metal_device_get_props(dev); snprintf(res->name, sizeof(res->name), "%s", props_dev->name); @@ -210,6 +214,8 @@ void ggml_metal_free(ggml_metal_t ctx) { dispatch_release(ctx->d_queue); + ggml_metal_device_event_free(ctx->dev, ctx->ev_cpy); + free(ctx); } @@ -364,6 +370,49 @@ void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * te } } +bool ggml_metal_cpy_tensor_async(ggml_metal_t ctx_src, ggml_metal_t ctx_dst, const struct ggml_tensor * src, struct ggml_tensor * dst) { + @autoreleasepool { + struct ggml_metal_buffer_id bid_src = ggml_metal_get_buffer_id(src); + struct ggml_metal_buffer_id bid_dst = ggml_metal_get_buffer_id(dst); + + if (bid_src.metal == nil || bid_dst.metal == nil) { + return false; + } + + // queue the copy operation into the Metal context + // this will be queued at the end, after any currently ongoing GPU operations + id queue = ggml_metal_device_get_queue(ctx_src->dev); + id cmd_buf = [queue commandBuffer]; + id encoder = [cmd_buf blitCommandEncoder]; + + [encoder copyFromBuffer:bid_src.metal + sourceOffset:bid_src.offs + toBuffer:bid_dst.metal + destinationOffset:bid_dst.offs + size:ggml_nbytes(src)]; + + [encoder endEncoding]; + + ggml_metal_event_t ev_cpy = ggml_metal_get_ev_cpy(ctx_src); + ggml_metal_event_record(ctx_src, ev_cpy); + + [cmd_buf commit]; + + // do not wait here for completion + //[cmd_buf waitUntilCompleted]; + + // instead, remember a reference to the command buffer and wait for it later if needed + [ctx_src->cmd_bufs_ext addObject:cmd_buf]; + ctx_src->cmd_buf_last = cmd_buf; + + [cmd_buf retain]; + + ggml_metal_event_wait(ctx_dst, ev_cpy); + + return true; + } +} + enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph * gf) { // number of nodes encoded by the main thread (empirically determined) const int n_main = 64; @@ -570,6 +619,10 @@ void ggml_metal_event_wait(ggml_metal_t ctx, ggml_metal_event_t ev) { } } +ggml_metal_event_t ggml_metal_get_ev_cpy(ggml_metal_t ctx) { + return ctx->ev_cpy; +} + void ggml_metal_set_n_cb(ggml_metal_t ctx, int n_cb) { if (ctx->n_cb != n_cb) { ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_COMMAND_BUFFERS); diff --git a/ggml/src/ggml-metal/ggml-metal-device.h b/ggml/src/ggml-metal/ggml-metal-device.h index df61be8a1c..afb091e725 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.h +++ b/ggml/src/ggml-metal/ggml-metal-device.h @@ -246,7 +246,7 @@ void ggml_metal_device_rsets_rm (ggml_metal_device_t dev, ggml_metal_rset_t rset void ggml_metal_device_rsets_keep_alive(ggml_metal_device_t dev); -ggml_metal_event_t ggml_metal_device_event_new(ggml_metal_device_t dev); +ggml_metal_event_t ggml_metal_device_event_init(ggml_metal_device_t dev); void ggml_metal_device_event_free(ggml_metal_device_t dev, ggml_metal_event_t ev); void ggml_metal_device_event_synchronize(ggml_metal_device_t dev, ggml_metal_event_t ev); diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m index 38f8c74261..285dd1630e 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.m +++ b/ggml/src/ggml-metal/ggml-metal-device.m @@ -948,7 +948,7 @@ void ggml_metal_event_encode_wait(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cm [cmd_buf encodeWaitForEvent:event value:atomic_load_explicit(&ev->value, memory_order_relaxed)]; } -ggml_metal_event_t ggml_metal_device_event_new(ggml_metal_device_t dev) { +ggml_metal_event_t ggml_metal_device_event_init(ggml_metal_device_t dev) { id event = [dev->mtl_device newEvent]; ggml_metal_event_t ev = calloc(1, sizeof(struct ggml_metal_event)); diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp index e421a4264b..a616dcdb46 100644 --- a/ggml/src/ggml-metal/ggml-metal.cpp +++ b/ggml/src/ggml-metal/ggml-metal.cpp @@ -166,6 +166,11 @@ static ggml_backend_buffer_i ggml_backend_metal_buffer_private_i = { /* .reset = */ NULL, }; +static bool ggml_backend_buffer_is_metal(ggml_backend_buffer_t buffer) { + return buffer->iface.free_buffer == ggml_backend_metal_buffer_shared_free_buffer || + buffer->iface.free_buffer == ggml_backend_metal_buffer_private_free_buffer; +} + // // buffer types // @@ -496,12 +501,24 @@ static void ggml_backend_metal_get_tensor_async(ggml_backend_t backend, const gg } static bool ggml_backend_metal_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) { - return false; + if (!ggml_backend_is_metal(backend_src) || !ggml_backend_is_metal(backend_dst)) { + return false; + } - GGML_UNUSED(backend_src); - GGML_UNUSED(backend_dst); - GGML_UNUSED(src); - GGML_UNUSED(dst); + if (!ggml_backend_buffer_is_metal(src->buffer) || !ggml_backend_buffer_is_metal(dst->buffer)) { + return false; + } + + ggml_metal_t ctx_src = (ggml_metal_t)backend_src->context; + ggml_metal_t ctx_dst = (ggml_metal_t)backend_dst->context; + + //ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer; + //ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer; + + //ggml_metal_buffer_t buf_ctx_src = (ggml_metal_buffer_t)buf_src->context; + //ggml_metal_buffer_t buf_ctx_dst = (ggml_metal_buffer_t)buf_dst->context; + + return ggml_metal_cpy_tensor_async(ctx_src, ctx_dst, src, dst); } static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { @@ -736,7 +753,7 @@ static bool ggml_backend_metal_device_offload_op(ggml_backend_dev_t dev, const g static ggml_backend_event_t ggml_backend_metal_device_event_new(ggml_backend_dev_t dev) { ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context; - ggml_metal_event_t event = ggml_metal_device_event_new(ctx_dev); + ggml_metal_event_t event = ggml_metal_device_event_init(ctx_dev); GGML_ASSERT(event); ggml_backend_event_t ev = new ggml_backend_event {