diff --git a/ggml/src/ggml-metal/ggml-metal-context.h b/ggml/src/ggml-metal/ggml-metal-context.h
index bb6bf7da9b..f93a61cfea 100644
--- a/ggml/src/ggml-metal/ggml-metal-context.h
+++ b/ggml/src/ggml-metal/ggml-metal-context.h
@@ -25,6 +25,9 @@ void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * te
 enum ggml_status ggml_metal_graph_compute (ggml_metal_t ctx, struct ggml_cgraph * gf);
 void             ggml_metal_graph_optimize(ggml_metal_t ctx, struct ggml_cgraph * gf);
 
+void ggml_metal_event_record(ggml_metal_t ctx, ggml_metal_event_t ev);
+void ggml_metal_event_wait  (ggml_metal_t ctx, ggml_metal_event_t ev);
+
 void ggml_metal_set_n_cb            (ggml_metal_t ctx, int n_cb);
 void ggml_metal_set_abort_callback  (ggml_metal_t ctx, ggml_abort_callback abort_callback, void * user_data);
 bool ggml_metal_supports_family     (ggml_metal_t ctx, int family);
diff --git a/ggml/src/ggml-metal/ggml-metal-context.m b/ggml/src/ggml-metal/ggml-metal-context.m
index 13bc3b650d..0108471698 100644
--- a/ggml/src/ggml-metal/ggml-metal-context.m
+++ b/ggml/src/ggml-metal/ggml-metal-context.m
@@ -281,8 +281,8 @@ void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor,
         // wrap the source data into a Metal buffer
         id<MTLDevice> device  = ggml_metal_device_get_obj(ctx->dev);
         id<MTLBuffer> buf_src = [device newBufferWithBytes:data
-                                                      length:size
-                                                     options:MTLResourceStorageModeShared];
+                                                     length:size
+                                                    options:MTLResourceStorageModeShared];
 
         GGML_ASSERT(buf_src);
 
@@ -324,9 +324,9 @@ void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * te
     @autoreleasepool {
         id<MTLDevice> device  = ggml_metal_device_get_obj(ctx->dev);
         id<MTLBuffer> buf_dst = [device newBufferWithBytesNoCopy:data
-                                                            length:size
-                                                           options:MTLResourceStorageModeShared
-                                                       deallocator:nil];
+                                                           length:size
+                                                          options:MTLResourceStorageModeShared
+                                                      deallocator:nil];
 
         GGML_ASSERT(buf_dst);
 
@@ -538,6 +538,38 @@ void ggml_metal_graph_optimize(ggml_metal_t ctx, struct ggml_cgraph * gf) {
     //printf("%s: graph optimize took %.3f ms\n", __func__, (ggml_time_us() - t_start) / 1000.0);
 }
 
+void ggml_metal_event_record(ggml_metal_t ctx, ggml_metal_event_t ev) {
+    @autoreleasepool {
+        id<MTLCommandQueue>  queue   = ggml_metal_device_get_queue(ctx->dev);
+        id<MTLCommandBuffer> cmd_buf = [queue commandBuffer];
+
+        ggml_metal_event_encode_signal(ev, cmd_buf);
+
+        [cmd_buf commit];
+
+        [ctx->cmd_bufs_ext addObject:cmd_buf];
+        ctx->cmd_buf_last = cmd_buf;
+
+        [cmd_buf retain];
+    }
+}
+
+void ggml_metal_event_wait(ggml_metal_t ctx, ggml_metal_event_t ev) {
+    @autoreleasepool {
+        id<MTLCommandQueue>  queue   = ggml_metal_device_get_queue(ctx->dev);
+        id<MTLCommandBuffer> cmd_buf = [queue commandBuffer];
+
+        ggml_metal_event_encode_wait(ev, cmd_buf);
+
+        [cmd_buf commit];
+
+        [ctx->cmd_bufs_ext addObject:cmd_buf];
+        ctx->cmd_buf_last = cmd_buf;
+
+        [cmd_buf retain];
+    }
+}
+
 void ggml_metal_set_n_cb(ggml_metal_t ctx, int n_cb) {
     if (ctx->n_cb != n_cb) {
         ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_COMMAND_BUFFERS);
diff --git a/ggml/src/ggml-metal/ggml-metal-device.h b/ggml/src/ggml-metal/ggml-metal-device.h
index ca499d8195..df61be8a1c 100644
--- a/ggml/src/ggml-metal/ggml-metal-device.h
+++ b/ggml/src/ggml-metal/ggml-metal-device.h
@@ -226,6 +226,11 @@ struct ggml_metal_device_props {
     int op_offload_min_batch_size;
 };
 
+typedef struct ggml_metal_event * ggml_metal_event_t;
+
+void ggml_metal_event_encode_signal(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf);
+void ggml_metal_event_encode_wait  (ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf);
+
 ggml_metal_device_t ggml_metal_device_init(int device);
 void                ggml_metal_device_free(ggml_metal_device_t dev);
 
@@ -241,6 +246,10 @@ void ggml_metal_device_rsets_rm (ggml_metal_device_t dev, ggml_metal_rset_t rset
 
 void ggml_metal_device_rsets_keep_alive(ggml_metal_device_t dev);
 
+ggml_metal_event_t ggml_metal_device_event_new(ggml_metal_device_t dev);
+void ggml_metal_device_event_free(ggml_metal_device_t dev, ggml_metal_event_t ev);
+void ggml_metal_device_event_synchronize(ggml_metal_device_t dev, ggml_metal_event_t ev);
+
 void ggml_metal_device_get_memory(ggml_metal_device_t dev, size_t * free, size_t * total);
 
 bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_tensor * op);
diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m
index 8f74fe4427..38f8c74261 100644
--- a/ggml/src/ggml-metal/ggml-metal-device.m
+++ b/ggml/src/ggml-metal/ggml-metal-device.m
@@ -926,6 +926,59 @@ void ggml_metal_device_rsets_keep_alive(ggml_metal_device_t dev) {
     atomic_store_explicit(&dev->rsets->d_loop, 2*dev->rsets->keep_alive_s, memory_order_relaxed);
 }
 
+struct ggml_metal_event {
+    void * obj; // id<MTLEvent>
+
+    atomic_int value;
+};
+
+void ggml_metal_event_encode_signal(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf_raw) {
+    id<MTLEvent> event = (id<MTLEvent>)ev->obj;
+
+    id<MTLCommandBuffer> cmd_buf = (id<MTLCommandBuffer>) cmd_buf_raw;
+
+    [cmd_buf encodeSignalEvent:event value:atomic_fetch_add_explicit(&ev->value, 1, memory_order_relaxed) + 1];
+}
+
+void ggml_metal_event_encode_wait(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf_raw) {
+    id<MTLEvent> event = (id<MTLEvent>)ev->obj;
+
+    id<MTLCommandBuffer> cmd_buf = (id<MTLCommandBuffer>) cmd_buf_raw;
+
+    [cmd_buf encodeWaitForEvent:event value:atomic_load_explicit(&ev->value, memory_order_relaxed)];
+}
+
+ggml_metal_event_t ggml_metal_device_event_new(ggml_metal_device_t dev) {
+    id<MTLEvent> event = [dev->mtl_device newEvent];
+
+    ggml_metal_event_t ev = calloc(1, sizeof(struct ggml_metal_event));
+
+    ev->obj   = (__bridge void *)event;
+    ev->value = 0;
+
+    return ev;
+}
+
+void ggml_metal_device_event_free(ggml_metal_device_t dev, ggml_metal_event_t ev) {
+    id<MTLEvent> event = ev->obj;
+    [event release];
+
+    free(ev);
+
+    GGML_UNUSED(dev);
+}
+
+void ggml_metal_device_event_synchronize(ggml_metal_device_t dev, ggml_metal_event_t ev) {
+    @autoreleasepool {
+        id<MTLEvent> event = ev->obj;
+
+        id<MTLCommandBuffer> cmd_buf = [dev->mtl_queue commandBuffer];
+        [cmd_buf encodeWaitForEvent:event value:atomic_load_explicit(&ev->value, memory_order_relaxed)];
+        [cmd_buf commit];
+        [cmd_buf waitUntilCompleted];
+    }
+}
+
 void ggml_metal_device_get_memory(ggml_metal_device_t dev, size_t * free, size_t * total) {
     if (@available(macOS 10.12, iOS 16.0, *)) {
         *total = dev->mtl_device.recommendedMaxWorkingSetSize;
diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
index ee139ddec5..e421a4264b 100644
--- a/ggml/src/ggml-metal/ggml-metal.cpp
+++ b/ggml/src/ggml-metal/ggml-metal.cpp
@@ -510,6 +510,20 @@ static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend,
     return ggml_metal_graph_compute(ctx, cgraph);
 }
 
+static void ggml_backend_metal_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
+    ggml_metal_t       ctx = (ggml_metal_t)backend->context;
+    ggml_metal_event_t ev  = (ggml_metal_event_t)event->context;
+
+    ggml_metal_event_record(ctx, ev);
+}
+
+static void ggml_backend_metal_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
+    ggml_metal_t       ctx = (ggml_metal_t)backend->context;
+    ggml_metal_event_t ev  = (ggml_metal_event_t)event->context;
+
+    ggml_metal_event_wait(ctx, ev);
+}
+
 static void ggml_backend_metal_graph_optimize(ggml_backend_t backend, ggml_cgraph * cgraph) {
     ggml_metal_t ctx = (ggml_metal_t)backend->context;
 
@@ -536,12 +550,8 @@ static ggml_backend_i ggml_backend_metal_i = {
    /* .graph_plan_update   = */ NULL,
    /* .graph_plan_compute  = */ NULL,
    /* .graph_compute       = */ ggml_backend_metal_graph_compute,
-
-    // the events API is needed only for multi-GPU setups, so likely no need to implement it for Metal
-    // in any case, these docs seem relevant if we ever decide to implement it:
-    // https://developer.apple.com/documentation/metal/mtlcommandbuffer#Synchronizing-Passes-with-Events
-    /* .event_record        = */ NULL,
-    /* .event_wait          = */ NULL,
+    /* .event_record        = */ ggml_backend_metal_event_record,
+    /* .event_wait          = */ ggml_backend_metal_event_wait,
    /* .graph_optimize      = */ ggml_backend_metal_graph_optimize,
 };
 
@@ -638,10 +648,10 @@ static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, ggml_bac
     ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
 
     props->caps = {
-        /* .async                 = */ true,
-        /* .host_buffer           = */ false,
-        /* .buffer_from_host_ptr  = */ true,
-        /* .events                = */ false,
+        /* .async                = */ true,
+        /* .host_buffer          = */ false,
+        /* .buffer_from_host_ptr = */ true,
+        /* .events               = */ true,
     };
 }
 
@@ -723,6 +733,38 @@ static bool ggml_backend_metal_device_offload_op(ggml_backend_dev_t dev, const g
         get_op_batch_size(op) >= ggml_metal_device_get_props(ctx_dev)->op_offload_min_batch_size;
 }
 
+static ggml_backend_event_t ggml_backend_metal_device_event_new(ggml_backend_dev_t dev) {
+    ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context;
+
+    ggml_metal_event_t event = ggml_metal_device_event_new(ctx_dev);
+    GGML_ASSERT(event);
+
+    ggml_backend_event_t ev = new ggml_backend_event {
+        /* .device  = */ dev,
+        /* .context = */ event,
+    };
+
+    return ev;
+}
+
+static void ggml_backend_metal_device_event_free(ggml_backend_dev_t dev, ggml_backend_event_t event) {
+    ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context;
+
+    ggml_metal_event_t ev = (ggml_metal_event_t)event->context;
+
+    ggml_metal_device_event_free(ctx_dev, ev);
+
+    delete event;
+}
+
+static void ggml_backend_metal_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) {
+    ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context;
+
+    ggml_metal_event_t evt = (ggml_metal_event_t)event->context;
+
+    ggml_metal_device_event_synchronize(ctx_dev, evt);
+}
+
 static ggml_backend_device_i ggml_backend_metal_device_i = {
     /* .get_name             = */ ggml_backend_metal_device_get_name,
     /* .get_description      = */ ggml_backend_metal_device_get_description,
@@ -736,9 +778,9 @@ static ggml_backend_device_i ggml_backend_metal_device_i = {
     /* .supports_op          = */ ggml_backend_metal_device_supports_op,
     /* .supports_buft        = */ ggml_backend_metal_device_supports_buft,
     /* .offload_op           = */ ggml_backend_metal_device_offload_op,
-    /* .event_new            = */ NULL,
-    /* .event_free           = */ NULL,
-    /* .event_synchronize    = */ NULL,
+    /* .event_new            = */ ggml_backend_metal_device_event_new,
+    /* .event_free           = */ ggml_backend_metal_device_event_free,
+    /* .event_synchronize    = */ ggml_backend_metal_device_event_synchronize,
 };
 
 // backend registry
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 10b306a853..203852d0f1 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -317,6 +317,7 @@ llama_context::llama_context(
         auto dev_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get()));
         if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) {
             // ignore CPU backend
+            // TODO: should we ignore ACCEL types too?
             continue;
         }
         auto * dev = ggml_backend_get_device(backend.get());
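For reference, here is a rough sketch of how the events enabled by this patch could be driven through the public ggml-backend API. The ggml_backend_event_* and graph-compute calls are the existing entry points declared in ggml-backend.h; the split_compute() helper and the two-graph split are hypothetical, and the consumer backend must itself implement event_wait for the cross-backend wait to stay asynchronous.

#include "ggml-backend.h"

// hypothetical helper: run gf_metal on the Metal backend and let another
// backend consume its results without blocking the host in between
static void split_compute(ggml_backend_t backend_metal, ggml_backend_t backend_other,
                          struct ggml_cgraph * gf_metal, struct ggml_cgraph * gf_other) {
    // the event is created on the device of the producer backend
    ggml_backend_event_t ev = ggml_backend_event_new(ggml_backend_get_device(backend_metal));

    // submit the Metal graph asynchronously and record the event behind it
    ggml_backend_graph_compute_async(backend_metal, gf_metal);
    ggml_backend_event_record(ev, backend_metal);

    // the consumer backend waits on the device timeline, not on the host
    ggml_backend_event_wait(backend_other, ev);
    ggml_backend_graph_compute_async(backend_other, gf_other);

    // block the host until the recorded event has been signaled, then finish up
    ggml_backend_event_synchronize(ev);
    ggml_backend_synchronize(backend_other);

    ggml_backend_event_free(ev);
}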