diff --git a/ggml/src/ggml-metal/ggml-metal-context.m b/ggml/src/ggml-metal/ggml-metal-context.m
index 5d3a8ce412..0d0dadc3ff 100644
--- a/ggml/src/ggml-metal/ggml-metal-context.m
+++ b/ggml/src/ggml-metal/ggml-metal-context.m
@@ -129,8 +129,16 @@ ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) {
 
     res->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
 
-    res->use_fusion      = getenv("GGML_METAL_FUSION_DISABLE") == nil;
-    res->use_concurrency = getenv("GGML_METAL_CONCURRENCY_DISABLE") == nil;
+    res->use_fusion = getenv("GGML_METAL_FUSION_DISABLE") == nil;
+
+    // Auto-disable concurrent dispatch on non-Apple GPUs (AMD/Intel)
+    // MTLDispatchTypeConcurrent has broken memory barriers on these GPUs
+    bool is_apple_gpu = props_dev->supports_gpu_family_apple7;
+    res->use_concurrency = is_apple_gpu && (getenv("GGML_METAL_CONCURRENCY_DISABLE") == nil);
+
+    if (!is_apple_gpu && getenv("GGML_METAL_CONCURRENCY_DISABLE") == nil) {
+        GGML_LOG_INFO("%s: disabling concurrent dispatch (non-Apple GPU detected)\n", __func__);
+    }
 
     {
         const char * val = getenv("GGML_METAL_GRAPH_DEBUG");
diff --git a/ggml/src/ggml-metal/ggml-metal-device.h b/ggml/src/ggml-metal/ggml-metal-device.h
index 93d7f6a216..856761b818 100644
--- a/ggml/src/ggml-metal/ggml-metal-device.h
+++ b/ggml/src/ggml-metal/ggml-metal-device.h
@@ -226,6 +226,7 @@ struct ggml_metal_device_props {
     bool has_tensor;
     bool use_residency_sets;
     bool use_shared_buffers;
+    bool use_managed_buffers; // Use Managed mode for discrete GPUs
 
     bool supports_gpu_family_apple7;
 
diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m
index 3db7f12629..5d67b9a67e 100644
--- a/ggml/src/ggml-metal/ggml-metal-device.m
+++ b/ggml/src/ggml-metal/ggml-metal-device.m
@@ -787,6 +787,17 @@ ggml_metal_device_t ggml_metal_device_init(int device) {
         dev->props.use_shared_buffers = true;
     }
 
+    // Use Managed mode on discrete GPUs for cached PCIe reads
+    dev->props.use_managed_buffers = !dev->props.has_unified_memory;
+
+    // Environment variable overrides for testing
+    if (getenv("GGML_METAL_MANAGED_BUFFERS_DISABLE") != NULL) {
+        dev->props.use_managed_buffers = false;
+    }
+    if (getenv("GGML_METAL_MANAGED_BUFFERS_ENABLE") != NULL) {
+        dev->props.use_managed_buffers = true;
+    }
+
     dev->props.supports_gpu_family_apple7 = [dev->mtl_device supportsFamily:MTLGPUFamilyApple7];
 
     dev->props.op_offload_min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
@@ -849,6 +860,7 @@ ggml_metal_device_t ggml_metal_device_init(int device) {
     GGML_LOG_INFO("%s: has tensor          = %s\n", __func__, dev->props.has_tensor ? "true" : "false");
     GGML_LOG_INFO("%s: use residency sets  = %s\n", __func__, dev->props.use_residency_sets ? "true" : "false");
     GGML_LOG_INFO("%s: use shared buffers  = %s\n", __func__, dev->props.use_shared_buffers ? "true" : "false");
+    GGML_LOG_INFO("%s: use managed buffers = %s\n", __func__, dev->props.use_managed_buffers ? "true" : "false");
 
 #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
     if (@available(macOS 10.12, iOS 16.0, *)) {
@@ -1423,10 +1435,19 @@ ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size,
 
     if (size_aligned > 0) {
         if (props_dev->use_shared_buffers && shared) {
+            MTLResourceOptions storage_mode = props_dev->use_managed_buffers
+                ? MTLResourceStorageModeManaged
+                : MTLResourceStorageModeShared;
+
             res->buffers[0].metal = [res->dev->mtl_device newBufferWithBytesNoCopy:res->all_data
                                                                             length:size_aligned
-                                                                           options:MTLResourceStorageModeShared
+                                                                           options:storage_mode
                                                                        deallocator:nil];
+
+            // For Managed buffers, sync CPU→GPU after creation
+            if (props_dev->use_managed_buffers && res->buffers[0].metal) {
+                [(id)res->buffers[0].metal didModifyRange:NSMakeRange(0, size_aligned)];
+            }
         } else {
             res->buffers[0].metal = [res->dev->mtl_device newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate];
         }
@@ -1492,13 +1513,22 @@ ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, s
         res->buffers[res->n_buffers].metal = nil;
 
         if (size_aligned > 0) {
-            res->buffers[res->n_buffers].metal = [res->dev->mtl_device newBufferWithBytesNoCopy:ptr length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
+            MTLResourceOptions storage_mode = props_dev->use_managed_buffers
+                ? MTLResourceStorageModeManaged
+                : MTLResourceStorageModeShared;
+
+            res->buffers[res->n_buffers].metal = [res->dev->mtl_device newBufferWithBytesNoCopy:ptr length:size_aligned options:storage_mode deallocator:nil];
 
             if (res->buffers[res->n_buffers].metal == nil) {
                 GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
                 free(res);
                 return NULL;
             }
+
+            // For Managed buffers, sync CPU→GPU after creation
+            if (props_dev->use_managed_buffers) {
+                [(id)res->buffers[res->n_buffers].metal didModifyRange:NSMakeRange(0, size_aligned)];
+            }
         }
 
         ggml_metal_log_allocated_size(res->dev->mtl_device, size_aligned);
@@ -1519,13 +1549,22 @@ ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, s
             res->buffers[res->n_buffers].metal = nil;
 
             if (size_step_aligned > 0) {
-                res->buffers[res->n_buffers].metal = [res->dev->mtl_device newBufferWithBytesNoCopy:(void *) ((uint8_t *) ptr + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
+                MTLResourceOptions storage_mode = props_dev->use_managed_buffers
+                    ? MTLResourceStorageModeManaged
+                    : MTLResourceStorageModeShared;
+
+                res->buffers[res->n_buffers].metal = [res->dev->mtl_device newBufferWithBytesNoCopy:(void *) ((uint8_t *) ptr + i) length:size_step_aligned options:storage_mode deallocator:nil];
 
                 if (res->buffers[res->n_buffers].metal == nil) {
                     GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0);
                     free(res);
                     return NULL;
                 }
+
+                // For Managed buffers, sync CPU→GPU after creation
+                if (props_dev->use_managed_buffers) {
+                    [(id)res->buffers[res->n_buffers].metal didModifyRange:NSMakeRange(0, size_step_aligned)];
+                }
             }
 
             ggml_metal_log_allocated_size(res->dev->mtl_device, size_step_aligned);
@@ -1582,6 +1621,13 @@ bool ggml_metal_buffer_is_shared(ggml_metal_buffer_t buf) {
 
 void ggml_metal_buffer_memset_tensor(ggml_metal_buffer_t buf, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
     if (buf->is_shared) {
         memset((char *) tensor->data + offset, value, size);
+
+        // Sync Managed buffer after CPU write
+        if (buf->dev->props.use_managed_buffers) {
+            struct ggml_metal_buffer_id bid = ggml_metal_buffer_get_id(buf, tensor);
+            [(id)bid.metal didModifyRange:NSMakeRange(bid.offs + offset, size)];
+        }
+
         return;
     }
@@ -1610,6 +1656,13 @@ void ggml_metal_buffer_memset_tensor(ggml_metal_buffer_t buf, struct ggml_tensor
 
 void ggml_metal_buffer_set_tensor(ggml_metal_buffer_t buf, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     if (buf->is_shared) {
         memcpy((char *) tensor->data + offset, data, size);
+
+        // Sync Managed buffer after CPU write
+        if (buf->dev->props.use_managed_buffers) {
+            struct ggml_metal_buffer_id bid = ggml_metal_buffer_get_id(buf, tensor);
+            [(id)bid.metal didModifyRange:NSMakeRange(bid.offs + offset, size)];
+        }
+
         return;
     }
@@ -1646,9 +1699,6 @@ void ggml_metal_buffer_set_tensor(ggml_metal_buffer_t buf, struct ggml_tensor *
         }
 
         [cmd_buf addCompletedHandler:^(id<MTLCommandBuffer> cb) {
-            // TODO: can check for errors here
-            GGML_UNUSED(cb);
-
             dispatch_semaphore_signal(completion_semaphore);
         }];
 
@@ -1663,6 +1713,14 @@ void ggml_metal_buffer_set_tensor(ggml_metal_buffer_t buf, struct ggml_tensor *
 
 void ggml_metal_buffer_get_tensor(ggml_metal_buffer_t buf, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     if (buf->is_shared) {
+        // For Managed buffers, sync GPU→CPU then direct memcpy
+        if (buf->dev->props.use_managed_buffers) {
+            struct ggml_metal_buffer_id bid = ggml_metal_buffer_get_id(buf, tensor);
+            @autoreleasepool {
+                [(id)bid.metal synchronizeResource];
+            }
+        }
+
+        // Direct memcpy (Shared or Managed after sync)
         memcpy(data, (const char *) tensor->data + offset, size);
         return;
     }
@@ -1700,7 +1758,9 @@ void ggml_metal_buffer_get_tensor(ggml_metal_buffer_t buf, const struct ggml_ten
 }
 
 void ggml_metal_buffer_clear(ggml_metal_buffer_t buf, uint8_t value) {
-    if (buf->is_shared) {
+    // For Managed buffers, use GPU blit to avoid reading unsynced data
+    if (buf->is_shared && !buf->dev->props.use_managed_buffers) {
+        // True Shared mode (unified memory): direct memset OK
         memset(buf->all_data, value, buf->all_size);
         return;
     }