From 1a5631beaa2bfb9f76f177a8dbe5544dd6c2d528 Mon Sep 17 00:00:00 2001 From: Julian Pscheid Date: Mon, 9 Mar 2026 23:32:24 -0700 Subject: [PATCH] metal: handle command buffer failures gracefully in synchronize (#20306) Replace GGML_ABORT("fatal error") in ggml_metal_synchronize() with error flag + return. This aligns synchronize error handling with graph_compute, which already returns GGML_STATUS_FAILED for the same condition. When a command buffer fails (e.g., iOS GPU access revocation during backgrounding, macOS eGPU disconnect, OOM), the backend enters an error state instead of killing the host process. Subsequent graph_compute calls return GGML_STATUS_FAILED immediately. Recovery requires recreating the backend. Failed extra command buffers are properly released on the error path to avoid Metal object leaks. --- ggml/src/ggml-metal/ggml-metal-context.m | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal-context.m b/ggml/src/ggml-metal/ggml-metal-context.m index 5d3a8ce412..1136ce99b0 100644 --- a/ggml/src/ggml-metal/ggml-metal-context.m +++ b/ggml/src/ggml-metal/ggml-metal-context.m @@ -75,6 +75,10 @@ struct ggml_metal { // abort ggml_metal_graph_compute if callback returns true ggml_abort_callback abort_callback; void * abort_callback_data; + + // error state - set when a command buffer fails during synchronize + // once set, graph_compute will return GGML_STATUS_FAILED until the backend is recreated + bool has_error; }; ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) { @@ -158,6 +162,8 @@ ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) { res->capture_started = false; res->capture_scope = nil; + res->has_error = false; + res->gf = nil; res->encode_async = nil; for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) { @@ -246,7 +252,8 @@ void ggml_metal_synchronize(ggml_metal_t ctx) { if (status == MTLCommandBufferStatusError) { GGML_LOG_ERROR("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]); } - GGML_ABORT("fatal error"); + ctx->has_error = true; + return; } } } @@ -262,7 +269,15 @@ void ggml_metal_synchronize(ggml_metal_t ctx) { if (status == MTLCommandBufferStatusError) { GGML_LOG_ERROR("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]); } - GGML_ABORT("fatal error"); + + // release this and all remaining command buffers before returning + for (size_t j = i; j < ctx->cmd_bufs_ext.count; ++j) { + [ctx->cmd_bufs_ext[j] release]; + } + [ctx->cmd_bufs_ext removeAllObjects]; + + ctx->has_error = true; + return; } [cmd_buf release]; @@ -414,6 +429,11 @@ bool ggml_metal_cpy_tensor_async(ggml_metal_t ctx_src, ggml_metal_t ctx_dst, con } enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph * gf) { + if (ctx->has_error) { + GGML_LOG_ERROR("%s: backend is in error state from a previous command buffer failure - recreate the backend to recover\n", __func__); + return GGML_STATUS_FAILED; + } + // number of nodes encoded by the main thread (empirically determined) const int n_main = MAX(64, 0.1*gf->n_nodes);