metal: handle command buffer failures gracefully in synchronize (#20306)
Replace the GGML_ABORT("fatal error") calls in ggml_metal_synchronize() with
an error flag and an early return. This aligns synchronize's error handling
with graph_compute, which already returns GGML_STATUS_FAILED for the same
condition.
When a command buffer fails (e.g., iOS GPU access revocation during
backgrounding, macOS eGPU disconnect, OOM), the backend enters an
error state instead of killing the host process. Subsequent
graph_compute calls return GGML_STATUS_FAILED immediately. Recovery
requires recreating the backend.
On the error path, the failed extra command buffer and all remaining extra
command buffers are released before returning, to avoid leaking Metal objects.
This commit is contained in:
parent
1dab5f5a44
commit
1a5631beaa
|
|
@ -75,6 +75,10 @@ struct ggml_metal {
|
||||||
// abort ggml_metal_graph_compute if callback returns true
|
// abort ggml_metal_graph_compute if callback returns true
|
||||||
ggml_abort_callback abort_callback;
|
ggml_abort_callback abort_callback;
|
||||||
void * abort_callback_data;
|
void * abort_callback_data;
|
||||||
|
|
||||||
|
// error state - set when a command buffer fails during synchronize
|
||||||
|
// once set, graph_compute will return GGML_STATUS_FAILED until the backend is recreated
|
||||||
|
bool has_error;
|
||||||
};
|
};
|
||||||
|
|
||||||
ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) {
|
ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) {
|
||||||
|
|
@ -158,6 +162,8 @@ ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) {
|
||||||
res->capture_started = false;
|
res->capture_started = false;
|
||||||
res->capture_scope = nil;
|
res->capture_scope = nil;
|
||||||
|
|
||||||
|
res->has_error = false;
|
||||||
|
|
||||||
res->gf = nil;
|
res->gf = nil;
|
||||||
res->encode_async = nil;
|
res->encode_async = nil;
|
||||||
for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) {
|
for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) {
|
||||||
|
|
@ -246,7 +252,8 @@ void ggml_metal_synchronize(ggml_metal_t ctx) {
|
||||||
if (status == MTLCommandBufferStatusError) {
|
if (status == MTLCommandBufferStatusError) {
|
||||||
GGML_LOG_ERROR("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]);
|
GGML_LOG_ERROR("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]);
|
||||||
}
|
}
|
||||||
GGML_ABORT("fatal error");
|
ctx->has_error = true;
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -262,7 +269,15 @@ void ggml_metal_synchronize(ggml_metal_t ctx) {
|
||||||
if (status == MTLCommandBufferStatusError) {
|
if (status == MTLCommandBufferStatusError) {
|
||||||
GGML_LOG_ERROR("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]);
|
GGML_LOG_ERROR("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]);
|
||||||
}
|
}
|
||||||
GGML_ABORT("fatal error");
|
|
||||||
|
// release this and all remaining command buffers before returning
|
||||||
|
for (size_t j = i; j < ctx->cmd_bufs_ext.count; ++j) {
|
||||||
|
[ctx->cmd_bufs_ext[j] release];
|
||||||
|
}
|
||||||
|
[ctx->cmd_bufs_ext removeAllObjects];
|
||||||
|
|
||||||
|
ctx->has_error = true;
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
[cmd_buf release];
|
[cmd_buf release];
|
||||||
|
|
@ -414,6 +429,11 @@ bool ggml_metal_cpy_tensor_async(ggml_metal_t ctx_src, ggml_metal_t ctx_dst, con
|
||||||
}
|
}
|
||||||
|
|
||||||
enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph * gf) {
|
enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph * gf) {
|
||||||
|
if (ctx->has_error) {
|
||||||
|
GGML_LOG_ERROR("%s: backend is in error state from a previous command buffer failure - recreate the backend to recover\n", __func__);
|
||||||
|
return GGML_STATUS_FAILED;
|
||||||
|
}
|
||||||
|
|
||||||
// number of nodes encoded by the main thread (empirically determined)
|
// number of nodes encoded by the main thread (empirically determined)
|
||||||
const int n_main = MAX(64, 0.1*gf->n_nodes);
|
const int n_main = MAX(64, 0.1*gf->n_nodes);
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue