From 5d9f64c54e7f85aeaebd4e75b5b10810e2472993 Mon Sep 17 00:00:00 2001 From: uaruss Date: Sun, 29 Mar 2026 23:31:27 -0400 Subject: [PATCH] ggml-cuda: fix ROCm multi-GPU illegal memory access in recurrent state restore Remove early-return optimization in ggml_cuda_set_device() that caused hipErrorIllegalAddress on ROCm multi-GPU setups with hybrid recurrent models (Mamba/SSM architectures). On ROCm, hipGetDevice() can return an unexpected value on threads that have never explicitly called hipSetDevice(). If this value matches ctx->device, the early-return fires and hipSetDevice() is never called, causing the subsequent hipMemcpyAsync to fail with current device: -1. cudaSetDevice() with the already-active device is a near no-op in modern CUDA/ROCm drivers, so removing the optimization has negligible performance impact while eliminating this class of thread context bugs. Also add missing ggml_cuda_set_device() call in ggml_backend_cuda_set_tensor_async() for consistency with all other cudaMemcpyAsync call sites in this file. Fixes #21140 Tested on: 2x AMD Radeon AI Pro R9700 (gfx1201), ROCm 7.2.0 --- ggml/src/ggml-cuda/ggml-cuda.cu | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 9d2aacf4b2..f137ec7fb3 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -98,16 +98,9 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in GGML_ABORT(GGML_CUDA_NAME " error"); } -// this is faster on Windows -// probably because the Windows CUDA libraries forget to make this check before invoking the drivers +// always set device explicitly — early-return optimization is unsafe on ROCm multi-GPU +// with uninitialized thread contexts (see https://github.com/ggml-org/llama.cpp/issues/21140) void ggml_cuda_set_device(int device) { - int current_device; - CUDA_CHECK(cudaGetDevice(¤t_device)); - - if (device == current_device) { - return; - } - CUDA_CHECK(cudaSetDevice(device)); } @@ -2807,6 +2800,7 @@ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tens GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type"); + ggml_cuda_set_device(cuda_ctx->device); CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cuda_ctx->stream())); }