From a65750ffb6a1cbd8fb91dd500bd29ca1cdab046b Mon Sep 17 00:00:00 2001 From: hogeheer499 Date: Thu, 12 Mar 2026 23:09:41 +0100 Subject: [PATCH 1/3] ggml-cuda: fix UMA memory detection for HIP/ROCm on AMD APUs AMD APUs report prop.integrated=1 which triggers the UMA memory path from #17368. This overrides hipMemGetInfo() (accurate) with /proc/meminfo MemAvailable (too low), losing ~30 GiB on a 128GB Strix Halo system. For HIP builds, only enter the UMA path when GGML_CUDA_ENABLE_UNIFIED_MEMORY is explicitly set. This preserves correct behavior for both cases: - Default: hipMemGetInfo() reports accurate TTM-backed memory - GGML_CUDA_ENABLE_UNIFIED_MEMORY=1: /proc/meminfo is used (system RAM mode) Tested on AMD Ryzen AI MAX+ 395, Radeon 8060S (gfx1151), 128GB, ROCm 7.1. Fixes: ggml-org#18159 --- ggml/src/ggml-cuda/ggml-cuda.cu | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index a31e843e15..0ed004f24a 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -4607,7 +4607,14 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * // Check if UMA is explicitly enabled via environment variable bool uma_env = getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr; + +#if defined(GGML_USE_HIP) + // On AMD APUs, prop.integrated is true but hipMemGetInfo() already returns + // the correct TTM-backed memory. Only use the UMA path when explicitly requested. 
+ bool is_uma = uma_env; +#else bool is_uma = prop.integrated > 0 || uma_env; +#endif if (is_uma) { // For UMA systems (like DGX Spark), use system memory info From 40b234d6b4f723bd808d19e63de78b154207bb37 Mon Sep 17 00:00:00 2001 From: hogeheer499-commits Date: Fri, 13 Mar 2026 17:37:40 +0100 Subject: [PATCH 2/3] Update ggml/src/ggml-cuda/ggml-cuda.cu MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Johannes Gäßler --- ggml/src/ggml-cuda/ggml-cuda.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 0ed004f24a..335d695f38 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -4614,7 +4614,7 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * bool is_uma = uma_env; #else bool is_uma = prop.integrated > 0 || uma_env; -#endif +#endif // defined(GGML_USE_HIP) if (is_uma) { // For UMA systems (like DGX Spark), use system memory info From 97ae46e4601d474b9fb6d9d262ee74a3ea644775 Mon Sep 17 00:00:00 2001 From: hogeheer499-commits Date: Thu, 19 Mar 2026 20:31:52 +0100 Subject: [PATCH 3/3] Use max of hipMemGetInfo and /proc/meminfo for UMA free memory Instead of always overwriting with /proc/meminfo, take whichever value is higher. This way, systems where hipMemGetInfo already reports TTM-backed memory correctly (like Strix Halo 128GB) keep their value, while systems where /proc/meminfo reports more still get the full amount. Removes the HIP-specific #ifdef since the max approach works for both CUDA and HIP.
--- ggml/src/ggml-cuda/ggml-cuda.cu | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 335d695f38..a94f445777 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -4608,21 +4608,19 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * // Check if UMA is explicitly enabled via environment variable bool uma_env = getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr; -#if defined(GGML_USE_HIP) - // On AMD APUs, prop.integrated is true but hipMemGetInfo() already returns - // the correct TTM-backed memory. Only use the UMA path when explicitly requested. - bool is_uma = uma_env; -#else bool is_uma = prop.integrated > 0 || uma_env; -#endif // defined(GGML_USE_HIP) if (is_uma) { - // For UMA systems (like DGX Spark), use system memory info long available_memory_kb = 0; long free_swap_kb = 0; if (ggml_backend_cuda_get_available_uma_memory(&available_memory_kb, &free_swap_kb) && available_memory_kb > 0) { + // Use whichever value is higher: on some AMD APUs hipMemGetInfo already + // accounts for TTM-backed memory and returns more than /proc/meminfo + size_t proc_free = (size_t)available_memory_kb * 1024; + if (proc_free > *free) { + *free = proc_free; + } } else { GGML_LOG_ERROR("%s: /proc/meminfo reading failed, using cudaMemGetInfo\n", __func__); }