From a65750ffb6a1cbd8fb91dd500bd29ca1cdab046b Mon Sep 17 00:00:00 2001 From: hogeheer499 Date: Thu, 12 Mar 2026 23:09:41 +0100 Subject: [PATCH 1/3] ggml-cuda: fix UMA memory detection for HIP/ROCm on AMD APUs AMD APUs report prop.integrated=1 which triggers the UMA memory path from #17368. This overrides hipMemGetInfo() (accurate) with /proc/meminfo MemAvailable (too low), losing ~30 GiB on a 128GB Strix Halo system. For HIP builds, only enter the UMA path when GGML_CUDA_ENABLE_UNIFIED_MEMORY is explicitly set. This preserves correct behavior for both cases: - Default: hipMemGetInfo() reports accurate TTM-backed memory - GGML_CUDA_ENABLE_UNIFIED_MEMORY=1: /proc/meminfo is used (system RAM mode) Tested on AMD Ryzen AI MAX+ 395, Radeon 8060S (gfx1151), 128GB, ROCm 7.1. Fixes: ggml-org#18159 --- ggml/src/ggml-cuda/ggml-cuda.cu | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index a31e843e15..0ed004f24a 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -4607,7 +4607,14 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * // Check if UMA is explicitly enabled via environment variable bool uma_env = getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr; + +#if defined(GGML_USE_HIP) + // On AMD APUs, prop.integrated is true but hipMemGetInfo() already returns + // the correct TTM-backed memory. Only use the UMA path when explicitly requested. 
+ bool is_uma = uma_env; +#else bool is_uma = prop.integrated > 0 || uma_env; +#endif if (is_uma) { // For UMA systems (like DGX Spark), use system memory info From 40b234d6b4f723bd808d19e63de78b154207bb37 Mon Sep 17 00:00:00 2001 From: hogeheer499-commits Date: Fri, 13 Mar 2026 17:37:40 +0100 Subject: [PATCH 2/3] Update ggml/src/ggml-cuda/ggml-cuda.cu MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Johannes Gäßler --- ggml/src/ggml-cuda/ggml-cuda.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 0ed004f24a..335d695f38 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -4614,7 +4614,7 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * bool is_uma = uma_env; #else bool is_uma = prop.integrated > 0 || uma_env; -#endif +#endif // defined(GGML_USE_HIP) if (is_uma) { // For UMA systems (like DGX Spark), use system memory info From 97ae46e4601d474b9fb6d9d262ee74a3ea644775 Mon Sep 17 00:00:00 2001 From: hogeheer499-commits Date: Thu, 19 Mar 2026 20:31:52 +0100 Subject: [PATCH 3/3] Use max of hipMemGetInfo and /proc/meminfo for UMA free memory Instead of always overwriting with /proc/meminfo, take whichever value is higher. This way, systems where hipMemGetInfo already reports TTM-backed memory correctly (like Strix Halo 128GB) keep their value, while systems where /proc/meminfo reports more still get the full amount. Removes the HIP-specific #ifdef since the max approach works for both CUDA and HIP.
--- ggml/src/ggml-cuda/ggml-cuda.cu | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 335d695f38..a94f445777 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -4608,21 +4608,19 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * // Check if UMA is explicitly enabled via environment variable bool uma_env = getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr; -#if defined(GGML_USE_HIP) - // On AMD APUs, prop.integrated is true but hipMemGetInfo() already returns - // the correct TTM-backed memory. Only use the UMA path when explicitly requested. - bool is_uma = uma_env; -#else bool is_uma = prop.integrated > 0 || uma_env; -#endif // defined(GGML_USE_HIP) if (is_uma) { - // For UMA systems (like DGX Spark), use system memory info long available_memory_kb = 0; long free_swap_kb = 0; if (ggml_backend_cuda_get_available_uma_memory(&available_memory_kb, &free_swap_kb) && available_memory_kb > 0) { + // Use whichever value is higher: on some AMD APUs hipMemGetInfo already + // accounts for TTM-backed memory and returns more than /proc/meminfo + size_t proc_free = (size_t)available_memory_kb * 1024; + if (proc_free > *free) { + *free = proc_free; + } } else { GGML_LOG_ERROR("%s: /proc/meminfo reading failed, using cudaMemGetInfo\n", __func__); }