From 51e3d3855e293089a728057c54ce5682f6ba853c Mon Sep 17 00:00:00 2001
From: "Jiacheng (Jason) Chen" <76919340+jiachengjason@users.noreply.github.com>
Date: Sun, 28 Dec 2025 18:38:25 -0500
Subject: [PATCH 1/5] Patch perf regression for mmq kernels in ROCm

recover performance regression for
https://github.com/ggml-org/llama.cpp/issues/17917
---
 ggml/src/ggml-cuda/mmq.cu | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
index 85692d4543..1a29797148 100644
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -333,7 +333,13 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
     }
 
     if (amd_wmma_available(cc)) {
-        return true;
+        if (ne11 <= 128 || type == GGML_TYPE_Q4_0 || type == GGML_TYPE_Q4_1 || type == GGML_TYPE_Q5_0 || type == GGML_TYPE_Q5_1) {
+            return true;
+        }
+        if (ne11 <= 256 && (type == GGML_TYPE_Q4_K || type == GGML_TYPE_Q5_K)) {
+            return true;
+        }
+        return false;
     }
 
     return (!GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;

From 69a9a68bf748eecb61b0baad02903d3ec6067ac9 Mon Sep 17 00:00:00 2001
From: jiachengjason
Date: Mon, 29 Dec 2025 11:26:15 -0500
Subject: [PATCH 2/5] add n_experts branch like the cdna path

---
 ggml/src/ggml-cuda/mmq.cu | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
index 1a29797148..0a0d440a18 100644
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -333,7 +333,10 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
     }
 
     if (amd_wmma_available(cc)) {
-        if (ne11 <= 128 || type == GGML_TYPE_Q4_0 || type == GGML_TYPE_Q4_1 || type == GGML_TYPE_Q5_0 || type == GGML_TYPE_Q5_1) {
+        if (n_experts > 64 || ne11 <= 128) {
+            return true;
+        }
+        if (type == GGML_TYPE_Q4_0 || type == GGML_TYPE_Q4_1 || type == GGML_TYPE_Q5_0 || type == GGML_TYPE_Q5_1) {
             return true;
         }
         if (ne11 <= 256 && (type == GGML_TYPE_Q4_K || type == GGML_TYPE_Q5_K)) {

From a435c7725b17fffe0045c2d4b7e38fe9f42d4070 Mon Sep 17 00:00:00 2001
From: Beinsezii
Date: Thu, 1 Jan 2026 21:50:51 -0800
Subject: [PATCH 3/5] mmq.cu: tune mmq/wmma switching for RDNA

---
 ggml/src/ggml-cuda/mmq.cu | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
index 0a0d440a18..ccb9ebed57 100644
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -333,16 +333,24 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
     }
 
     if (amd_wmma_available(cc)) {
-        if (n_experts > 64 || ne11 <= 128) {
+        // High expert counts almost always better on MMQ
+        // due to a large amount of graph splits
+        // https://github.com/ggml-org/llama.cpp/pull/18202
+        if (n_experts >= 64) {
             return true;
         }
-        if (type == GGML_TYPE_Q4_0 || type == GGML_TYPE_Q4_1 || type == GGML_TYPE_Q5_0 || type == GGML_TYPE_Q5_1) {
-            return true;
+
+        switch (type) {
+            // These quants are really bad on MMQ
+            case GGML_TYPE_Q2_K:
+            case GGML_TYPE_Q6_K:
+            // These quants are usually worse but not always
+            case GGML_TYPE_IQ2_XS:
+            case GGML_TYPE_IQ2_S:
+                return ne11 <= 128;
+            default:
+                return true;
         }
-        if (ne11 <= 256 && (type == GGML_TYPE_Q4_K || type == GGML_TYPE_Q5_K)) {
-            return true;
-        }
-        return false;
     }
 
     return (!GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;

From 3326fa2387e58b4650df288dd0bf6fee12e01205 Mon Sep 17 00:00:00 2001
From: Beinsezii
Date: Fri, 2 Jan 2026 15:05:30 -0800
Subject: [PATCH 4/5] mmq.cu: move amd wmma mmq/wmma switching behind IS_RDNA3

---
 ggml/src/ggml-cuda/mmq.cu | 38 ++++++++++++++++++++++----------------
 1 file changed, 22 insertions(+), 16 deletions(-)

diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
index ccb9ebed57..99d1a7b833 100644
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -333,23 +333,29 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
     }
 
     if (amd_wmma_available(cc)) {
-        // High expert counts almost always better on MMQ
-        // due to a large amount of graph splits
-        // https://github.com/ggml-org/llama.cpp/pull/18202
-        if (n_experts >= 64) {
-            return true;
-        }
-
-        switch (type) {
-            // These quants are really bad on MMQ
-            case GGML_TYPE_Q2_K:
-            case GGML_TYPE_Q6_K:
-            // These quants are usually worse but not always
-            case GGML_TYPE_IQ2_XS:
-            case GGML_TYPE_IQ2_S:
-                return ne11 <= 128;
-            default:
+        // RDNA 4 is consistently worse on rocblas
+        // https://github.com/ggml-org/llama.cpp/pull/18537#issuecomment-3706422301
+        if (GGML_CUDA_CC_IS_RDNA3(cc)) {
+            // High expert counts almost always better on MMQ
+            // due to a large amount of graph splits
+            // https://github.com/ggml-org/llama.cpp/pull/18202
+            if (n_experts >= 64) {
                 return true;
+            }
+
+            switch (type) {
+                // These quants are really bad on MMQ
+                case GGML_TYPE_Q2_K:
+                case GGML_TYPE_Q6_K:
+                // These quants are usually worse but not always
+                case GGML_TYPE_IQ2_XS:
+                case GGML_TYPE_IQ2_S:
+                    return ne11 <= 128;
+                default:
+                    return true;
+            }
+        } else {
+            return true;
         }
     }
 

From 3fef966d51e975a526bbe8079b092b0e1860bf9b Mon Sep 17 00:00:00 2001
From: Beinsezii <39478211+Beinsezii@users.noreply.github.com>
Date: Fri, 2 Jan 2026 15:57:43 -0800
Subject: [PATCH 5/5] Update ggml/src/ggml-cuda/mmq.cu
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Johannes Gäßler
---
 ggml/src/ggml-cuda/mmq.cu | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
index 99d1a7b833..ceb95758d2 100644
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -354,9 +354,8 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
             default:
                 return true;
             }
-        } else {
-            return true;
         }
+        return true;
     }
 
     return (!GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
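
For reference, below is the net state of the amd_wmma_available() branch of
ggml_cuda_should_use_mmq() after applying patches 1-5 in order. This is a
reconstruction from the hunks above for readers following the series, not an
extra patch; the function's full parameter list is truncated in the hunk
headers and is likewise left out here.

    if (amd_wmma_available(cc)) {
        // RDNA 4 is consistently worse on rocblas
        // https://github.com/ggml-org/llama.cpp/pull/18537#issuecomment-3706422301
        if (GGML_CUDA_CC_IS_RDNA3(cc)) {
            // High expert counts almost always better on MMQ
            // due to a large amount of graph splits
            // https://github.com/ggml-org/llama.cpp/pull/18202
            if (n_experts >= 64) {
                return true;
            }

            switch (type) {
                // These quants are really bad on MMQ
                case GGML_TYPE_Q2_K:
                case GGML_TYPE_Q6_K:
                // These quants are usually worse but not always
                case GGML_TYPE_IQ2_XS:
                case GGML_TYPE_IQ2_S:
                    return ne11 <= 128;
                default:
                    return true;
            }
        }
        return true;
    }

In short: AMD WMMA-capable GPUs other than RDNA3 (notably RDNA4) always take
the MMQ path, while RDNA3 takes the non-MMQ (rocblas) path only for Q2_K,
Q6_K, IQ2_XS and IQ2_S at batch sizes ne11 > 128 when fewer than 64 experts
are in play.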