From 968929528c6a05e10249366fbe5f0330ad9af678 Mon Sep 17 00:00:00 2001
From: Beinsezii <39478211+Beinsezii@users.noreply.github.com>
Date: Tue, 6 Jan 2026 07:26:07 -0800
Subject: [PATCH] mmq.cu: tune mmq/rocblas switching for RDNA (#18537)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Patch perf regression for mmq kernels in ROCm

  Recovers the performance regression reported in
  https://github.com/ggml-org/llama.cpp/issues/17917

* add n_experts branch like the cdna path

* mmq.cu: tune mmq/wmma switching for RDNA

* mmq.cu: move amd wmma mmq/wmma switching behind IS_RDNA3

* Update ggml/src/ggml-cuda/mmq.cu

Co-authored-by: Johannes Gäßler

---------

Co-authored-by: Jiacheng (Jason) Chen <76919340+jiachengjason@users.noreply.github.com>
Co-authored-by: jiachengjason
Co-authored-by: Johannes Gäßler
---
 ggml/src/ggml-cuda/mmq.cu | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
index 85692d4543..ceb95758d2 100644
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -333,6 +333,28 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
     }
 
     if (amd_wmma_available(cc)) {
+        // RDNA4 is consistently worse on rocBLAS, so only RDNA3 is special-cased here:
+        // https://github.com/ggml-org/llama.cpp/pull/18537#issuecomment-3706422301
+        if (GGML_CUDA_CC_IS_RDNA3(cc)) {
+            // High expert counts are almost always better on MMQ
+            // due to the large number of graph splits:
+            // https://github.com/ggml-org/llama.cpp/pull/18202
+            if (n_experts >= 64) {
+                return true;
+            }
+
+            switch (type) {
+                // These quants are really bad on MMQ:
+                case GGML_TYPE_Q2_K:
+                case GGML_TYPE_Q6_K:
+                // These quants are usually, but not always, worse on MMQ:
+                case GGML_TYPE_IQ2_XS:
+                case GGML_TYPE_IQ2_S:
+                    return ne11 <= 128;
+                default:
+                    return true;
+            }
+        }
         return true;
     }
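
Note: for readers who want to experiment with the thresholds outside the tree, below is a minimal standalone sketch of the RDNA3 branch added by this patch. The enum values, the helper name should_use_mmq_rdna3, and the main() driver are hypothetical stand-ins for illustration only; the real decision is made by ggml_cuda_should_use_mmq() in ggml/src/ggml-cuda/mmq.cu, which also checks compute capability and other backends.

    // Standalone sketch (plain C++) of the RDNA3 MMQ-vs-rocBLAS heuristic.
    // All names here are illustrative, not the actual ggml API.
    #include <cstdint>
    #include <cstdio>

    enum ggml_type_sketch { Q2_K, Q6_K, IQ2_XS, IQ2_S, Q4_0 /* ... */ };

    // Returns true when the quantized MMQ kernel is expected to beat rocBLAS
    // on RDNA3 for the given quant type, batch size (ne11), and expert count.
    static bool should_use_mmq_rdna3(ggml_type_sketch type, int64_t ne11, int64_t n_experts) {
        // Many experts -> many graph splits -> MMQ wins almost always.
        if (n_experts >= 64) {
            return true;
        }
        switch (type) {
            case Q2_K:   // consistently slow on MMQ
            case Q6_K:
            case IQ2_XS: // usually, but not always, slower on MMQ
            case IQ2_S:
                // Use MMQ only for small batches; hand large ones to rocBLAS.
                return ne11 <= 128;
            default:
                return true;
        }
    }

    int main() {
        // A Q6_K GEMM with a 512-token batch and no experts -> rocBLAS.
        std::printf("%d\n", (int)should_use_mmq_rdna3(Q6_K, 512, 0));  // prints 0
        // The same shape on a 64-expert MoE -> MMQ.
        std::printf("%d\n", (int)should_use_mmq_rdna3(Q6_K, 512, 64)); // prints 1
        return 0;
    }

The design mirrors the patch: the expert-count check runs first because graph-split overhead dominates for MoE models regardless of quant type, and the ne11 <= 128 cutoff only applies to the handful of quant formats whose MMQ kernels lose to rocBLAS at large batch sizes.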