mmq.cu: move amd wmma mmq/wmma switching behind IS_RDNA3

2026-01-02 15:05:30 -08:00 · 2026-01-02 15:05:30 -08:00 · 3326fa2387
parent a435c7725b
commit 3326fa2387
1 changed file with 22 additions and 16 deletions
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -333,23 +333,29 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
    }

    if (amd_wmma_available(cc)) {
-        // High expert counts almost always better on MMQ
-        // due to a large amount of graph splits
-        // https://github.com/ggml-org/llama.cpp/pull/18202
-        if (n_experts >= 64) {
-            return true;
-        }
-
-        switch (type) {
-            // These quants are really bad on MMQ
-            case GGML_TYPE_Q2_K:
-            case GGML_TYPE_Q6_K:
-            // These quants are usually worse but not always
-            case GGML_TYPE_IQ2_XS:
-            case GGML_TYPE_IQ2_S:
-                return ne11 <= 128;
-            default:
+        // RDNA 4 is consistently worse on rocblas
+        // https://github.com/ggml-org/llama.cpp/pull/18537#issuecomment-3706422301
+        if (GGML_CUDA_CC_IS_RDNA3(cc)) {
+            // High expert counts almost always better on MMQ
+            // due to a large amount of graph splits
+            // https://github.com/ggml-org/llama.cpp/pull/18202
+            if (n_experts >= 64) {
                return true;
+            }
+
+            switch (type) {
+                // These quants are really bad on MMQ
+                case GGML_TYPE_Q2_K:
+                case GGML_TYPE_Q6_K:
+                // These quants are usually worse but not always
+                case GGML_TYPE_IQ2_XS:
+                case GGML_TYPE_IQ2_S:
+                    return ne11 <= 128;
+                default:
+                    return true;
+            }
+        } else {
+            return true;
        }
    }