mmq.cu: tune mmq/wmma switching for RDNA

2026-01-01 21:50:51 -08:00 · 2026-01-01 21:50:51 -08:00 · a435c7725b
parent 69a9a68bf7
commit a435c7725b
1 changed files with 15 additions and 7 deletions
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@ -333,16 +333,24 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
    }

    if (amd_wmma_available(cc)) {
-        if (n_experts > 64 || ne11 <= 128) {
+        // High expert counts almost always better on MMQ
+        // due to a large amount of graph splits
+        // https://github.com/ggml-org/llama.cpp/pull/18202
+        if (n_experts >= 64) {
            return true;
        }
-        if (type == GGML_TYPE_Q4_0 || type == GGML_TYPE_Q4_1 || type == GGML_TYPE_Q5_0 || type == GGML_TYPE_Q5_1) {
-            return true;
+
+        switch (type) {
+            // These quants are really bad on MMQ
+            case GGML_TYPE_Q2_K:
+            case GGML_TYPE_Q6_K:
+            // These quants are usually worse but not always
+            case GGML_TYPE_IQ2_XS:
+            case GGML_TYPE_IQ2_S:
+                return ne11 <= 128;
+            default:
+                return true;
        }
-        if (ne11 <= 256 && (type == GGML_TYPE_Q4_K || type == GGML_TYPE_Q5_K)) {
-            return true;
-        }
-        return false;
    }

    return (!GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;