From 51e3d3855e293089a728057c54ce5682f6ba853c Mon Sep 17 00:00:00 2001
From: "Jiacheng (Jason) Chen" <76919340+jiachengjason@users.noreply.github.com>
Date: Sun, 28 Dec 2025 18:38:25 -0500
Subject: [PATCH 1/5] Patch perf regression for mmq kernels in ROCm

recover performance regression for
https://github.com/ggml-org/llama.cpp/issues/17917
---
 ggml/src/ggml-cuda/mmq.cu | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
index 85692d4543..1a29797148 100644
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -333,7 +333,13 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
     }
 
     if (amd_wmma_available(cc)) {
-        return true;
+        if (ne11 <= 128 || type == GGML_TYPE_Q4_0 || type == GGML_TYPE_Q4_1 || type == GGML_TYPE_Q5_0 || type == GGML_TYPE_Q5_1) {
+            return true;
+        }
+        if (ne11 <= 256 && (type == GGML_TYPE_Q4_K || type == GGML_TYPE_Q5_K)) {
+            return true;
+        }
+        return false;
     }
 
     return (!GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;

From 69a9a68bf748eecb61b0baad02903d3ec6067ac9 Mon Sep 17 00:00:00 2001
From: jiachengjason
Date: Mon, 29 Dec 2025 11:26:15 -0500
Subject: [PATCH 2/5] add n_experts branch like the cdna path

---
 ggml/src/ggml-cuda/mmq.cu | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
index 1a29797148..0a0d440a18 100644
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -333,7 +333,10 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
     }
 
     if (amd_wmma_available(cc)) {
-        if (ne11 <= 128 || type == GGML_TYPE_Q4_0 || type == GGML_TYPE_Q4_1 || type == GGML_TYPE_Q5_0 || type == GGML_TYPE_Q5_1) {
+        if (n_experts > 64 || ne11 <= 128) {
+            return true;
+        }
+        if (type == GGML_TYPE_Q4_0 || type == GGML_TYPE_Q4_1 || type == GGML_TYPE_Q5_0 || type == GGML_TYPE_Q5_1) {
             return true;
         }
         if (ne11 <= 256 && (type == GGML_TYPE_Q4_K || type == GGML_TYPE_Q5_K)) {

From a435c7725b17fffe0045c2d4b7e38fe9f42d4070 Mon Sep 17 00:00:00 2001
From: Beinsezii
Date: Thu, 1 Jan 2026 21:50:51 -0800
Subject: [PATCH 3/5] mmq.cu: tune mmq/wmma switching for RDNA

---
 ggml/src/ggml-cuda/mmq.cu | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
index 0a0d440a18..ccb9ebed57 100644
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -333,16 +333,24 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
     }
 
     if (amd_wmma_available(cc)) {
-        if (n_experts > 64 || ne11 <= 128) {
+        // High expert counts almost always better on MMQ
+        // due to a large amount of graph splits
+        // https://github.com/ggml-org/llama.cpp/pull/18202
+        if (n_experts >= 64) {
             return true;
         }
-        if (type == GGML_TYPE_Q4_0 || type == GGML_TYPE_Q4_1 || type == GGML_TYPE_Q5_0 || type == GGML_TYPE_Q5_1) {
-            return true;
+
+        switch (type) {
+            // These quants are really bad on MMQ
+            case GGML_TYPE_Q2_K:
+            case GGML_TYPE_Q6_K:
+            // These quants are usually worse but not always
+            case GGML_TYPE_IQ2_XS:
+            case GGML_TYPE_IQ2_S:
+                return ne11 <= 128;
+            default:
+                return true;
         }
-        if (ne11 <= 256 && (type == GGML_TYPE_Q4_K || type == GGML_TYPE_Q5_K)) {
-            return true;
-        }
-        return false;
     }
 
     return (!GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;

From 3326fa2387e58b4650df288dd0bf6fee12e01205 Mon Sep 17 00:00:00 2001
From: Beinsezii
Date: Fri, 2 Jan 2026 15:05:30 -0800
Subject: [PATCH 4/5] mmq.cu: move amd wmma mmq/wmma switching behind IS_RDNA3

---
 ggml/src/ggml-cuda/mmq.cu | 38 ++++++++++++++++++++++----------------
 1 file changed, 22 insertions(+), 16 deletions(-)

diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
index ccb9ebed57..99d1a7b833 100644
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -333,23 +333,29 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
     }
 
     if (amd_wmma_available(cc)) {
-        // High expert counts almost always better on MMQ
-        // due to a large amount of graph splits
-        // https://github.com/ggml-org/llama.cpp/pull/18202
-        if (n_experts >= 64) {
-            return true;
-        }
-
-        switch (type) {
-            // These quants are really bad on MMQ
-            case GGML_TYPE_Q2_K:
-            case GGML_TYPE_Q6_K:
-            // These quants are usually worse but not always
-            case GGML_TYPE_IQ2_XS:
-            case GGML_TYPE_IQ2_S:
-                return ne11 <= 128;
-            default:
+        // RDNA 4 is consistently worse on rocblas
+        // https://github.com/ggml-org/llama.cpp/pull/18537#issuecomment-3706422301
+        if (GGML_CUDA_CC_IS_RDNA3(cc)) {
+            // High expert counts almost always better on MMQ
+            // due to a large amount of graph splits
+            // https://github.com/ggml-org/llama.cpp/pull/18202
+            if (n_experts >= 64) {
                 return true;
+            }
+
+            switch (type) {
+                // These quants are really bad on MMQ
+                case GGML_TYPE_Q2_K:
+                case GGML_TYPE_Q6_K:
+                // These quants are usually worse but not always
+                case GGML_TYPE_IQ2_XS:
+                case GGML_TYPE_IQ2_S:
+                    return ne11 <= 128;
+                default:
+                    return true;
+            }
+        } else {
+            return true;
         }
     }
 

From 3fef966d51e975a526bbe8079b092b0e1860bf9b Mon Sep 17 00:00:00 2001
From: Beinsezii <39478211+Beinsezii@users.noreply.github.com>
Date: Fri, 2 Jan 2026 15:57:43 -0800
Subject: [PATCH 5/5] Update ggml/src/ggml-cuda/mmq.cu
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Johannes Gäßler
---
 ggml/src/ggml-cuda/mmq.cu | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
index 99d1a7b833..ceb95758d2 100644
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -354,9 +354,8 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
             default:
                 return true;
             }
-        } else {
-            return true;
         }
+        return true;
     }
 
     return (!GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
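
For reference, below is the net state of the amd_wmma_available() branch of
ggml_cuda_should_use_mmq() after applying patches 1-5 in order. This is a
reconstruction from the hunks above for readers following the series, not an
extra patch; the function's full parameter list is truncated in the hunk
headers and is likewise left out here.

    if (amd_wmma_available(cc)) {
        // RDNA 4 is consistently worse on rocblas
        // https://github.com/ggml-org/llama.cpp/pull/18537#issuecomment-3706422301
        if (GGML_CUDA_CC_IS_RDNA3(cc)) {
            // High expert counts almost always better on MMQ
            // due to a large amount of graph splits
            // https://github.com/ggml-org/llama.cpp/pull/18202
            if (n_experts >= 64) {
                return true;
            }

            switch (type) {
                // These quants are really bad on MMQ
                case GGML_TYPE_Q2_K:
                case GGML_TYPE_Q6_K:
                // These quants are usually worse but not always
                case GGML_TYPE_IQ2_XS:
                case GGML_TYPE_IQ2_S:
                    return ne11 <= 128;
                default:
                    return true;
            }
        }
        return true;
    }

In short: AMD WMMA-capable GPUs other than RDNA3 (notably RDNA4) always take
the MMQ path, while RDNA3 takes the non-MMQ (rocblas) path only for Q2_K,
Q6_K, IQ2_XS and IQ2_S at batch sizes ne11 > 128 when fewer than 64 experts
are in play.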