From d2ff4e23acd0724b44e0af72fd7e37fed4c1a6a6 Mon Sep 17 00:00:00 2001
From: Johannes Gäßler
Date: Sat, 10 Jan 2026 17:19:01 +0100
Subject: [PATCH] HIP: adjust RDNA3.5 MMQ kernel selection logic (#18666)

---
 ggml/src/ggml-cuda/mmq.cu | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
index c9aa7024a9..9a69f41d15 100644
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -333,28 +333,31 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
     }
 
     if (amd_wmma_available(cc)) {
-        // RDNA 4 is consistently worse on rocblas
-        // https://github.com/ggml-org/llama.cpp/pull/18537#issuecomment-3706422301
         if (GGML_CUDA_CC_IS_RDNA3(cc)) {
-            // High expert counts almost always better on MMQ
-            // due to a large amount of graph splits
+            // High expert counts are almost always better on MMQ due to
+            // the synchronization overhead in the cuBLAS/hipBLAS path:
             // https://github.com/ggml-org/llama.cpp/pull/18202
             if (n_experts >= 64) {
                 return true;
             }
 
+            // For some quantization types MMQ can have lower peak TOPS than hipBLAS
+            // so it's only faster for sufficiently small batch sizes:
             switch (type) {
-                // These quants are really bad on MMQ
                 case GGML_TYPE_Q2_K:
+                    return ne11 <= 128;
                 case GGML_TYPE_Q6_K:
-                // These quants are usually worse but not always
+                    return ne11 <= (GGML_CUDA_CC_IS_RDNA3_0(cc) ? 128 : 256);
                 case GGML_TYPE_IQ2_XS:
                 case GGML_TYPE_IQ2_S:
-                    return ne11 <= 128;
+                    return GGML_CUDA_CC_IS_RDNA3_5(cc) || ne11 <= 128;
                 default:
                     return true;
             }
         }
+
+        // For RDNA4 MMQ is consistently faster than dequantization + hipBLAS:
+        // https://github.com/ggml-org/llama.cpp/pull/18537#issuecomment-3706422301
         return true;
     }
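
Editor's note: the following is a minimal, self-contained sketch (not part of the patch) of the
selection heuristic the hunk above implements, given for illustration only. The quant_type enum,
the is_rdna3_5 flag, and the function name prefer_mmq_rdna3 are hypothetical stand-ins for ggml's
GGML_TYPE_* constants, the GGML_CUDA_CC_IS_RDNA3_0/GGML_CUDA_CC_IS_RDNA3_5 macros, and
ggml_cuda_should_use_mmq; the real function also checks amd_wmma_available(cc) and has further
per-architecture branches not shown here.

#include <cstdint>

// Hypothetical stand-in for the subset of GGML_TYPE_* values the hunk touches.
enum class quant_type { q2_k, q6_k, iq2_xs, iq2_s, other };

// Sketch of the RDNA3/RDNA3.5 decision: should the quantized MMQ kernel be used
// instead of dequantization + hipBLAS, given the quant type and batch size ne11?
static bool prefer_mmq_rdna3(quant_type type, bool is_rdna3_5, int64_t ne11, int64_t n_experts) {
    // MoE models with many experts produce many small matrix multiplications,
    // where the synchronization overhead of the hipBLAS path dominates.
    if (n_experts >= 64) {
        return true;
    }
    // For these quant types MMQ has lower peak throughput than hipBLAS,
    // so it only wins up to a batch-size cutoff that depends on the architecture.
    switch (type) {
        case quant_type::q2_k:
            return ne11 <= 128;
        case quant_type::q6_k:
            return ne11 <= (is_rdna3_5 ? 256 : 128); // larger cutoff on RDNA3.5
        case quant_type::iq2_xs:
        case quant_type::iq2_s:
            return is_rdna3_5 || ne11 <= 128;        // always MMQ on RDNA3.5
        default:
            return true;                             // all other types: prefer MMQ
    }
}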