From b2d628dc51f76978b45d72bc15c27f19f991c98a Mon Sep 17 00:00:00 2001
From: jiachengjason
Date: Tue, 13 Jan 2026 11:09:50 -0500
Subject: [PATCH 1/4] tune mmq/rocblas switching for RDNA4

---
 ggml/src/ggml-cuda/mmq.cu | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
index 9a69f41d15..832a3feb85 100644
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -356,8 +356,22 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
         }
     }
 
-    // For RDNA4 MMQ is consistently faster than dequantization + hipBLAS:
-    // https://github.com/ggml-org/llama.cpp/pull/18537#issuecomment-3706422301
+    if (GGML_CUDA_CC_IS_RDNA4(cc)){
+
+        if (n_experts > 64 || ne11 <= 128) {
+            return true;
+        }
+
+        if (type == GGML_TYPE_Q4_0 || type == GGML_TYPE_Q4_1 || type == GGML_TYPE_Q5_0 || type == GGML_TYPE_Q5_1) {
+            return true;
+        }
+
+        if (ne11 <= 256 && (type == GGML_TYPE_Q4_K || type == GGML_TYPE_Q5_K)) {
+            return true;
+        }
+
+        return false;
+    }
 
     return true;
 }

From 57e1eaf71681145c3fa239b9e907353cc8b1ce03 Mon Sep 17 00:00:00 2001
From: jiachengjason
Date: Mon, 19 Jan 2026 12:31:42 -0500
Subject: [PATCH 2/4] additional tuning for qwen models

---
 ggml/src/ggml-cuda/ggml-cuda.cu | 27 +++++++++++++++++++++++++--
 ggml/src/ggml-cuda/mmq.cu       |  9 +++++++--
 2 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index c3ee2ea066..fb034e3219 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2141,6 +2141,27 @@ static bool ggml_cuda_should_fuse_mul_mat_vec_f(const ggml_tensor * tensor) {
     return use_mul_mat_vec_f;
 }
 
+static bool ggml_cuda_should_use_mmvq(ggml_type type, int cc, int64_t ncols_dst) {
+    if (ncols_dst > MMVQ_MAX_BATCH_SIZE) {
+        return false;
+    }
+
+    if (GGML_CUDA_CC_IS_RDNA4(cc)) {
+        switch (type) {
+            case GGML_TYPE_IQ2_S:
+            case GGML_TYPE_IQ2_XS:
+            case GGML_TYPE_IQ2_XXS:
+            case GGML_TYPE_IQ3_XXS:
+            case GGML_TYPE_IQ3_S:
+                return ncols_dst <= 4;
+            default:
+                break;
+        }
+    }
+
+    return true;
+}
+
 static bool ggml_cuda_should_fuse_mul_mat_vec_q(const ggml_tensor * tensor) {
     ggml_tensor * src0 = tensor->src[0];
     ggml_tensor * src1 = tensor->src[1];
@@ -2150,11 +2171,11 @@ static bool ggml_cuda_should_fuse_mul_mat_vec_q(const ggml_tensor * tensor) {
         ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) && src0->view_src;
 
+    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
     bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear && src1->type == GGML_TYPE_F32 &&
-        dst->type == GGML_TYPE_F32 && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
+        dst->type == GGML_TYPE_F32 && ggml_cuda_should_use_mmvq(src0->type, cc, src1->ne[1]);
 
     // fusion is not universally faster on Pascal
-    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
     if (cc <= GGML_CUDA_CC_PASCAL) {
         return false;
     }
@@ -2211,6 +2232,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
             const int cc = ggml_cuda_info().devices[id].cc;
             const int warp_size = ggml_cuda_info().devices[id].warp_size;
+            use_mul_mat_vec_q = use_mul_mat_vec_q && ggml_cuda_should_use_mmvq(src0->type, cc, src1->ne[1]);
             use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1], /*n_experts=*/0);
             use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
             use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
@@ -2219,6 +2241,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     } else {
         const int cc = ggml_cuda_info().devices[ctx.device].cc;
         const int warp_size = ggml_cuda_info().devices[ctx.device].warp_size;
+        use_mul_mat_vec_q = use_mul_mat_vec_q && ggml_cuda_should_use_mmvq(src0->type, cc, src1->ne[1]);
         use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1], /*n_experts=*/0);
         use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
         use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
index 832a3feb85..85b603563a 100644
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -357,8 +357,12 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
     }
 
     if (GGML_CUDA_CC_IS_RDNA4(cc)){
+        if (type == GGML_TYPE_IQ2_S || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ2_XXS) {
+            return ne11 <= 128;
+        }
+        if (type == GGML_TYPE_MXFP4) return ne11 <= 256;
 
-        if (n_experts > 64 || ne11 <= 128) {
+        if (n_experts >= 64) {
             return true;
         }
@@ -366,7 +370,8 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
             return true;
         }
 
-        if (ne11 <= 256 && (type == GGML_TYPE_Q4_K || type == GGML_TYPE_Q5_K)) {
+        if (ne11 <= 256 && (type == GGML_TYPE_Q4_K || type == GGML_TYPE_Q5_K ||
+            type == GGML_TYPE_IQ3_XXS || type == GGML_TYPE_IQ3_S)) {
             return true;
         }

From 4d5b79970c39a8a24709964ff4fa1d2475ef89e7 Mon Sep 17 00:00:00 2001
From: jiachengjason
Date: Fri, 23 Jan 2026 17:15:01 -0500
Subject: [PATCH 3/4] overall further tuning for all models

---
 ggml/src/ggml-cuda/ggml-cuda.cu |  1 -
 ggml/src/ggml-cuda/mmq.cu       | 36 +++++++++++++++++++++++----------
 2 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index fb034e3219..4b7d9b68b1 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2149,7 +2149,6 @@ static bool ggml_cuda_should_use_mmvq(ggml_type type, int cc, int64_t ncols_dst)
     if (GGML_CUDA_CC_IS_RDNA4(cc)) {
         switch (type) {
             case GGML_TYPE_IQ2_S:
-            case GGML_TYPE_IQ2_XS:
             case GGML_TYPE_IQ2_XXS:
             case GGML_TYPE_IQ3_XXS:
             case GGML_TYPE_IQ3_S:
diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
index 85b603563a..27d41695e4 100644
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -357,22 +357,36 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
     }
 
     if (GGML_CUDA_CC_IS_RDNA4(cc)){
-        if (type == GGML_TYPE_IQ2_S || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ2_XXS) {
-            return ne11 <= 128;
-        }
-        if (type == GGML_TYPE_MXFP4) return ne11 <= 256;
-
         if (n_experts >= 64) {
             return true;
         }
+        switch (type) {
+            case GGML_TYPE_IQ2_S:
+            case GGML_TYPE_Q6_K:
+                return ne11 <= 128;
+            case GGML_TYPE_Q4_0:
+            case GGML_TYPE_Q4_1:
+            case GGML_TYPE_Q5_0:
+            case GGML_TYPE_Q5_1:
+                return true;
+            case GGML_TYPE_Q5_K:
+            case GGML_TYPE_IQ3_XXS:
+            case GGML_TYPE_IQ3_S:
+            case GGML_TYPE_IQ2_XS:
+            case GGML_TYPE_IQ2_XXS:
+            case GGML_TYPE_Q2_K:
+            case GGML_TYPE_Q3_K:
+            case GGML_TYPE_IQ1_S:
+            case GGML_TYPE_Q4_K:
+                return ne11 <= 256;
+            case GGML_TYPE_Q8_0:
+            case GGML_TYPE_IQ4_NL:
+            case GGML_TYPE_IQ4_XS:
+                return ne11 <= 512;
-        if (type == GGML_TYPE_Q4_0 || type == GGML_TYPE_Q4_1 || type == GGML_TYPE_Q5_0 || type == GGML_TYPE_Q5_1) {
-            return true;
-        }
+            default:
+                return false;
-        if (ne11 <= 256 && (type == GGML_TYPE_Q4_K || type == GGML_TYPE_Q5_K ||
-            type == GGML_TYPE_IQ3_XXS || type == GGML_TYPE_IQ3_S)) {
-            return true;
-        }
         }
 
         return false;

From 08d444578d9bae7e7f3fbe64471ebcb0013610f8 Mon Sep 17 00:00:00 2001
From: jiachengjason
Date: Mon, 26 Jan 2026 17:51:47 -0500
Subject: [PATCH 4/4] fine tuned gpt-oss configs

---
 ggml/src/ggml-cuda/mmq.cu | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
index 27d41695e4..fa39790504 100644
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -357,9 +357,9 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
     }
 
     if (GGML_CUDA_CC_IS_RDNA4(cc)){
-        if (n_experts >= 64) {
-            return true;
-        }
+        // if (n_experts >= 64) {
+        //     return true;
+        // }
         switch (type) {
             case GGML_TYPE_IQ2_S:
             case GGML_TYPE_Q6_K:
@@ -368,6 +368,7 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
             case GGML_TYPE_Q4_1:
             case GGML_TYPE_Q5_0:
             case GGML_TYPE_Q5_1:
+            case GGML_TYPE_MXFP4:
                 return true;
             case GGML_TYPE_Q5_K:
            case GGML_TYPE_IQ3_XXS:
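
Note: for reference, the RDNA4 decision table that this series converges on in
ggml_cuda_should_use_mmq can be read as one standalone helper. The sketch below
is illustrative only and is not code from the patches: the helper name
rdna4_prefers_mmq is made up, the n_experts early-out is left out because
patch 4 comments it out, and the placement of GGML_TYPE_MXFP4 in the
"always MMQ" group is an assumption based on the truncated final hunk.

    // Illustrative sketch only: mirrors the RDNA4 switch in mmq.cu after this
    // series. The helper name and the MXFP4 case are assumptions, not upstream code.
    #include <cstdint>

    #include "ggml.h" // GGML_TYPE_* enum

    static bool rdna4_prefers_mmq(enum ggml_type type, int64_t ne11) {
        switch (type) {
            case GGML_TYPE_Q4_0:
            case GGML_TYPE_Q4_1:
            case GGML_TYPE_Q5_0:
            case GGML_TYPE_Q5_1:
            case GGML_TYPE_MXFP4:   // assumed from the truncated patch 4 hunk
                return true;        // MMQ at every batch size
            case GGML_TYPE_IQ2_S:
            case GGML_TYPE_Q6_K:
                return ne11 <= 128; // MMQ only for small batches
            case GGML_TYPE_Q2_K:
            case GGML_TYPE_Q3_K:
            case GGML_TYPE_Q4_K:
            case GGML_TYPE_Q5_K:
            case GGML_TYPE_IQ1_S:
            case GGML_TYPE_IQ2_XS:
            case GGML_TYPE_IQ2_XXS:
            case GGML_TYPE_IQ3_S:
            case GGML_TYPE_IQ3_XXS:
                return ne11 <= 256;
            case GGML_TYPE_Q8_0:
            case GGML_TYPE_IQ4_NL:
            case GGML_TYPE_IQ4_XS:
                return ne11 <= 512;
            default:
                return false;       // dequantize + hipBLAS instead
        }
    }

Read this way, a Q4_K mul_mat with ne11 = 512 is now routed to dequantization
plus hipBLAS rather than MMQ, while Q8_0 stays on MMQ up to ne11 = 512.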