From b2d628dc51f76978b45d72bc15c27f19f991c98a Mon Sep 17 00:00:00 2001
From: jiachengjason
Date: Tue, 13 Jan 2026 11:09:50 -0500
Subject: [PATCH 1/4] tune mmq/rocblas switching for RDNA4

---
 ggml/src/ggml-cuda/mmq.cu | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
index 9a69f41d15..832a3feb85 100644
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -356,8 +356,22 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
         }
     }
 
-    // For RDNA4 MMQ is consistently faster than dequantization + hipBLAS:
-    // https://github.com/ggml-org/llama.cpp/pull/18537#issuecomment-3706422301
+    if (GGML_CUDA_CC_IS_RDNA4(cc)){
+
+        if (n_experts > 64 || ne11 <= 128) {
+            return true;
+        }
+
+        if (type == GGML_TYPE_Q4_0 || type == GGML_TYPE_Q4_1 || type == GGML_TYPE_Q5_0 || type == GGML_TYPE_Q5_1) {
+            return true;
+        }
+
+        if (ne11 <= 256 && (type == GGML_TYPE_Q4_K || type == GGML_TYPE_Q5_K)) {
+            return true;
+        }
+
+        return false;
+    }
 
     return true;
 }

From 57e1eaf71681145c3fa239b9e907353cc8b1ce03 Mon Sep 17 00:00:00 2001
From: jiachengjason
Date: Mon, 19 Jan 2026 12:31:42 -0500
Subject: [PATCH 2/4] additional tuning for qwen models

---
 ggml/src/ggml-cuda/ggml-cuda.cu | 27 +++++++++++++++++++++++++--
 ggml/src/ggml-cuda/mmq.cu       |  9 +++++++--
 2 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index c3ee2ea066..fb034e3219 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2141,6 +2141,27 @@ static bool ggml_cuda_should_fuse_mul_mat_vec_f(const ggml_tensor * tensor) {
     return use_mul_mat_vec_f;
 }
 
+static bool ggml_cuda_should_use_mmvq(ggml_type type, int cc, int64_t ncols_dst) {
+    if (ncols_dst > MMVQ_MAX_BATCH_SIZE) {
+        return false;
+    }
+
+    if (GGML_CUDA_CC_IS_RDNA4(cc)) {
+        switch (type) {
+            case GGML_TYPE_IQ2_S:
+            case GGML_TYPE_IQ2_XS:
+            case GGML_TYPE_IQ2_XXS:
+            case GGML_TYPE_IQ3_XXS:
+            case GGML_TYPE_IQ3_S:
+                return ncols_dst <= 4;
+            default:
+                break;
+        }
+    }
+
+    return true;
+}
+
 static bool ggml_cuda_should_fuse_mul_mat_vec_q(const ggml_tensor * tensor) {
     ggml_tensor * src0 = tensor->src[0];
     ggml_tensor * src1 = tensor->src[1];
@@ -2150,11 +2171,11 @@ static bool ggml_cuda_should_fuse_mul_mat_vec_q(const ggml_tensor * tensor) {
         ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) && src0->view_src;
 
+    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
     bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear && src1->type == GGML_TYPE_F32 &&
-        dst->type == GGML_TYPE_F32 && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
+        dst->type == GGML_TYPE_F32 && ggml_cuda_should_use_mmvq(src0->type, cc, src1->ne[1]);
 
     // fusion is not universally faster on Pascal
-    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
     if (cc <= GGML_CUDA_CC_PASCAL) {
         return false;
     }
@@ -2211,6 +2232,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
             const int cc = ggml_cuda_info().devices[id].cc;
             const int warp_size = ggml_cuda_info().devices[id].warp_size;
+            use_mul_mat_vec_q = use_mul_mat_vec_q && ggml_cuda_should_use_mmvq(src0->type, cc, src1->ne[1]);
             use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1], /*n_experts=*/0);
             use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
             use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
@@ -2219,6 +2241,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     } else {
         const int cc = ggml_cuda_info().devices[ctx.device].cc;
         const int warp_size = ggml_cuda_info().devices[ctx.device].warp_size;
+        use_mul_mat_vec_q = use_mul_mat_vec_q && ggml_cuda_should_use_mmvq(src0->type, cc, src1->ne[1]);
         use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1], /*n_experts=*/0);
         use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
         use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
index 832a3feb85..85b603563a 100644
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -357,8 +357,12 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
     }
 
     if (GGML_CUDA_CC_IS_RDNA4(cc)){
+        if (type == GGML_TYPE_IQ2_S || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ2_XXS) {
+            return ne11 <= 128;
+        }
+        if (type == GGML_TYPE_MXFP4) return ne11 <= 256;
 
-        if (n_experts > 64 || ne11 <= 128) {
+        if (n_experts >= 64) {
             return true;
         }
@@ -366,7 +370,8 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
             return true;
         }
 
-        if (ne11 <= 256 && (type == GGML_TYPE_Q4_K || type == GGML_TYPE_Q5_K)) {
+        if (ne11 <= 256 && (type == GGML_TYPE_Q4_K || type == GGML_TYPE_Q5_K ||
+            type == GGML_TYPE_IQ3_XXS || type == GGML_TYPE_IQ3_S)) {
             return true;
         }

From 4d5b79970c39a8a24709964ff4fa1d2475ef89e7 Mon Sep 17 00:00:00 2001
From: jiachengjason
Date: Fri, 23 Jan 2026 17:15:01 -0500
Subject: [PATCH 3/4] overall further tuning for all models

---
 ggml/src/ggml-cuda/ggml-cuda.cu |  1 -
 ggml/src/ggml-cuda/mmq.cu       | 36 +++++++++++++++++++++++----------
 2 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index fb034e3219..4b7d9b68b1 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2149,7 +2149,6 @@ static bool ggml_cuda_should_use_mmvq(ggml_type type, int cc, int64_t ncols_dst)
     if (GGML_CUDA_CC_IS_RDNA4(cc)) {
         switch (type) {
             case GGML_TYPE_IQ2_S:
-            case GGML_TYPE_IQ2_XS:
             case GGML_TYPE_IQ2_XXS:
             case GGML_TYPE_IQ3_XXS:
             case GGML_TYPE_IQ3_S:
diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
index 85b603563a..27d41695e4 100644
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -357,22 +357,36 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
     }
 
     if (GGML_CUDA_CC_IS_RDNA4(cc)){
-        if (type == GGML_TYPE_IQ2_S || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ2_XXS) {
-            return ne11 <= 128;
-        }
-        if (type == GGML_TYPE_MXFP4) return ne11 <= 256;
-
         if (n_experts >= 64) {
             return true;
         }
+        switch (type) {
+            case GGML_TYPE_IQ2_S:
+            case GGML_TYPE_Q6_K:
+                return ne11 <= 128;
+            case GGML_TYPE_Q4_0:
+            case GGML_TYPE_Q4_1:
+            case GGML_TYPE_Q5_0:
+            case GGML_TYPE_Q5_1:
+                return true;
+            case GGML_TYPE_Q5_K:
+            case GGML_TYPE_IQ3_XXS:
+            case GGML_TYPE_IQ3_S:
+            case GGML_TYPE_IQ2_XS:
+            case GGML_TYPE_IQ2_XXS:
+            case GGML_TYPE_Q2_K:
+            case GGML_TYPE_Q3_K:
+            case GGML_TYPE_IQ1_S:
+            case GGML_TYPE_Q4_K:
+                return ne11 <= 256;
+            case GGML_TYPE_Q8_0:
+            case GGML_TYPE_IQ4_NL:
+            case GGML_TYPE_IQ4_XS:
+                return ne11 <= 512;
-        if (type == GGML_TYPE_Q4_0 || type == GGML_TYPE_Q4_1 || type == GGML_TYPE_Q5_0 || type == GGML_TYPE_Q5_1) {
-            return true;
-        }
+            default:
+                return false;
-        if (ne11 <= 256 && (type == GGML_TYPE_Q4_K || type == GGML_TYPE_Q5_K ||
-            type == GGML_TYPE_IQ3_XXS || type == GGML_TYPE_IQ3_S)) {
-            return true;
-        }
         }
 
         return false;

From 08d444578d9bae7e7f3fbe64471ebcb0013610f8 Mon Sep 17 00:00:00 2001
From: jiachengjason
Date: Mon, 26 Jan 2026 17:51:47 -0500
Subject: [PATCH 4/4] fine tuned gpt-oss configs

---
 ggml/src/ggml-cuda/mmq.cu | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
index 27d41695e4..fa39790504 100644
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -357,9 +357,9 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
     }
 
     if (GGML_CUDA_CC_IS_RDNA4(cc)){
-        if (n_experts >= 64) {
-            return true;
-        }
+        // if (n_experts >= 64) {
+        //     return true;
+        // }
         switch (type) {
             case GGML_TYPE_IQ2_S:
             case GGML_TYPE_Q6_K:
@@ -368,6 +368,7 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
             case GGML_TYPE_Q4_1:
             case GGML_TYPE_Q5_0:
             case GGML_TYPE_Q5_1:
+            case GGML_TYPE_MXFP4:
                 return true;
             case GGML_TYPE_Q5_K:
            case GGML_TYPE_IQ3_XXS:
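
Note: for reference, the RDNA4 decision table that this series converges on in
ggml_cuda_should_use_mmq can be read as one standalone helper. The sketch below
is illustrative only and is not code from the patches: the helper name
rdna4_prefers_mmq is made up, the n_experts early-out is left out because
patch 4 comments it out, and the placement of GGML_TYPE_MXFP4 in the
"always MMQ" group is an assumption based on the truncated final hunk.

    // Illustrative sketch only: mirrors the RDNA4 switch in mmq.cu after this
    // series. The helper name and the MXFP4 case are assumptions, not upstream code.
    #include <cstdint>

    #include "ggml.h" // GGML_TYPE_* enum

    static bool rdna4_prefers_mmq(enum ggml_type type, int64_t ne11) {
        switch (type) {
            case GGML_TYPE_Q4_0:
            case GGML_TYPE_Q4_1:
            case GGML_TYPE_Q5_0:
            case GGML_TYPE_Q5_1:
            case GGML_TYPE_MXFP4:   // assumed from the truncated patch 4 hunk
                return true;        // MMQ at every batch size
            case GGML_TYPE_IQ2_S:
            case GGML_TYPE_Q6_K:
                return ne11 <= 128; // MMQ only for small batches
            case GGML_TYPE_Q2_K:
            case GGML_TYPE_Q3_K:
            case GGML_TYPE_Q4_K:
            case GGML_TYPE_Q5_K:
            case GGML_TYPE_IQ1_S:
            case GGML_TYPE_IQ2_XS:
            case GGML_TYPE_IQ2_XXS:
            case GGML_TYPE_IQ3_S:
            case GGML_TYPE_IQ3_XXS:
                return ne11 <= 256;
            case GGML_TYPE_Q8_0:
            case GGML_TYPE_IQ4_NL:
            case GGML_TYPE_IQ4_XS:
                return ne11 <= 512;
            default:
                return false;       // dequantize + hipBLAS instead
        }
    }

Read this way, a Q4_K mul_mat with ne11 = 512 is now routed to dequantization
plus hipBLAS rather than MMQ, while Q8_0 stays on MMQ up to ne11 = 512.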