From 030b09faa85b3074eab62cd4dcb84a781a0b396b Mon Sep 17 00:00:00 2001
From: hung <hung@mpb2019i7.local>
Date: Fri, 13 Feb 2026 12:26:49 -0500
Subject: [PATCH] metal: use mul_mv_ext for large n on non-simdgroup_mm GPUs

On GPUs without simdgroup_mm (e.g. AMD discrete), MUL_MAT with large n
(ne11, e.g. pp512) falls through to the per-column mul_mv kernel, which
dispatches ~1.1M threadgroups vs ~280 for the multi-column mul_mv_ext.

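Remove the ne11 <= 8 upper bound for non-simdgroup_mm devices so the
ne11 check no longer excludes mul_mv_ext for large n. The existing
lower bounds (ne11 >= 2, or ne11 >= 4 for Q4_K/Q5_K/Q6_K) and the
behavior on simdgroup_mm devices are unchanged. Default r1ptg to 4 for
ne11 > 8 instead of aborting.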

Benchmarked on AMD Radeon Pro 5300M (Qwen2.5-1.5B-Q4_K_M):
- pp512: 103.18 -> 127.19 t/s (+23.3%)
- tg128: no regression (64.01 t/s)
- test-backend-ops MUL_MAT: 1009/1009 passed
---
 ggml/src/ggml-metal/ggml-metal-ops.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp
index c04e9fc7ff..0a0c302ce1 100644
--- a/ggml/src/ggml-metal/ggml-metal-ops.cpp
+++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp
@@ -1964,14 +1964,14 @@ int ggml_metal_op_mul_mat(ggml_metal_op_t ctx, int idx) {
            op->src[0]->type == GGML_TYPE_Q8_0 ||
            op->src[0]->type == GGML_TYPE_MXFP4 ||
            op->src[0]->type == GGML_TYPE_IQ4_NL ||
-           false) && (ne11 >= 2 && ne11 <= 8)
+           false) && (ne11 >= 2 && (ne11 <= 8 || !props_dev->has_simdgroup_mm))
          ) ||
          (
           (
            op->src[0]->type == GGML_TYPE_Q4_K ||
            op->src[0]->type == GGML_TYPE_Q5_K ||
            op->src[0]->type == GGML_TYPE_Q6_K ||
-           false) && (ne11 >= 4 && ne11 <= 8)
+           false) && (ne11 >= 4 && (ne11 <= 8 || !props_dev->has_simdgroup_mm))
          )
         )
        ) {
@@ -2013,7 +2013,7 @@ int ggml_metal_op_mul_mat(ggml_metal_op_t ctx, int idx) {
             case 5:
                 r1ptg = 5; break;
             default:
-                GGML_ABORT("unsupported ne11");
+                r1ptg = 4; break;
         };
 
         auto pipeline = ggml_metal_library_get_pipeline_mul_mv_ext(lib, op->src[0]->type, op->src[1]->type, nsg, nxpsg, r1ptg);