From 030b09faa85b3074eab62cd4dcb84a781a0b396b Mon Sep 17 00:00:00 2001
From: hung
Date: Fri, 13 Feb 2026 12:26:49 -0500
Subject: [PATCH] metal: use mul_mv_ext for large n on non-simdgroup_mm GPUs

On GPUs without simdgroup_mm (e.g. AMD discrete), MUL_MAT with large n
(like pp512) falls through to the per-column mul_mv kernel, which
dispatches ~1.1M threadgroups vs ~280 for the multi-column mul_mv_ext.

Remove the ne11 <= 8 upper bound for non-simdgroup_mm devices so
mul_mv_ext handles every ne11 at or above the existing lower bound
(2, or 4 for the Q4_K/Q5_K/Q6_K group). Default r1ptg to 4 for
ne11 > 8 instead of aborting.

Benchmarked on AMD Radeon Pro 5300M (Qwen2.5-1.5B-Q4_K_M):
- pp512: 103.18 -> 127.19 t/s (+23.3%)
- tg128: no regression (64.01 t/s)
- test-backend-ops MUL_MAT: 1009/1009 passed
---
 ggml/src/ggml-metal/ggml-metal-ops.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp
index c04e9fc7ff..0a0c302ce1 100644
--- a/ggml/src/ggml-metal/ggml-metal-ops.cpp
+++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp
@@ -1964,14 +1964,14 @@ int ggml_metal_op_mul_mat(ggml_metal_op_t ctx, int idx) {
            op->src[0]->type == GGML_TYPE_Q8_0 ||
            op->src[0]->type == GGML_TYPE_MXFP4 ||
            op->src[0]->type == GGML_TYPE_IQ4_NL ||
-           false) && (ne11 >= 2 && ne11 <= 8)
+           false) && (ne11 >= 2 && (ne11 <= 8 || !props_dev->has_simdgroup_mm))
          ) ||
          (
           (
            op->src[0]->type == GGML_TYPE_Q4_K ||
            op->src[0]->type == GGML_TYPE_Q5_K ||
            op->src[0]->type == GGML_TYPE_Q6_K ||
-           false) && (ne11 >= 4 && ne11 <= 8)
+           false) && (ne11 >= 4 && (ne11 <= 8 || !props_dev->has_simdgroup_mm))
          )
         )
        ) {
@@ -2013,7 +2013,7 @@ int ggml_metal_op_mul_mat(ggml_metal_op_t ctx, int idx) {
             case 5:
                 r1ptg = 5; break;
             default:
-                GGML_ABORT("unsupported ne11");
+                r1ptg = 4; break;
         };
 
         auto pipeline = ggml_metal_library_get_pipeline_mul_mv_ext(lib, op->src[0]->type, op->src[1]->type, nsg, nxpsg, r1ptg);