From e7f2f95c9a6d103d13bf25a1a2227ba8e51052b2 Mon Sep 17 00:00:00 2001
From: Reese Levine <reeselevine1@gmail.com>
Date: Wed, 18 Feb 2026 16:06:29 -0700
Subject: [PATCH] ggml webgpu: Fix bug in dispatching large matrix-vector
 multiplication (#19535)

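* Fix bug in dispatching large matrix-vector multiplication

  wg_x was computed as total_wg % maxComputeWorkgroupsPerDimension, which
  yields 0 when total_wg is an exact multiple of the limit and too few
  workgroups whenever total_wg exceeds it. Clamp wg_x with std::min instead,
  so that wg_x * wg_y always covers at least total_wg workgroups. This can
  over-provision workgroups for large sizes (see the TODO added in the code).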
---
 ggml/src/ggml-webgpu/ggml-webgpu.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
index 17bb2f4712..b5fee48056 100644
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@@ -1121,7 +1121,8 @@ static webgpu_command ggml_webgpu_mul_mat(webgpu_context & ctx,
         uint32_t batches       = dst->ne[2] * dst->ne[3];
         uint32_t output_groups = CEIL_DIV(dst->ne[0], decisions->outputs_per_wg);
         uint32_t total_wg      = output_groups * batches;
-        wg_x                   = total_wg % ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension;
+        // TODO: split large sizes into multiple batches to avoid way over-provisioning workgroups
+        wg_x = std::min(total_wg, ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension);
         wg_y = CEIL_DIV(total_wg, ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension);
     } else if (use_fast) {
         auto decisions = static_cast<ggml_webgpu_mul_mat_shader_decisions *>(pipeline.context.get());