From bf13638d56a747f3c7e36fe766cbbb0a0bf8f6a7 Mon Sep 17 00:00:00 2001
From: Progeny Alpha <ProgenyAlpha@users.noreply.github.com>
Date: Fri, 13 Mar 2026 17:00:15 -0400
Subject: [PATCH] vulkan: enable coopmat chunked GDN output path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Lower GDN_CHUNK_THRESHOLD from UINT32_MAX (its previous "disabled"
value) to 2, and prefer the coopmat output pipeline (cm1) when it is
available, falling back to the scalar variant otherwise.

PP-512: ~206 → ~210 t/s on Radeon 890M (RDNA3.5).
---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 383840db7f..d1c470b1c1 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -10406,7 +10406,7 @@ static void ggml_vk_rwkv_wkv7(ggml_backend_vk_context * ctx, vk_context& subctx,
 }
 
 static constexpr uint32_t GDN_CHUNK_SIZE = 64;
-static constexpr uint32_t GDN_CHUNK_THRESHOLD = UINT32_MAX;  // Disabled
+static constexpr uint32_t GDN_CHUNK_THRESHOLD = 2;
 
 static void ggml_vk_gated_delta_net(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) {
     const ggml_tensor * src_q     = dst->src[0];
@@ -10472,7 +10472,9 @@ static void ggml_vk_gated_delta_net(ggml_backend_vk_context * ctx, vk_context& s
 
     vk_pipeline pl_intra  = ctx->device->pipeline_gated_delta_net_chunk_intra;
     vk_pipeline pl_inter  = ctx->device->pipeline_gated_delta_net_chunk_inter;
-    vk_pipeline pl_output = ctx->device->pipeline_gated_delta_net_chunk_output;
+    vk_pipeline pl_output = ctx->device->pipeline_gated_delta_net_chunk_output_cm
+                          ? ctx->device->pipeline_gated_delta_net_chunk_output_cm
+                          : ctx->device->pipeline_gated_delta_net_chunk_output;
 
     ggml_pipeline_request_descriptor_sets(ctx, pl_intra, 1);
     ggml_pipeline_request_descriptor_sets(ctx, pl_inter, 1);