vulkan: enable coopmat chunked GDN output path

Lower GDN_CHUNK_THRESHOLD from UINT32_MAX to 2 and prefer the coopmat
output pipeline (cm1) when available, falling back to the scalar variant.

PP-512: ~206 → ~210 t/s on Radeon 890M (RDNA3.5).
This commit is contained in:
Progeny Alpha 2026-03-13 17:00:15 -04:00
parent 313ef74afe
commit bf13638d56
1 changed file with 4 additions and 2 deletions

View File

@@ -10406,7 +10406,7 @@ static void ggml_vk_rwkv_wkv7(ggml_backend_vk_context * ctx, vk_context& subctx,
}
static constexpr uint32_t GDN_CHUNK_SIZE = 64;
-static constexpr uint32_t GDN_CHUNK_THRESHOLD = UINT32_MAX; // Disabled
+static constexpr uint32_t GDN_CHUNK_THRESHOLD = 2;
static void ggml_vk_gated_delta_net(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) {
const ggml_tensor * src_q = dst->src[0];
@@ -10472,7 +10472,9 @@ static void ggml_vk_gated_delta_net(ggml_backend_vk_context * ctx, vk_context& s
vk_pipeline pl_intra = ctx->device->pipeline_gated_delta_net_chunk_intra;
vk_pipeline pl_inter = ctx->device->pipeline_gated_delta_net_chunk_inter;
-vk_pipeline pl_output = ctx->device->pipeline_gated_delta_net_chunk_output;
+vk_pipeline pl_output = ctx->device->pipeline_gated_delta_net_chunk_output_cm
+? ctx->device->pipeline_gated_delta_net_chunk_output_cm
+: ctx->device->pipeline_gated_delta_net_chunk_output;
ggml_pipeline_request_descriptor_sets(ctx, pl_intra, 1);
ggml_pipeline_request_descriptor_sets(ctx, pl_inter, 1);