From d2fabedf096f9b2f76b26515f38f543fc34c8aa1 Mon Sep 17 00:00:00 2001 From: Progeny Alpha Date: Fri, 13 Mar 2026 23:34:59 -0400 Subject: [PATCH] vulkan: fix chunked inter kernel state layout for PR #20443 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #20443 removed redundant state transposes from the graph and updated the autoregressive shader to use col*S_V+i (coalesced) instead of i*S_V+col (strided). The chunked inter kernel was not updated, causing uncoalesced state reads and a ~8% PP regression. Fix the state_in load and the final_out write to match the new layout. The h_snapshots (h_out/h_in) are internal scratch and keep their existing layout, since the inter and output kernels agree on it. PP-512: 202 → 218 t/s. 16/16 tests pass. --- .../vulkan-shaders/gated_delta_net_chunk_inter.comp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net_chunk_inter.comp b/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net_chunk_inter.comp index 11cd0e18a8..0aa54e718f 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net_chunk_inter.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net_chunk_inter.comp @@ -60,7 +60,7 @@ void main() { float state[S_V]; [[unroll]] for (uint i = 0; i < S_V; i++) { - state[i] = state_in[state_base + i * S_V + col]; + state[i] = state_in[state_base + col * S_V + i]; } for (uint c = 0; c < n_chunks; c++) { @@ -121,6 +121,6 @@ void main() { // Write final state to dst at s_off [[unroll]] for (uint i = 0; i < S_V; i++) { - final_out[s_off + state_base + i * S_V + col] = state[i]; + final_out[s_off + state_base + col * S_V + i] = state[i]; } }