vulkan: fix chunked inter kernel state layout for PR #20443
PR #20443 removed redundant state transposes from the graph and updated the autoregressive shader to index the state as col*S_V+i (coalesced) instead of i*S_V+col (strided). The chunked inter kernel was not updated to match, causing uncoalesced state reads and a ~8% prompt-processing (PP) regression. Fix the state_in load and the final_out write to use the new layout. The h_snapshots buffers (h_out/h_in) are internal scratch and keep their existing layout, since the inter and output kernels already agree on it. PP-512: 202 → 218 t/s. 16/16 tests pass.
This commit is contained in:
parent
efbde13283
commit
d2fabedf09
|
|
@ -60,7 +60,7 @@ void main() {
|
|||
|
||||
float state[S_V];
|
||||
[[unroll]] for (uint i = 0; i < S_V; i++) {
|
||||
- state[i] = state_in[state_base + i * S_V + col];
|
||||
+ state[i] = state_in[state_base + col * S_V + i];
|
||||
}
|
||||
|
||||
for (uint c = 0; c < n_chunks; c++) {
|
||||
|
|
@ -121,6 +121,6 @@ void main() {
|
|||
|
||||
// Write final state to dst at s_off
|
||||
[[unroll]] for (uint i = 0; i < S_V; i++) {
|
||||
- final_out[s_off + state_base + i * S_V + col] = state[i];
|
||||
+ final_out[s_off + state_base + col * S_V + i] = state[i];
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue