fix conv state update for llama-server parallel serving

Yee Man Chan 2026-02-12 09:07:45 +08:00
parent 19cf704683
commit 63a15e3cbc
1 changed file with 5 additions and 2 deletions


@@ -42,8 +42,11 @@ static ggml_tensor * causal_conv1d(ggml_cgraph * gf, ggml_context * ctx0, ggml_t
                 conv_x->nb[1], conv_x->nb[2], n_seq_tokens * conv_x->nb[0]);
             ggml_build_forward_expand(gf,
                 ggml_cpy(ctx0, last_conv_x,
-                    ggml_view_1d(ctx0, conv_states_all, conv_state_size * n_seqs,
-                        (kv_head * n_embd_r_total + qkv * conv_state_size) * ggml_element_size(conv_states_all))));
+                    ggml_view_3d(ctx0, conv_states_all,
+                        d_conv - 1, d_inner, n_seqs,
+                        (d_conv - 1) * ggml_element_size(conv_states_all),   // nb1: contiguous within one channel's conv taps
+                        n_embd_r_total * ggml_element_size(conv_states_all), // nb2: stride between sequences (skip over K,V states)
+                        (kv_head * n_embd_r_total + qkv * conv_state_size) * ggml_element_size(conv_states_all)))); // offset to first seq's Q/K/V state
     // Reshape conv weight: GGUF [d_conv, 1, d_inner, 1] -> ggml_ssm_conv expects [d_conv, d_inner]
     // GGUF stores as [d_conv, 1, d_inner, 1] with memory layout w[conv_step + channel * d_conv]
     // vLLM stores as [d_inner, d_conv] with memory layout w[channel * d_conv + conv_step]
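
Note (not part of the commit): below is a minimal C sketch of the offset arithmetic that the new ggml_view_3d strides encode, to show why the old 1-D view only addressed the first sequence's slot correctly. It assumes conv_state_size = (d_conv - 1) * d_inner and that each KV cell stores its Q, K and V conv states back to back (so n_embd_r_total = 3 * conv_state_size); the helper conv_state_elem, main, and the toy sizes are hypothetical, inferred from the strides in the diff rather than taken from the source.

#include <stddef.h>
#include <stdio.h>

// Element index of conv tap `tap` of channel `ch` for sequence `seq`, as
// addressed by the new 3-D view:
//   offset + seq * nb2 + ch * nb1 + tap   (all in elements, not bytes)
static size_t conv_state_elem(size_t kv_head, size_t qkv, size_t seq,
                              size_t ch, size_t tap,
                              size_t d_conv, size_t conv_state_size,
                              size_t n_embd_r_total) {
    const size_t offset = kv_head * n_embd_r_total + qkv * conv_state_size; // first cell, this Q/K/V state
    const size_t nb2    = n_embd_r_total;  // one whole cell per sequence (skips that cell's other Q/K/V states)
    const size_t nb1    = d_conv - 1;      // one channel's conv taps are contiguous
    return offset + seq * nb2 + ch * nb1 + tap;
}

int main(void) {
    // Toy sizes (hypothetical): d_conv = 4, d_inner = 3, 2 parallel sequences.
    const size_t d_conv = 4, d_inner = 3;
    const size_t conv_state_size = (d_conv - 1) * d_inner;
    const size_t n_embd_r_total  = 3 * conv_state_size;   // Q, K and V conv states per cell (assumed)

    printf("seq 0, ch 0, tap 0 -> element %zu\n",
           conv_state_elem(/*kv_head=*/0, /*qkv=*/0, /*seq=*/0, 0, 0,
                           d_conv, conv_state_size, n_embd_r_total));
    printf("seq 1, ch 0, tap 0 -> element %zu\n",
           conv_state_elem(0, 0, /*seq=*/1, 0, 0,
                           d_conv, conv_state_size, n_embd_r_total));
    return 0;
}

Under these assumptions the old ggml_view_1d copied conv_state_size * n_seqs contiguous elements from that same starting offset, which for n_seqs > 1 appears to spill into the first cell's K/V states rather than reach the next sequence's cell; the nb2 stride of n_embd_r_total elements per sequence is what makes the parallel case land in the right slots.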