model : Kimi Linear fix conv state update (#19531)
* fix conv state update for llama-server parallel serving

---------

Co-authored-by: Piotr Wilkin (ilintar) <piotr.wilkin@syndatis.com>
This commit is contained in:
parent
25224c8021
commit
33a56f90a6
|
|
@@ -41,8 +41,11 @@ static ggml_tensor * causal_conv1d(ggml_cgraph * gf, ggml_context * ctx0, ggml_t
|
|||
conv_x->nb[1], conv_x->nb[2], n_seq_tokens * conv_x->nb[0]);
|
||||
ggml_build_forward_expand(gf,
|
||||
ggml_cpy(ctx0, last_conv_x,
|
||||
ggml_view_1d(ctx0, conv_states_all, conv_state_size * n_seqs,
|
||||
(kv_head * n_embd_r_total + qkv * conv_state_size) * ggml_element_size(conv_states_all))));
|
||||
ggml_view_3d(ctx0, conv_states_all,
|
||||
d_conv - 1, d_inner, n_seqs,
|
||||
(d_conv - 1) * ggml_element_size(conv_states_all), // nb1: contiguous within one channel's conv taps
|
||||
n_embd_r_total * ggml_element_size(conv_states_all), // nb2: stride between sequences (skip over K,V states)
|
||||
(kv_head * n_embd_r_total + qkv * conv_state_size) * ggml_element_size(conv_states_all)))); // offset to first seq's Q/K/V state
|
||||
// Reshape conv weight: GGUF [d_conv, 1, d_inner, 1] -> ggml_ssm_conv expects [d_conv, d_inner]
|
||||
// GGUF stores as [d_conv, 1, d_inner, 1] with memory layout w[conv_step + channel * d_conv]
|
||||
// vLLM stores as [d_inner, d_conv] with memory layout w[channel * d_conv + conv_step]
|
||||
|
|
|
|||
Loading…
Reference in New Issue