fixed some comments

Yee Man Chan 2026-01-06 11:35:25 +08:00
parent cfed14e31b
commit e3542ff8a2
1 changed file with 2 additions and 3 deletions


@@ -263,7 +263,6 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
ggml_tensor * state = build_rs(inp_rs, ssm_states_all, hparams.n_embd_s(), n_seqs);
state = ggml_reshape_4d(ctx0, state, head_dim, head_dim, n_head, n_seqs);
// Choose between build_kda_chunking and build_kda_recurrent based on n_tokens
- // TODO: Currently only build_kda_recurrent is implemented
ggml_tensor * attn_out = n_seq_tokens > CHUNK_SIZE ?
build_kda_chunking(Qcur, Kcur, Vcur, g1, beta, state, causal_mask, identity, il) :
build_kda_recurrent(Qcur, Kcur, Vcur, g1, beta, state, causal_mask, identity, il);
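For readers unfamiliar with KDA (Kimi Delta Attention), the recurrent path above carries a head_dim x head_dim state per head and updates it once per token with a gated delta rule. The plain-C++ sketch below is an illustration only, assuming the usual gated delta-rule formulation with a per-channel decay gate; the transpose and gating conventions in the actual ggml code may differ, and the names (S, q, k, v, g, beta) are borrowed from the surrounding diff rather than from a verified implementation.

#include <cmath>
#include <cstddef>
#include <vector>

using Vec = std::vector<float>;
using Mat = std::vector<Vec>;   // S[i][j]: head_dim x head_dim state

// One assumed delta-rule step for a single head and a single token (sketch).
void kda_recurrent_step(Mat & S, const Vec & q, const Vec & k, const Vec & v,
                        const Vec & g, float beta, Vec & out) {
    const std::size_t d = S.size();
    // 1) per-channel decay of the state (g is a log-decay gate)
    for (std::size_t i = 0; i < d; ++i)
        for (std::size_t j = 0; j < d; ++j)
            S[i][j] *= std::exp(g[i]);
    // 2) delta rule: overwrite what the state currently predicts for key k
    Vec pred(d, 0.0f);                         // pred = k^T S
    for (std::size_t i = 0; i < d; ++i)
        for (std::size_t j = 0; j < d; ++j)
            pred[j] += k[i] * S[i][j];
    for (std::size_t i = 0; i < d; ++i)        // S += beta * k (v - pred)^T
        for (std::size_t j = 0; j < d; ++j)
            S[i][j] += beta * k[i] * (v[j] - pred[j]);
    // 3) read out with the query: out = q^T S
    out.assign(d, 0.0f);
    for (std::size_t i = 0; i < d; ++i)
        for (std::size_t j = 0; j < d; ++j)
            out[j] += q[i] * S[i][j];
}

Running this loop token by token is cheap for short sequences (single-token decode), which is presumably why the ternary above only takes the chunked path when n_seq_tokens exceeds CHUNK_SIZE; the chunked formulation batches the same update over CHUNK_SIZE tokens using matrix products.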
@@ -315,7 +314,6 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
} else if (is_mla) {
// === MLA Layer (Multi-head Latent Attention) without KV Cache ===
// Reference: vLLM mla.py
- // TODO: Implement proper KV caching for MLA (requires custom cache format)
// Step 1: Q projection and reshape
// vLLM Kimi: q = q_proj(hidden_states), then view as [n_tokens, n_head, qk_head_dim]
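As an aside on that "view" step: reinterpreting the flat q_proj output as [n_tokens, n_head, qk_head_dim] is purely an indexing change, no data is moved. A minimal self-contained illustration of the index mapping (the names here are illustrative, not taken from the file):

#include <cstddef>
#include <vector>

// Flat Q projection output stored row-major as [n_tokens][n_head * qk_head_dim];
// the 3-D view [n_tokens][n_head][qk_head_dim] addresses the same buffer.
inline float & q_view(std::vector<float> & q, std::size_t n_head, std::size_t qk_head_dim,
                      std::size_t t, std::size_t h, std::size_t d) {
    return q[(t * n_head + h) * qk_head_dim + d];
}

In ggml the equivalent view is usually obtained with ggml_reshape_3d, keeping in mind that ggml lists the fastest-varying dimension first.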
@@ -454,7 +452,8 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
}
/*
- IMPORTANT: Currently build_kda_chunking is not implemented nor called
+ This is a ggml implementation of the naive_chunk_kda function of
+ https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/kda/naive.py
*/
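Before the ggml function below, a rough outline of what the chunked form does may be useful. This sketch shows only the chunk loop and the state hand-off between chunks, reusing the per-token step from the earlier sketch; the actual naive_chunk_kda (and the ggml port of it) replaces that inner token loop with a handful of matrix products over each CHUNK_SIZE-token chunk, which is the whole point of chunking. Treat it as an assumption-level outline, not the algorithm itself.

#include <algorithm>
#include <cstddef>
#include <vector>

using Vec = std::vector<float>;
using Mat = std::vector<Vec>;

// Per-token update from the earlier sketch (assumed to be available).
void kda_recurrent_step(Mat & S, const Vec & q, const Vec & k, const Vec & v,
                        const Vec & g, float beta, Vec & out);

// Sketch: walk the sequence in fixed-size chunks, carrying the recurrent
// state S across chunk boundaries exactly as the recurrent path would.
void kda_chunked_outline(Mat & S,
                         const std::vector<Vec> & q, const std::vector<Vec> & k,
                         const std::vector<Vec> & v, const std::vector<Vec> & g,
                         const std::vector<float> & beta,
                         std::vector<Vec> & out, std::size_t chunk_size) {
    const std::size_t n_tokens = q.size();
    out.resize(n_tokens);
    for (std::size_t c0 = 0; c0 < n_tokens; c0 += chunk_size) {
        const std::size_t c1 = std::min(n_tokens, c0 + chunk_size);
        // intra-chunk work: token-by-token here, batched matrix products in naive_chunk_kda
        for (std::size_t t = c0; t < c1; ++t)
            kda_recurrent_step(S, q[t], k[t], v[t], g[t], beta[t], out[t]);
        // S now holds the inter-chunk state that seeds the next chunk
    }
}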
ggml_tensor * llm_build_kimi_linear::build_kda_chunking(
ggml_tensor * q,