From 27baad43d54e7fd875774aded764767cb109baa8 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Tue, 2 Dec 2025 08:35:14 +0800 Subject: [PATCH 01/58] kimi linear model implementation --- src/models/kimi-linear.cpp | 429 +++++++++++++++++++++++++++++++++++++ 1 file changed, 429 insertions(+) create mode 100644 src/models/kimi-linear.cpp diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp new file mode 100644 index 0000000000..660cd06f0e --- /dev/null +++ b/src/models/kimi-linear.cpp @@ -0,0 +1,429 @@ +#include "models.h" + +llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params), model(model) { + ggml_tensor * cur; + ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // Note: Kimi MLA does NOT use RoPE (rotary_emb=None in vLLM) + // So we don't need inp_pos + + // Only use recurrent state input for KDA layers + // MLA layers use direct softmax attention without KV cache + auto * inp_rs = build_rs_inp(); + + // Input for MLA layers (no KV cache) + auto * inp_no_cache = build_attn_inp_no_cache(); + + // Output ids for selecting which tokens to output + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + // Kimi dimension constants + const int64_t n_head = hparams.n_head(); + const int64_t head_dim = hparams.kda_head_dim > 0 ? hparams.kda_head_dim : 128; + const int64_t d_conv = hparams.kda_d_conv > 0 ? hparams.kda_d_conv : 4; + const int64_t d_inner = n_head * head_dim; // 32 * 128 = 4096 + const int64_t n_seqs = ubatch.n_seqs; + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + + // Verify batch consistency for recurrent layers + GGML_ASSERT(n_seqs != 0); + GGML_ASSERT(ubatch.equal_seqs()); + GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); + + // MLA params + const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla > 0 ? hparams.n_embd_head_k_mla : 192; + const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla > 0 ? hparams.n_embd_head_v_mla : 128; + const int64_t kv_lora_rank = hparams.n_lora_kv > 0 ? 
hparams.n_lora_kv : 512; + // qk_rope_head_dim = 64 (from Kimi config), NOT hparams.n_rot (which is 72) + // Confirmed from tensor shape: wkv_a_mqa [2304, 576] = [n_embd, kv_lora_rank + qk_rope_head_dim] + const int64_t n_embd_head_qk_rope = 64; // config.qk_rope_head_dim + const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope; // 192 - 64 = 128 + + // Attention scale for KDA (1/sqrt(head_dim)) + const float kq_scale_kda = 1.0f / sqrtf((float)head_dim); + + // Attention scale for MLA + const float kq_scale_mla = 1.0f / sqrtf((float)n_embd_head_k_mla); + + for (int il = 0; il < n_layer; ++il) { + const auto & layer = model.layers[il]; + ggml_tensor * inpSA = inpL; + + // Attention Norm + cur = build_norm(inpL, layer.attn_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // Check layer type by checking which tensors exist + // KDA layers have ssm_a_log tensor, MLA layers have wkv_a_mqa tensor + bool is_kda = (layer.ssm_a_log != nullptr); + bool is_mla = (layer.wkv_a_mqa != nullptr); + + if (is_kda) { + // === KDA Layer (Kimi Delta Attention) with Recurrent State === + // Reference: vLLM kda.py + + const auto * mctx_cur = inp_rs->mctx; + const auto kv_head = mctx_cur->get_head(); + + // Get conv states from r_l tensor (Q, K, V each have separate state) + ggml_tensor * conv_states_all = mctx_cur->get_r_l(il); + const int64_t conv_state_size = (d_conv - 1) * d_inner; + const int64_t n_embd_r_total = 3 * conv_state_size; // Q + K + V + ggml_tensor * conv_state_all = build_rs(inp_rs, conv_states_all, hparams.n_embd_r(), n_seqs); + // conv_state_all is [n_embd_r_total, n_seqs], split into Q, K, V + // Each conv state is [(d_conv-1) * d_inner] per sequence, need to reshape to [d_conv-1, d_inner, n_seqs] + // Memory layout: for each seq, Q state is first conv_state_size elements, then K, then V + // conv_state_all has stride: nb[0] = element_size, nb[1] = n_embd_r_total * element_size + + // View Q conv state: offset 0, size conv_state_size per seq + // conv_state_all is [n_embd_r_total, n_seqs] with memory layout: + // state[i + seq * n_embd_r_total] where i = conv_step + channel * (d_conv-1) + {0, conv_state_size, 2*conv_state_size} for Q/K/V + // We want [d_conv-1, d_inner, n_seqs] view: + // nb1 = (d_conv-1) * element_size (stride between channels) + // nb2 = n_embd_r_total * element_size (stride between seqs) + ggml_tensor * conv_state_q = ggml_view_3d(ctx0, conv_state_all, d_conv - 1, d_inner, n_seqs, + (d_conv - 1) * ggml_element_size(conv_state_all), // nb1: stride between channels + n_embd_r_total * ggml_element_size(conv_state_all), // nb2: stride between seqs + 0); // offset for Q + ggml_tensor * conv_state_k = ggml_view_3d(ctx0, conv_state_all, d_conv - 1, d_inner, n_seqs, + (d_conv - 1) * ggml_element_size(conv_state_all), + n_embd_r_total * ggml_element_size(conv_state_all), + conv_state_size * ggml_element_size(conv_state_all)); // offset for K + ggml_tensor * conv_state_v = ggml_view_3d(ctx0, conv_state_all, d_conv - 1, d_inner, n_seqs, + (d_conv - 1) * ggml_element_size(conv_state_all), + n_embd_r_total * ggml_element_size(conv_state_all), + 2 * conv_state_size * ggml_element_size(conv_state_all)); // offset for V + + // Step 1: Q, K, V projections -> [d_inner, n_tokens] + ggml_tensor * q_proj = ggml_mul_mat(ctx0, layer.wq, cur); + ggml_tensor * k_proj = ggml_mul_mat(ctx0, layer.wk, cur); + ggml_tensor * v_proj = ggml_mul_mat(ctx0, layer.wv, cur); + cb(q_proj, "kda_q_proj", il); + cb(k_proj, "kda_k_proj", il); + cb(v_proj, "kda_v_proj", il); + + // 
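// A minimal indexing sketch (not llama.cpp API, names are illustrative) of the packed
// per-sequence conv-state layout assumed by the three ggml_view_3d calls above: each
// sequence stores its Q, K and V chunks back to back, every chunk being
// (d_conv - 1) x d_inner floats with the conv step varying fastest.
#include <cstdint>

static inline int64_t kda_conv_state_index(
        int64_t conv_step,   // 0 .. d_conv - 2
        int64_t channel,     // 0 .. d_inner - 1
        int64_t component,   // 0 = Q, 1 = K, 2 = V
        int64_t seq,         // 0 .. n_seqs - 1
        int64_t d_conv,
        int64_t d_inner) {
    const int64_t conv_state_size = (d_conv - 1) * d_inner; // per-component chunk
    const int64_t n_embd_r_total  = 3 * conv_state_size;    // Q + K + V per sequence
    return seq       * n_embd_r_total   // nb2 of the views above
         + component * conv_state_size  // view offsets 0, conv_state_size, 2*conv_state_size
         + channel   * (d_conv - 1)     // nb1 of the views above
         + conv_step;                   // fastest-varying index
}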
Step 2: Causal Conv1d for Q + // Reshape input: {d_inner, n_tokens} -> {d_inner, n_seq_tokens, n_seqs} + ggml_tensor * q_3d = ggml_reshape_3d(ctx0, q_proj, d_inner, n_seq_tokens, n_seqs); + + // Concat Q conv state and current input: {d_conv-1 + n_seq_tokens, d_inner, n_seqs} + ggml_tensor * conv_q = ggml_concat(ctx0, conv_state_q, ggml_transpose(ctx0, q_3d), 0); + + // Save last (d_conv-1) columns back to Q conv state + ggml_tensor * last_conv_q = ggml_view_3d(ctx0, conv_q, d_conv - 1, d_inner, n_seqs, + conv_q->nb[1], conv_q->nb[2], n_seq_tokens * conv_q->nb[0]); + ggml_build_forward_expand(gf, + ggml_cpy(ctx0, last_conv_q, + ggml_view_1d(ctx0, conv_states_all, conv_state_size * n_seqs, + kv_head * n_embd_r_total * ggml_element_size(conv_states_all)))); + + // Reshape conv weight: GGUF [d_conv, 1, d_inner, 1] -> ggml_ssm_conv expects [d_conv, d_inner] + // GGUF stores as [d_conv, 1, d_inner, 1] with memory layout w[conv_step + channel * d_conv] + // vLLM stores as [d_inner, d_conv] with memory layout w[channel * d_conv + conv_step] + // ggml_ssm_conv computes: c[conv_step + channel * d_conv] + // GGUF layout: [d_conv, 1, d_inner] or [d_conv, 1, d_inner, 1] -> reshape to [d_conv, d_inner] + ggml_tensor * conv_weight = nullptr; + if (layer.ssm_q_conv) { + // Reshape conv weight from [d_conv, 1, d_inner, 1] to [d_conv, d_inner] for ggml_ssm_conv + // Cast to F32 if quantized (ggml_ssm_conv requires float weights) + ggml_tensor * q_conv_f32 = layer.ssm_q_conv; + if (q_conv_f32->type != GGML_TYPE_F32) { + q_conv_f32 = ggml_cast(ctx0, q_conv_f32, GGML_TYPE_F32); + } + conv_weight = ggml_reshape_2d(ctx0, q_conv_f32, d_conv, d_inner); + } + + // Apply conv1d + ggml_tensor * Qcur; + if (conv_weight) { + // Make conv_q contiguous for ggml_ssm_conv + conv_q = ggml_cont(ctx0, conv_q); + + // ggml_ssm_conv output: {d_inner, n_seq_tokens, n_seqs} + Qcur = ggml_ssm_conv(ctx0, conv_q, conv_weight); + // Reshape to 2D for bias add: {d_inner, n_tokens} + Qcur = ggml_reshape_2d(ctx0, Qcur, d_inner, n_tokens); + if (layer.ssm_q_conv_b) { + Qcur = ggml_add(ctx0, Qcur, layer.ssm_q_conv_b); + } + Qcur = ggml_silu(ctx0, Qcur); + } else { + GGML_ABORT("KDA layer missing Q conv weight"); + } + + // K conv1d (with separate K conv state) + ggml_tensor * Kcur; + if (layer.ssm_k_conv) { + ggml_tensor * k_3d = ggml_reshape_3d(ctx0, k_proj, d_inner, n_seq_tokens, n_seqs); + ggml_tensor * conv_k = ggml_cont(ctx0, ggml_concat(ctx0, conv_state_k, ggml_transpose(ctx0, k_3d), 0)); + + // Save K conv state + ggml_tensor * last_conv_k = ggml_view_3d(ctx0, conv_k, d_conv - 1, d_inner, n_seqs, + conv_k->nb[1], conv_k->nb[2], n_seq_tokens * conv_k->nb[0]); + ggml_build_forward_expand(gf, + ggml_cpy(ctx0, last_conv_k, + ggml_view_1d(ctx0, conv_states_all, conv_state_size * n_seqs, + (kv_head * n_embd_r_total + conv_state_size) * ggml_element_size(conv_states_all)))); + + ggml_tensor * k_conv_f32 = layer.ssm_k_conv; + if (k_conv_f32->type != GGML_TYPE_F32) { + k_conv_f32 = ggml_cast(ctx0, k_conv_f32, GGML_TYPE_F32); + } + ggml_tensor * k_conv_weight = ggml_reshape_2d(ctx0, k_conv_f32, d_conv, d_inner); + Kcur = ggml_ssm_conv(ctx0, conv_k, k_conv_weight); + Kcur = ggml_reshape_2d(ctx0, Kcur, d_inner, n_tokens); + if (layer.ssm_k_conv_b) { + Kcur = ggml_add(ctx0, Kcur, layer.ssm_k_conv_b); + } + Kcur = ggml_silu(ctx0, Kcur); + } else { + GGML_ABORT("KDA layer missing K conv weight"); + } + + // V conv1d (with separate V conv state) + ggml_tensor * Vcur; + if (layer.ssm_v_conv) { + ggml_tensor * v_3d = ggml_reshape_3d(ctx0, v_proj, 
d_inner, n_seq_tokens, n_seqs); + ggml_tensor * conv_v = ggml_cont(ctx0, ggml_concat(ctx0, conv_state_v, ggml_transpose(ctx0, v_3d), 0)); + + // Save V conv state + ggml_tensor * last_conv_v = ggml_view_3d(ctx0, conv_v, d_conv - 1, d_inner, n_seqs, + conv_v->nb[1], conv_v->nb[2], n_seq_tokens * conv_v->nb[0]); + ggml_build_forward_expand(gf, + ggml_cpy(ctx0, last_conv_v, + ggml_view_1d(ctx0, conv_states_all, conv_state_size * n_seqs, + (kv_head * n_embd_r_total + 2 * conv_state_size) * ggml_element_size(conv_states_all)))); + + ggml_tensor * v_conv_f32 = layer.ssm_v_conv; + if (v_conv_f32->type != GGML_TYPE_F32) { + v_conv_f32 = ggml_cast(ctx0, v_conv_f32, GGML_TYPE_F32); + } + ggml_tensor * v_conv_weight = ggml_reshape_2d(ctx0, v_conv_f32, d_conv, d_inner); + Vcur = ggml_ssm_conv(ctx0, conv_v, v_conv_weight); + Vcur = ggml_reshape_2d(ctx0, Vcur, d_inner, n_tokens); + if (layer.ssm_v_conv_b) { + Vcur = ggml_add(ctx0, Vcur, layer.ssm_v_conv_b); + } + Vcur = ggml_silu(ctx0, Vcur); + } else { + GGML_ABORT("KDA layer missing V conv weight"); + } + + // Step 3: Compute g1 (forget gate) + // g1 = -exp(A_log) * softplus(f_b(f_a(x)) + dt_bias) + ggml_tensor * f_a = ggml_mul_mat(ctx0, layer.ssm_f_a, cur); + ggml_tensor * g1 = ggml_mul_mat(ctx0, layer.ssm_f_b, f_a); + g1 = ggml_add(ctx0, g1, layer.ssm_dt_b); + g1 = ggml_softplus(ctx0, g1); + g1 = ggml_reshape_3d(ctx0, g1, head_dim, n_head, n_tokens); + + // A_log shape is [1, n_head] or [1, n_head, 1, 1], need to broadcast to [head_dim, n_head, n_tokens] + // First compute -exp(A_log), then reshape for broadcasting + ggml_tensor * A_neg_exp = ggml_neg(ctx0, ggml_exp(ctx0, layer.ssm_a_log)); + // Reshape to [1, n_head, 1] for broadcasting with g1 [head_dim, n_head, n_tokens] + A_neg_exp = ggml_reshape_3d(ctx0, A_neg_exp, 1, n_head, 1); + g1 = ggml_mul(ctx0, g1, A_neg_exp); + cb(g1, "kda_g1", il); + + // Step 4: Compute beta (mixing coefficient) + ggml_tensor * beta = ggml_mul_mat(ctx0, layer.ssm_beta, cur); + beta = ggml_sigmoid(ctx0, beta); + cb(beta, "kda_beta", il); + + // Step 5: Reshape for KDA recurrence + // {n_embd, n_tokens} -> {n_embd, n_seq_tokens, n_seqs} + cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs); + + Qcur = ggml_cont(ctx0, ggml_reshape_4d(ctx0, Qcur, head_dim, n_head, n_seq_tokens, n_seqs)); + Kcur = ggml_cont(ctx0, ggml_reshape_4d(ctx0, Kcur, head_dim, n_head, n_seq_tokens, n_seqs)); + Vcur = ggml_cont(ctx0, ggml_reshape_4d(ctx0, Vcur, head_dim, n_head, n_seq_tokens, n_seqs)); + g1 = ggml_cont(ctx0, ggml_reshape_4d(ctx0, g1, head_dim, n_head, n_seq_tokens, n_seqs)); + beta = ggml_cont(ctx0, ggml_reshape_3d(ctx0, beta, n_head, n_seq_tokens, n_seqs)); + + cb(Qcur, "kda_Q", il); + cb(Kcur, "kda_K", il); + cb(Vcur, "kda_V", il); + + // Step 6: Get SSM state and compute KDA recurrence using ggml_kda_scan + ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il); + + // Use build_rs with lambda pattern (like Mamba SSM scan) + auto get_kda_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) { + ggml_tensor * h_state = ggml_reshape_4d(ctx, states, head_dim, head_dim, n_head, mctx_cur->get_size()); + // Call ggml_kda_scan which implements the correct KDA recurrence + return ggml_kda_scan(ctx, h_state, Qcur, Kcur, Vcur, g1, beta, ids); + }; + + ggml_tensor * y_kda = build_rs(inp_rs, ssm_states_all, hparams.n_embd_s(), n_seqs, get_kda_rows); + cb(y_kda, "kda_scan_out", il); + + // Store updated state back + // y_kda contains: [attention_output (head_dim * n_head * n_seq_tokens * n_seqs), new_state 
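// A self-contained, single-head, single-token sketch of the delta-rule update that the
// ggml_kda_scan call above is expected to perform: per-channel decay first, then a
// retrieval-corrected rank-1 update, then the readout. Plain C++ with a row-major
// D x D state; the real op additionally loops over tokens, heads and sequences, and
// q/k are assumed to be already L2-normalized and scaled. Names are illustrative only.
#include <cmath>
#include <vector>

static void kda_step_ref(std::vector<float> & h,            // D*D state, h[i*D + j]
                         const float * q, const float * k,  // D
                         const float * v, const float * g,  // D
                         float beta, float * o, int D) {
    // h = diag(exp(g)) * h
    for (int i = 0; i < D; ++i) {
        const float decay = std::exp(g[i]);
        for (int j = 0; j < D; ++j) h[i*D + j] *= decay;
    }
    // delta = beta * (v - h^T k);  h += k * delta^T (rank-1 correction, column by column)
    for (int j = 0; j < D; ++j) {
        float hk = 0.0f;
        for (int i = 0; i < D; ++i) hk += h[i*D + j] * k[i];
        const float delta = beta * (v[j] - hk);
        for (int i = 0; i < D; ++i) h[i*D + j] += k[i] * delta;
    }
    // o = h^T q
    for (int j = 0; j < D; ++j) {
        float acc = 0.0f;
        for (int i = 0; i < D; ++i) acc += h[i*D + j] * q[i];
        o[j] = acc;
    }
}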
(head_dim * head_dim * n_head * n_seqs)] + const int64_t attn_out_size = head_dim * n_head * n_seq_tokens * n_seqs; + const int64_t state_size = head_dim * head_dim * n_head; + ggml_build_forward_expand(gf, + ggml_cpy(ctx0, + ggml_view_1d(ctx0, y_kda, state_size * n_seqs, attn_out_size * ggml_element_size(y_kda)), + ggml_view_1d(ctx0, ssm_states_all, state_size * n_seqs, kv_head * state_size * ggml_element_size(ssm_states_all)))); + + // Extract attention output + ggml_tensor * attn_out = ggml_view_1d(ctx0, y_kda, attn_out_size, 0); + attn_out = ggml_reshape_3d(ctx0, attn_out, head_dim, n_head, n_seq_tokens * n_seqs); + cb(attn_out, "kda_attn_out", il); + + // Step 7: Output gating g2 = g_b(g_a(x)) + ggml_tensor * cur_2d = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs); + ggml_tensor * g_a = ggml_mul_mat(ctx0, layer.ssm_g_a, cur_2d); + ggml_tensor * g2 = ggml_mul_mat(ctx0, layer.ssm_g_b, g_a); + g2 = ggml_reshape_3d(ctx0, g2, head_dim, n_head, n_seq_tokens * n_seqs); + + // Step 8: Apply o_norm with sigmoid gating + // Note: Kimi model uses sigmoid gating, not SiLU (despite FusedRMSNormGated default being swish) + // Formula: output = RMSNorm(x) * sigmoid(g) + ggml_tensor * normed = build_norm(attn_out, layer.ssm_o_norm, layer.ssm_o_norm_b, LLM_NORM_RMS, il); + ggml_tensor * gate = ggml_sigmoid(ctx0, g2); + ggml_tensor * gated = ggml_mul(ctx0, normed, gate); + + // Step 9: Output projection + gated = ggml_cont_2d(ctx0, gated, d_inner, n_tokens); + cur = ggml_mul_mat(ctx0, layer.wo, gated); + cb(cur, "kda_out", il); + + + GGML_UNUSED(d_conv); + GGML_UNUSED(kq_scale_kda); + + } else if (is_mla) { + // === MLA Layer (Multi-head Latent Attention) without KV Cache === + // Reference: vLLM mla.py + // TODO: Implement proper KV caching for MLA (requires custom cache format) + + // Step 1: Q projection and reshape + // vLLM Kimi: q = q_proj(hidden_states), then view as [n_tokens, n_head, qk_head_dim] + // Note: Kimi MLA does NOT use RoPE (rotary_emb=None in vLLM) + ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.wq, cur); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head_k_mla, n_head, n_tokens); + cb(Qcur, "mla_Q", il); + + // Step 2: KV compression + // kv_lora = kv_a_proj_with_mqa(hidden_states) -> [kv_lora_rank + qk_rope_head_dim, n_tokens] + ggml_tensor * kv_lora = ggml_mul_mat(ctx0, layer.wkv_a_mqa, cur); + + // Split: kv_c = kv_lora[:kv_lora_rank], k_pe = kv_lora[kv_lora_rank:] + ggml_tensor * kv_c = ggml_view_2d(ctx0, kv_lora, kv_lora_rank, n_tokens, + ggml_row_size(kv_lora->type, kv_lora_rank + n_embd_head_qk_rope), 0); + ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_lora, n_embd_head_qk_rope, 1, n_tokens, + ggml_row_size(kv_lora->type, kv_lora_rank + n_embd_head_qk_rope), + ggml_row_size(kv_lora->type, kv_lora_rank + n_embd_head_qk_rope), + ggml_row_size(kv_lora->type, kv_lora_rank)); + + // Note: Kimi MLA does NOT apply RoPE (rotary_emb=None in vLLM) + // k_pe is used directly without RoPE + + // Normalize kv_c + kv_c = build_norm(kv_c, layer.attn_kv_a_norm, nullptr, LLM_NORM_RMS, il); + + // KV decompression: kv = kv_b_proj(kv_c_normed) + ggml_tensor * kv = ggml_mul_mat(ctx0, layer.wkv_b, kv_c); + const int64_t kv_per_head = n_embd_head_qk_nope + n_embd_head_v_mla; + + // Split kv into k_nope and v + ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens, + ggml_row_size(kv->type, kv_per_head), + ggml_row_size(kv->type, kv_per_head * n_head), 0); + ggml_tensor * Vcur = ggml_view_3d(ctx0, kv, n_embd_head_v_mla, n_head, n_tokens, + 
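// Worked numbers for the splits above, using the Kimi defaults quoted earlier in this
// file (kv_lora_rank = 512, qk_rope_head_dim = 64, qk_nope = v_head_dim = 128); a
// compile-time sanity sketch only, not part of the model code.
namespace kimi_mla_dims {
    constexpr int64_t kv_lora_rank     = 512;
    constexpr int64_t qk_rope_head_dim = 64;
    constexpr int64_t qk_nope_head_dim = 128;
    constexpr int64_t v_head_dim       = 128;
    // wkv_a_mqa projects n_embd -> kv_lora_rank + qk_rope_head_dim rows, matching the
    // [2304, 576] tensor shape noted at the top of this file.
    constexpr int64_t kv_a_out    = kv_lora_rank + qk_rope_head_dim;   // 576
    // wkv_b expands each compressed row into n_head blocks of [k_nope | v].
    constexpr int64_t kv_per_head = qk_nope_head_dim + v_head_dim;     // 256
    static_assert(kv_a_out == 576 && kv_per_head == 256, "Kimi MLA split sizes");
}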
ggml_row_size(kv->type, kv_per_head), + ggml_row_size(kv->type, kv_per_head * n_head), + ggml_row_size(kv->type, n_embd_head_qk_nope)); + k_nope = ggml_cont(ctx0, k_nope); + Vcur = ggml_cont(ctx0, Vcur); + + // Concatenate k_nope + k_pe (broadcast k_pe to all heads) + // K = [k_nope, k_pe] where k_nope is [qk_nope_head_dim, n_head, n_tokens] + // and k_pe is [qk_rope_head_dim, 1, n_tokens] broadcast to all heads + k_pe = ggml_cont(ctx0, k_pe); + // Need to broadcast k_pe from [qk_rope, 1, n_tokens] to [qk_rope, n_head, n_tokens] + ggml_tensor * k_pe_target = ggml_new_tensor_3d(ctx0, k_pe->type, n_embd_head_qk_rope, n_head, n_tokens); + ggml_tensor * k_pe_repeated = ggml_repeat(ctx0, k_pe, k_pe_target); + ggml_tensor * Kcur = ggml_concat(ctx0, k_nope, k_pe_repeated, 0); + cb(Kcur, "mla_K", il); + cb(Vcur, "mla_V", il); + + // Direct softmax attention (without KV cache) + // Use build_attn with inp_no_cache for proper mask handling + cur = build_attn(inp_no_cache, layer.wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale_mla, il); + cb(cur, "mla_out", il); + + } else { + // Unknown layer type - this should not happen + GGML_ABORT("Kimi layer is neither KDA nor MLA - missing required tensors"); + } + + // On last layer, select only the output tokens + if (il == n_layer - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + // Residual + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // FFN Norm + cur = build_norm(ffn_inp, layer.ffn_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + // FFN / MoE + if (layer.ffn_gate_inp) { + // MoE layer + // Kimi uses moe_renormalize=True and routed_scaling_factor (stored as expert_weights_scale) = 2.446 + ggml_tensor * moe_out = build_moe_ffn(cur, layer.ffn_gate_inp, layer.ffn_up_exps, layer.ffn_gate_exps, layer.ffn_down_exps, + layer.ffn_exp_probs_b, hparams.n_expert, hparams.n_expert_used, + LLM_FFN_SILU, true, true, hparams.expert_weights_scale, + (llama_expert_gating_func_type) hparams.expert_gating_func, il); + cb(moe_out, "ffn_moe_out", il); + + // Shared expert (if present) + if (layer.ffn_gate_shexp) { + ggml_tensor * ffn_shexp = build_ffn(cur, + layer.ffn_up_shexp, NULL, NULL, + layer.ffn_gate_shexp, NULL, NULL, + layer.ffn_down_shexp, NULL, NULL, + NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(ffn_shexp, "ffn_shexp", il); + + cur = ggml_add(ctx0, moe_out, ffn_shexp); + cb(cur, "ffn_out", il); + } else { + cur = moe_out; + } + } else if (layer.ffn_gate) { + // Dense FFN layer + cur = build_ffn(cur, layer.ffn_up, NULL, NULL, layer.ffn_gate, NULL, NULL, + layer.ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } else { + // No FFN - this should not happen in Kimi + GGML_ABORT("Kimi layer missing FFN tensors"); + } + + // Residual + cur = ggml_add(ctx0, cur, ffn_inp); + inpL = cur; + } + + // Final Norm + cur = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // Output + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); + + GGML_UNUSED(n_embd_head_qk_nope); +} From 84f822c5a58b0967e312c85cea4f3bbbea97184e Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Tue, 2 Dec 2025 08:51:09 +0800 Subject: [PATCH 02/58] kimi linear convert_hf_to_gguf --- convert_hf_to_gguf.py | 293 ++++++++++++++++++++++++++++++++++++++++++ src/models/models.h | 6 + 2 files 
changed, 299 insertions(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index a54cce887b..11dd9f610a 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -563,6 +563,10 @@ class ModelBase: gguf.MODEL_TENSOR.A_ENC_EMBD_POS, gguf.MODEL_TENSOR.ALTUP_CORRECT_COEF, gguf.MODEL_TENSOR.ALTUP_PREDICT_COEF, + # Kimi KDA conv weights should be F32 + gguf.MODEL_TENSOR.SSM_CONV1D_Q, + gguf.MODEL_TENSOR.SSM_CONV1D_K, + gguf.MODEL_TENSOR.SSM_CONV1D_V, ) ) or new_name[-7:] not in (".weight", ".lora_a", ".lora_b") @@ -4976,6 +4980,295 @@ class CodeShellModel(TextModel): self.gguf_writer.add_rope_scaling_factor(1.0) +@ModelBase.register("KimiLinearModel", "KimiLinearForCausalLM") +class KimiLinearModel(TextModel): + """Kimi-Linear model with hybrid MLA+KDA architecture""" + model_arch = gguf.MODEL_ARCH.KIMI_LINEAR + + _experts: list[dict[str, Tensor]] | None = None + + def set_gguf_parameters(self): + self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) + + # Use find_hparam for context length + # Kimi uses model_max_length + n_ctx = self.find_hparam(["max_position_embeddings", "model_max_length", "n_ctx", "n_positions"], optional=True) + if n_ctx is not None: + self.gguf_writer.add_context_length(n_ctx) + else: + # Default to 4096 if not found + logger.warning("No context length found in config, defaulting to 4096") + self.gguf_writer.add_context_length(4096) + + self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"]) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + self.gguf_writer.add_file_type(self.ftype) + + # KDA & MLA params + # Get ssm_d_conv from linear_attn_config.short_conv_kernel_size or ssm_d_conv + linear_attn_config = self.hparams.get("linear_attn_config", {}) + ssm_d_conv = self.hparams.get("ssm_d_conv") or linear_attn_config.get("short_conv_kernel_size") + if ssm_d_conv is not None: + self.gguf_writer.add_ssm_conv_kernel(ssm_d_conv) + + # MLA params - use add_* methods that handle arch substitution + # Support both HuggingFace naming (q_lora_rank, kv_lora_rank) and internal naming (n_lora_q, n_lora_kv) + q_lora_rank = self.hparams.get("q_lora_rank", self.hparams.get("n_lora_q")) + kv_lora_rank = self.hparams.get("kv_lora_rank", self.hparams.get("n_lora_kv")) + + if q_lora_rank is not None: + self.gguf_writer.add_q_lora_rank(q_lora_rank) + if kv_lora_rank is not None: + self.gguf_writer.add_kv_lora_rank(kv_lora_rank) + + # MLA head dimensions + # Support HuggingFace naming: qk_nope_head_dim, qk_rope_head_dim, v_head_dim + qk_nope_head_dim = self.hparams.get("qk_nope_head_dim") + qk_rope_head_dim = self.hparams.get("qk_rope_head_dim", self.hparams.get("n_rot")) + v_head_dim = self.hparams.get("v_head_dim") + + # Calculate n_embd_head_k_mla = qk_nope_head_dim + qk_rope_head_dim + if "n_embd_head_k_mla" in self.hparams: + self.gguf_writer.add_key_length_mla(self.hparams["n_embd_head_k_mla"]) + elif qk_nope_head_dim is not None and qk_rope_head_dim is not None: + n_embd_head_k_mla = qk_nope_head_dim + qk_rope_head_dim + self.gguf_writer.add_key_length_mla(n_embd_head_k_mla) + + # n_embd_head_v_mla = v_head_dim + if "n_embd_head_v_mla" in self.hparams: + self.gguf_writer.add_value_length_mla(self.hparams["n_embd_head_v_mla"]) + elif 
v_head_dim is not None: + self.gguf_writer.add_value_length_mla(v_head_dim) + + # Rotation - use qk_rope_head_dim for Kimi + rope_dim = self.hparams.get("qk_rope_head_dim") or self.hparams.get("n_rot") + if rope_dim is not None: + self.gguf_writer.add_rope_dimension_count(rope_dim) + else: + # Default to head_dim + head_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + self.gguf_writer.add_rope_dimension_count(head_dim) + + self.gguf_writer.add_rope_freq_base(self.hparams.get("rope_theta", 10000.0)) + + # MoE params + n_experts = self.hparams.get("num_local_experts", self.hparams.get("num_experts")) + if n_experts is not None: + self.gguf_writer.add_expert_count(n_experts) + # Support both num_experts_per_tok and num_experts_per_token + n_experts_used = self.hparams.get("num_experts_per_tok", self.hparams.get("num_experts_per_token")) + if n_experts_used is not None: + self.gguf_writer.add_expert_used_count(n_experts_used) + + # moe_intermediate_size (1024 for Kimi) + moe_intermediate_size = self.hparams.get("moe_intermediate_size") + if moe_intermediate_size is not None: + self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) + + # num_shared_experts (1 for Kimi) + num_shared_experts = self.hparams.get("num_shared_experts") + if num_shared_experts is not None: + self.gguf_writer.add_expert_shared_count(num_shared_experts) + + # first_k_dense_replace (1 for Kimi - first layer uses dense MLP) + first_k_dense_replace = self.hparams.get("first_k_dense_replace") + if first_k_dense_replace is not None: + self.gguf_writer.add_leading_dense_block_count(first_k_dense_replace) + + # Expert gating function (sigmoid for Kimi) + moe_router_activation_func = self.hparams.get("moe_router_activation_func", "sigmoid") + if moe_router_activation_func == "sigmoid": + self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) + elif moe_router_activation_func == "softmax": + self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX) + else: + logger.warning(f"Unknown moe_router_activation_func: {moe_router_activation_func}, defaulting to sigmoid") + self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) + + # Routed scaling factor (expert_weights_scale = 2.446 for Kimi) + routed_scaling_factor = self.hparams.get("routed_scaling_factor") + if routed_scaling_factor is not None: + self.gguf_writer.add_expert_weights_scale(routed_scaling_factor) + + def set_vocab(self): + # Kimi uses TikToken tokenizer - load via transformers + from transformers import AutoTokenizer + + dir_model = self.dir_model + vocab_size = self.hparams["vocab_size"] + + logger.info(f"Loading TikToken tokenizer from {dir_model}") + tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) + + tokens: list[str] = [] + toktypes: list[int] = [] + + # Get tokenizer pre string + tokpre = self.get_vocab_base_pre(tokenizer) + + # Build vocab from tokenizer + merges = [] + vocab = {} + + # TikToken stores vocab in mergeable_ranks + if hasattr(tokenizer, 'mergeable_ranks'): + mergeable_ranks = tokenizer.mergeable_ranks + for token, rank in mergeable_ranks.items(): + vocab[self._token_bytes_to_string(token)] = rank + if len(token) == 1: + continue + # Build merges + merged = self._bpe(mergeable_ranks, token, max_rank=rank) + if len(merged) == 2: + merges.append(' '.join(map(self._token_bytes_to_string, merged))) + else: + # Fallback: get vocab directly + vocab = {tok: idx for tok, idx in tokenizer.get_vocab().items()} + + # Get special 
tokens + added_vocab = {} + if hasattr(tokenizer, 'special_tokens'): + added_vocab = tokenizer.special_tokens + elif hasattr(tokenizer, 'added_tokens_encoder'): + added_vocab = tokenizer.added_tokens_encoder + + # Combine vocab + reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()} + + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.UNUSED) + elif i in added_vocab.values() if added_vocab else False: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.CONTROL) + else: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.NORMAL) + + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(dir_model, load_merges=False) + special_vocab.merges = merges + special_vocab.add_to_gguf(self.gguf_writer) + logger.info(f"Loaded {len(tokens)} tokens, {len(merges)} merges") + + @staticmethod + def _token_bytes_to_string(b: bytes) -> str: + """Convert bytes to string representation for tokenizer""" + return ''.join([chr(byte) if byte < 128 else f'<0x{byte:02X}>' for byte in b]) + + @staticmethod + def _bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]: + """BPE tokenization for merges extraction""" + parts = [bytes([b]) for b in token] + while True: + min_idx = None + min_rank = None + for i, pair in enumerate(zip(parts[:-1], parts[1:])): + rank = mergeable_ranks.get(pair[0] + pair[1]) + if rank is not None and (min_rank is None or rank < min_rank): + min_idx = i + min_rank = rank + if min_rank is None or (max_rank is not None and min_rank >= max_rank): + break + parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:] + return parts + + def prepare_tensors(self): + super().prepare_tensors() + if self._experts is not None: + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + logger.info(f"Processing {name}: shape before = {tuple(data_torch.shape)}") + + # Handle KDA conv1d weights + # HuggingFace/vLLM stores as [d_inner, d_conv] (2D), memory layout: conv_step changes fastest + # llama.cpp expects ggml ne = [d_conv, 1, d_inner, 1], memory layout: ne[0]=d_conv changes fastest + # GGUF reverses numpy shape when writing, so numpy (1, d_inner, 1, d_conv) -> ggml ne = [d_conv, 1, d_inner, 1] + # Memory layouts match: both have conv_step (d_conv) changing fastest + if name.endswith((".q_conv1d.weight", ".k_conv1d.weight", ".v_conv1d.weight")): + # HF shape: [d_inner, d_conv] e.g. 
[4096, 4] + # Target numpy shape: (1, d_inner, 1, d_conv) -> ggml ne = [d_conv, 1, d_inner, 1] + if data_torch.ndim == 2: + d_inner, d_conv = data_torch.shape + # Reshape to (1, d_inner, 1, d_conv) - memory layout preserved (d_conv fastest) + data_torch = data_torch.reshape(1, d_inner, 1, d_conv) + logger.info(f"Reshaped conv1d weight {name}: [d_inner={d_inner}, d_conv={d_conv}] -> numpy {tuple(data_torch.shape)} -> ggml ne=[{d_conv}, 1, {d_inner}, 1]") + elif data_torch.ndim == 3: + # Already 3D [d_inner, 1, d_conv] from unsqueeze + d_inner, _, d_conv = data_torch.shape + data_torch = data_torch.reshape(1, d_inner, 1, d_conv) + logger.info(f"Reshaped conv1d weight {name}: [d_inner={d_inner}, 1, d_conv={d_conv}] -> numpy {tuple(data_torch.shape)} -> ggml ne=[{d_conv}, 1, {d_inner}, 1]") + + # Handle A_log: HF stores as [1, 1, num_heads, 1] + # llama.cpp expects ggml ne = [1, num_heads, 1, 1] + # GGUF reverses numpy shape: numpy (1, 1, num_heads, 1) -> ggml ne = [1, num_heads, 1, 1] + # So no transformation needed! The shapes already match after GGUF reversal. + if name.endswith(".A_log"): + if data_torch.ndim == 4: + logger.info(f"A_log {name}: numpy {tuple(data_torch.shape)} -> ggml ne={list(reversed(data_torch.shape))}") + + # Kimi specific bias + if name.endswith("block_sparse_moe.gate.e_score_correction_bias"): + new_name = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_EXP_PROBS_B, bid) + return [(new_name, data_torch)] + + # process the experts separately + if name.find("block_sparse_moe.experts") != -1: + n_experts = self.hparams.get("num_local_experts", self.hparams.get("num_experts")) + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + tensors = [] + # w1: gate, w2: down, w3: up + for wid, tname in [("w1", gguf.MODEL_TENSOR.FFN_GATE_EXP), + ("w2", gguf.MODEL_TENSOR.FFN_DOWN_EXP), + ("w3", gguf.MODEL_TENSOR.FFN_UP_EXP)]: + datas: list[Tensor] = [] + for xid in range(n_experts): + ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + new_name = self.format_tensor_name(tname, bid) + tensors.append((new_name, data_torch)) + return tensors + return [] + + mapped_name = self.map_tensor_name(name) + logger.info(f"Returning {mapped_name}: shape after = {tuple(data_torch.shape)}") + return [(mapped_name, data_torch)] + + def get_vocab_base(self) -> tuple[list[str], list[int], str]: + # This method is not used when set_vocab is overridden + # But adding it for completeness in case it's called elsewhere + logger.warning("get_vocab_base called, but set_vocab is already overridden") + vocab_size = self.hparams.get("vocab_size", 100) + tokens = [f"" for i in range(vocab_size)] + tokens[0] = "" + tokens[1] = "" + tokens[2] = "" + toktypes = [gguf.TokenType.NORMAL] * vocab_size + return tokens, toktypes, "gpt-2" + @ModelBase.register("InternLM2ForCausalLM") class InternLM2Model(TextModel): model_arch = gguf.MODEL_ARCH.INTERNLM2 diff --git a/src/models/models.h b/src/models/models.h index d93601ad06..8b7af8d7bc 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -283,6 +283,12 @@ struct llm_build_jamba : public llm_graph_context_mamba { llm_build_jamba(const llama_model & model, const llm_graph_params & params); }; +struct llm_build_kimi_linear : public 
llm_graph_context_mamba { + llm_build_kimi_linear(const llama_model & model, const llm_graph_params & params); +private: + const llama_model & model; +}; + struct llm_build_lfm2 : public llm_graph_context { const llama_model & model; From 57cca52779d97cf21a5e8fdbb540467353280e0f Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Tue, 2 Dec 2025 10:40:44 +0800 Subject: [PATCH 03/58] kimi linear constants.py tensor_mapping.py --- convert_hf_to_gguf.py | 47 ++------------------------ gguf-py/gguf/constants.py | 61 ++++++++++++++++++++++++++++++++++ gguf-py/gguf/tensor_mapping.py | 37 +++++++++++++++++++++ src/CMakeLists.txt | 1 + 4 files changed, 102 insertions(+), 44 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 11dd9f610a..ba21124d6f 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4988,6 +4988,7 @@ class KimiLinearModel(TextModel): _experts: list[dict[str, Tensor]] | None = None def set_gguf_parameters(self): + super().set_gguf_parameters() self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) # Use find_hparam for context length @@ -5000,14 +5001,6 @@ class KimiLinearModel(TextModel): logger.warning("No context length found in config, defaulting to 4096") self.gguf_writer.add_context_length(4096) - self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"]) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - self.gguf_writer.add_file_type(self.ftype) - # KDA & MLA params # Get ssm_d_conv from linear_attn_config.short_conv_kernel_size or ssm_d_conv linear_attn_config = self.hparams.get("linear_attn_config", {}) @@ -5053,17 +5046,6 @@ class KimiLinearModel(TextModel): head_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(head_dim) - self.gguf_writer.add_rope_freq_base(self.hparams.get("rope_theta", 10000.0)) - - # MoE params - n_experts = self.hparams.get("num_local_experts", self.hparams.get("num_experts")) - if n_experts is not None: - self.gguf_writer.add_expert_count(n_experts) - # Support both num_experts_per_tok and num_experts_per_token - n_experts_used = self.hparams.get("num_experts_per_tok", self.hparams.get("num_experts_per_token")) - if n_experts_used is not None: - self.gguf_writer.add_expert_used_count(n_experts_used) - # moe_intermediate_size (1024 for Kimi) moe_intermediate_size = self.hparams.get("moe_intermediate_size") if moe_intermediate_size is not None: @@ -5079,16 +5061,6 @@ class KimiLinearModel(TextModel): if first_k_dense_replace is not None: self.gguf_writer.add_leading_dense_block_count(first_k_dense_replace) - # Expert gating function (sigmoid for Kimi) - moe_router_activation_func = self.hparams.get("moe_router_activation_func", "sigmoid") - if moe_router_activation_func == "sigmoid": - self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) - elif moe_router_activation_func == "softmax": - self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX) - else: - logger.warning(f"Unknown moe_router_activation_func: {moe_router_activation_func}, defaulting to sigmoid") - self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) - # Routed scaling factor (expert_weights_scale = 
2.446 for Kimi) routed_scaling_factor = self.hparams.get("routed_scaling_factor") if routed_scaling_factor is not None: @@ -5220,9 +5192,8 @@ class KimiLinearModel(TextModel): logger.info(f"A_log {name}: numpy {tuple(data_torch.shape)} -> ggml ne={list(reversed(data_torch.shape))}") # Kimi specific bias - if name.endswith("block_sparse_moe.gate.e_score_correction_bias"): - new_name = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_EXP_PROBS_B, bid) - return [(new_name, data_torch)] + if name.endswith("e_score_correction_bias"): + name = name.replace("e_score_correction_bias", "e_score_correction.bias") # process the experts separately if name.find("block_sparse_moe.experts") != -1: @@ -5257,18 +5228,6 @@ class KimiLinearModel(TextModel): logger.info(f"Returning {mapped_name}: shape after = {tuple(data_torch.shape)}") return [(mapped_name, data_torch)] - def get_vocab_base(self) -> tuple[list[str], list[int], str]: - # This method is not used when set_vocab is overridden - # But adding it for completeness in case it's called elsewhere - logger.warning("get_vocab_base called, but set_vocab is already overridden") - vocab_size = self.hparams.get("vocab_size", 100) - tokens = [f"" for i in range(vocab_size)] - tokens[0] = "" - tokens[1] = "" - tokens[2] = "" - toktypes = [gguf.TokenType.NORMAL] * vocab_size - return tokens, toktypes, "gpt-2" - @ModelBase.register("InternLM2ForCausalLM") class InternLM2Model(TextModel): model_arch = gguf.MODEL_ARCH.INTERNLM2 diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 2b8489c591..485c41abfb 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -446,6 +446,7 @@ class MODEL_ARCH(IntEnum): RND1 = auto() PANGU_EMBED = auto() MISTRAL3 = auto() + KIMI_LINEAR = auto() # Kimi-Linear (hybrid MLA+KDA) class VISION_PROJECTOR_TYPE(IntEnum): @@ -535,6 +536,16 @@ class MODEL_TENSOR(IntEnum): SSM_NORM = auto() SSM_OUT = auto() SSM_BETA_ALPHA = auto() # qwen3next + SSM_CONV1D_Q = auto() # Kimi Linear + SSM_CONV1D_K = auto() # Kimi Linear + SSM_CONV1D_V = auto() # Kimi Linear + SSM_F_A = auto() # Kimi Linear + SSM_F_B = auto() # Kimi Linear + SSM_BETA = auto() # Kimi Linear + SSM_A_LOG = auto() # Kimi Linear + SSM_G_A = auto() # Kimi Linear + SSM_G_B = auto() # Kimi Linear + SSM_DT_B = auto() # Kimi Linear TIME_MIX_W0 = auto() TIME_MIX_W1 = auto() TIME_MIX_W2 = auto() @@ -820,6 +831,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.RND1: "rnd1", MODEL_ARCH.PANGU_EMBED: "pangu-embedded", MODEL_ARCH.MISTRAL3: "mistral3", + MODEL_ARCH.KIMI_LINEAR: "kimi-linear", } VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = { @@ -907,6 +919,16 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = { MODEL_TENSOR.SSM_NORM: "blk.{bid}.ssm_norm", MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out", MODEL_TENSOR.SSM_BETA_ALPHA: "blk.{bid}.ssm_ba", + MODEL_TENSOR.SSM_CONV1D_Q: "blk.{bid}.ssm_conv1d_q", # Kimi Linear + MODEL_TENSOR.SSM_CONV1D_K: "blk.{bid}.ssm_conv1d_k", # Kimi Linear + MODEL_TENSOR.SSM_CONV1D_V: "blk.{bid}.ssm_conv1d_v", # Kimi Linear + MODEL_TENSOR.SSM_F_A: "blk.{bid}.ssm_f_a", # Kimi Linear + MODEL_TENSOR.SSM_F_B: "blk.{bid}.ssm_f_b", # Kimi Linear + MODEL_TENSOR.SSM_BETA: "blk.{bid}.ssm_beta", # Kimi Linear + MODEL_TENSOR.SSM_A_LOG: "blk.{bid}.ssm_a", # Kimi Linear + MODEL_TENSOR.SSM_G_A: "blk.{bid}.ssm_g_a", # Kimi Linear + MODEL_TENSOR.SSM_G_B: "blk.{bid}.ssm_g_b", # Kimi Linear + MODEL_TENSOR.SSM_DT_B: "blk.{bid}.ssm_dt", # Kimi Linear MODEL_TENSOR.TIME_MIX_W0: "blk.{bid}.time_mix_w0", MODEL_TENSOR.TIME_MIX_W1: 
"blk.{bid}.time_mix_w1", MODEL_TENSOR.TIME_MIX_W2: "blk.{bid}.time_mix_w2", @@ -3094,6 +3116,45 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_UP_EXP, ], + MODEL_ARCH.KIMI_LINEAR: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_Q_A, + MODEL_TENSOR.ATTN_Q_B, + MODEL_TENSOR.ATTN_KV_A_MQA, + MODEL_TENSOR.ATTN_KV_B, + MODEL_TENSOR.ATTN_Q_A_NORM, + MODEL_TENSOR.ATTN_KV_A_NORM, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.SSM_CONV1D_Q, + MODEL_TENSOR.SSM_CONV1D_K, + MODEL_TENSOR.SSM_CONV1D_V, + MODEL_TENSOR.SSM_F_A, + MODEL_TENSOR.SSM_F_B, + MODEL_TENSOR.SSM_BETA, + MODEL_TENSOR.SSM_A_LOG, + MODEL_TENSOR.SSM_G_A, + MODEL_TENSOR.SSM_G_B, + MODEL_TENSOR.SSM_NORM, + MODEL_TENSOR.SSM_DT_B, + MODEL_TENSOR.FFN_EXP_PROBS_B, + MODEL_TENSOR.FFN_GATE_SHEXP, + MODEL_TENSOR.FFN_DOWN_SHEXP, + MODEL_TENSOR.FFN_UP_SHEXP, + ], # TODO } diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index a7b0973979..cfe541fc41 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -389,6 +389,7 @@ class TensorNameMap: "model.layers.{bid}.mlp.expert_bias", # afmoe "model.layers.{bid}.feed_forward.expert_bias", # lfm2moe "model.layers.{bid}.block_sparse_moe.e_score_correction", # minimax-m2 + "model.layers.{bid}.block_sparse_moe.gate.e_score_correction", # kimi ), # Feed-forward up @@ -450,6 +451,7 @@ class TensorNameMap: "model.layers.{bid}.feed_forward.shared_expert.up_proj", # llama4 "model.layers.{bid}.feed_forward.down_proj", "model.layers.{bid}.mlp.shared_mlp.up_proj", # hunyuan + "model.layers.{bid}.block_sparse_moe.shared_experts.up_proj", # kimi ), MODEL_TENSOR.FFN_UP_CHEXP: ( @@ -500,6 +502,7 @@ class TensorNameMap: MODEL_TENSOR.FFN_GATE_CHEXP: ( "model.layers.{bid}.mlp.chunk_experts.gate_proj", # grovemoe + "model.layers.{bid}.block_sparse_moe.shared_experts.gate_proj", # kimi ), # Feed-forward down @@ -557,6 +560,7 @@ class TensorNameMap: "model.layers.{bid}.feed_forward.shared_expert.down_proj", # llama4 "model.layers.{bid}.shared_mlp.output_linear", # granitemoe "model.layers.{bid}.mlp.shared_mlp.down_proj", # hunyuan + "model.layers.{bid}.block_sparse_moe.shared_experts.down_proj", # kimi ), MODEL_TENSOR.FFN_DOWN_CHEXP: ( @@ -738,6 +742,7 @@ class TensorNameMap: "model.layers.{bid}.mamba.norm", # falcon-h1 granite-hybrid "model.layers.{bid}.linear_attn.norm", # qwen3next "backbone.layers.{bid}.mixer.norm", # mamba2 + "model.layers.{bid}.self_attn.o_norm", # kimi ), MODEL_TENSOR.SSM_OUT: ( @@ -1569,6 +1574,38 @@ class TensorNameMap: "audio.multi_modal_projector.ln_mid", # ultravox ), + # Kimi Linear KDA (using SSM_ prefix for consistency) + MODEL_TENSOR.SSM_CONV1D_Q: ( + "model.layers.{bid}.self_attn.q_conv1d", + ), + MODEL_TENSOR.SSM_CONV1D_K: ( + "model.layers.{bid}.self_attn.k_conv1d", + ), + MODEL_TENSOR.SSM_CONV1D_V: ( + "model.layers.{bid}.self_attn.v_conv1d", + ), + MODEL_TENSOR.SSM_F_A: ( + "model.layers.{bid}.self_attn.f_a_proj", + ), + MODEL_TENSOR.SSM_F_B: ( + "model.layers.{bid}.self_attn.f_b_proj", + ), + MODEL_TENSOR.SSM_BETA: ( + "model.layers.{bid}.self_attn.b_proj", + ), + MODEL_TENSOR.SSM_A_LOG: ( + "model.layers.{bid}.self_attn.A_log", + 
), + MODEL_TENSOR.SSM_G_A: ( + "model.layers.{bid}.self_attn.g_a_proj", + ), + MODEL_TENSOR.SSM_G_B: ( + "model.layers.{bid}.self_attn.g_b_proj", + ), + MODEL_TENSOR.SSM_DT_B: ( + "model.layers.{bid}.self_attn.dt_bias", + ), + # NextN/MTP tensors for GLM4_MOE MODEL_TENSOR.NEXTN_EH_PROJ: ( "model.layers.{bid}.eh_proj", diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index fbd538109b..fbfcf05c70 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -82,6 +82,7 @@ add_library(llama models/internlm2.cpp models/jais.cpp models/jamba.cpp + models/kimi-linear.cpp models/lfm2.cpp models/llada-moe.cpp models/llada.cpp From 6167f39e08173783b8564bdeafb295aa0ddc4574 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Tue, 2 Dec 2025 11:14:34 +0800 Subject: [PATCH 04/58] Kimi Linear ggml.h --- ggml/include/ggml.h | 23 +++++++++++++++++++++++ gguf-py/gguf/tensor_mapping.py | 1 + 2 files changed, 24 insertions(+) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 48da68fe7e..888f00c2e8 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -539,6 +539,7 @@ extern "C" { GGML_OP_FLASH_ATTN_BACK, GGML_OP_SSM_CONV, GGML_OP_SSM_SCAN, + GGML_OP_KDA_SCAN, GGML_OP_WIN_PART, GGML_OP_WIN_UNPART, GGML_OP_GET_REL_POS, @@ -2337,6 +2338,28 @@ extern "C" { struct ggml_tensor * C, struct ggml_tensor * ids); + // KDA (Kimi Delta Attention) scan + // Delta attention recurrence: + // h[t] = exp(g[t]) * h[t-1] + k[t]^T * (beta[t] * (v[t] - h[t-1] @ k[t])) + // o[t] = q[t]^T @ h[t] + // Parameters: + // h: hidden state {head_dim, head_dim, n_head, n_seqs+} + // q: query {head_dim, n_head, n_seq_tokens, n_seqs} + // k: key {head_dim, n_head, n_seq_tokens, n_seqs} + // v: value {head_dim, n_head, n_seq_tokens, n_seqs} + // g: gate {head_dim, n_head, n_seq_tokens, n_seqs} + // beta: mixing {n_head, n_seq_tokens, n_seqs} + // ids: seq indices {n_seqs} + GGML_API struct ggml_tensor * ggml_kda_scan( + struct ggml_context * ctx, + struct ggml_tensor * h, + struct ggml_tensor * q, + struct ggml_tensor * k, + struct ggml_tensor * v, + struct ggml_tensor * g, + struct ggml_tensor * beta, + struct ggml_tensor * ids); + // partition into non-overlapping windows with padding if needed // example: // a: 768 64 64 1 diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index cfe541fc41..8774ef2dfd 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -498,6 +498,7 @@ class TensorNameMap: "model.layers.{bid}.mlp.shared_experts.gate_proj", # deepseek deepseek2 "model.layers.{bid}.feed_forward.shared_expert.gate_proj", # llama4 "model.layers.{bid}.mlp.shared_mlp.gate_proj", # hunyuan + "model.layers.{bid}.block_sparse_moe.shared_experts.gate_proj", # kimi ), MODEL_TENSOR.FFN_GATE_CHEXP: ( From 26a6553155cb735c67b1db01f3901404ee0b8c9e Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Tue, 2 Dec 2025 11:20:46 +0800 Subject: [PATCH 05/58] kimi linear ggml-cpu --- ggml/src/ggml-cpu/ggml-cpu.c | 5 + ggml/src/ggml-cpu/ops.cpp | 196 +++++++++++++++++++++++++++++++++++ ggml/src/ggml-cpu/ops.h | 1 + 3 files changed, 202 insertions(+) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 3247af8bb0..7b40f1e8c2 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -1962,6 +1962,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_ssm_scan(params, tensor); } break; + case GGML_OP_KDA_SCAN: + { + ggml_compute_forward_kda_scan(params, tensor); + } break; case 
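// Illustrative only: a minimal host-side sketch of wiring the new GGML_OP_KDA_SCAN op
// into a graph, assuming this patch series is applied (ggml_kda_scan is declared in
// ggml.h above). Tensor shapes follow the op documentation; the sizes below and the
// function name are placeholders, not part of the patch.
#include "ggml.h"
#include <string.h>

static void kda_scan_example(void) {
    const int64_t D = 128, H = 4, T = 8, S = 1; // head_dim, n_head, n_seq_tokens, n_seqs

    struct ggml_init_params ip = { /*.mem_size =*/ 256u*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * h    = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, D, D, H, S); // recurrent state
    struct ggml_tensor * q    = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, D, H, T, S);
    struct ggml_tensor * k    = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, D, H, T, S);
    struct ggml_tensor * v    = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, D, H, T, S);
    struct ggml_tensor * g    = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, D, H, T, S); // log-decay per channel
    struct ggml_tensor * beta = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, H, T, S);
    struct ggml_tensor * ids  = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, S);

    memset(h->data, 0, ggml_nbytes(h));     // start from an empty recurrent state
    ((int32_t *) ids->data)[0] = 0;         // sequence 0 reads slot 0 of the state tensor
    // ... fill q, k, v, g, beta with real activations here ...

    // dst packs the per-token outputs first, followed by the updated states
    struct ggml_tensor * out = ggml_kda_scan(ctx, h, q, k, v, g, beta, ids);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, out);
    // ... evaluate the graph with the CPU backend, then read back out->data ...

    ggml_free(ctx);
}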
GGML_OP_WIN_PART: { ggml_compute_forward_win_part(params, tensor); @@ -2320,6 +2324,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { case GGML_OP_FLASH_ATTN_BACK: case GGML_OP_SSM_CONV: case GGML_OP_SSM_SCAN: + case GGML_OP_KDA_SCAN: case GGML_OP_RWKV_WKV6: case GGML_OP_GATED_LINEAR_ATTN: case GGML_OP_RWKV_WKV7: diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 608e82af69..9c93e0c101 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -8686,6 +8686,9 @@ static void ggml_compute_forward_ssm_conv_f32( const int ir1 = MIN(ir0 + dr, nr); const int ir = ir1 - ir0; + static int conv_debug_count = 0; + bool do_conv_debug = false; // (ith == 0 && conv_debug_count++ < 3); + for (int i3 = 0; i3 < n_s; ++i3) { for (int i2 = 0; i2 < n_t; ++i2) { // {d_conv - 1 + n_t, d_inner, n_seqs} @@ -8706,6 +8709,13 @@ static void ggml_compute_forward_ssm_conv_f32( sumf += s[i0 + i1*ncs] * c[i0 + i1*nc]; } x[i1] = sumf; + + // Debug output + if (do_conv_debug && i1 == 0 && i2 == 0 && i3 == 0) { + fprintf(stderr, "DEBUG SSM_CONV: nc=%d, nr=%d, n_t=%d, n_s=%d\n", nc, nr, n_t, n_s); + fprintf(stderr, "DEBUG SSM_CONV: s[0..3]=%f,%f,%f,%f, c[0..3]=%f,%f,%f,%f, x[0]=%f\n", + s[0], s[1], s[2], s[3], c[0], c[1], c[2], c[3], x[0]); + } } } } @@ -8956,6 +8966,192 @@ void ggml_compute_forward_ssm_scan( } } +// ggml_compute_forward_kda_scan +// KDA (Kimi Delta Attention) recurrence: +// h[t] = exp(g[t]) * h[t-1] + k[t]^T * (beta[t] * (v[t] - h[t-1] @ k[t])) +// o[t] = q[t]^T @ h[t] + +static void ggml_compute_forward_kda_scan_f32( + const ggml_compute_params * params, + ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; // h {head_dim, head_dim, n_head, n_seqs+} + const ggml_tensor * src1 = dst->src[1]; // q {head_dim, n_head, n_seq_tokens, n_seqs} + const ggml_tensor * src2 = dst->src[2]; // k {head_dim, n_head, n_seq_tokens, n_seqs} + const ggml_tensor * src3 = dst->src[3]; // v {head_dim, n_head, n_seq_tokens, n_seqs} + const ggml_tensor * src4 = dst->src[4]; // g {head_dim, n_head, n_seq_tokens, n_seqs} + const ggml_tensor * src5 = dst->src[5]; // beta {n_head, n_seq_tokens, n_seqs} + const ggml_tensor * src6 = dst->src[6]; // ids {n_seqs} + + const int ith = params->ith; + const int nth = params->nth; + + const int64_t head_dim = src0->ne[0]; + const int64_t n_head = src1->ne[1]; + const int64_t n_seq_tokens = src1->ne[2]; + const int64_t n_seqs = src1->ne[3]; + + // Output offset for hidden state + const int64_t y_off = ggml_nelements(src1) * sizeof(float); + + GGML_ASSERT(src0->nb[0] == sizeof(float)); + GGML_ASSERT(src1->nb[0] == sizeof(float)); + GGML_ASSERT(src2->nb[0] == sizeof(float)); + GGML_ASSERT(src3->nb[0] == sizeof(float)); + GGML_ASSERT(src4->nb[0] == sizeof(float)); + GGML_ASSERT(src5->nb[0] == sizeof(float)); + GGML_ASSERT(src6->nb[0] == sizeof(int32_t)); + + // Parallelize over heads + const int dh = (n_head + nth - 1) / nth; + const int ih0 = dh * ith; + const int ih1 = MIN(ih0 + dh, (int)n_head); + + const int32_t * ids = (const int32_t *) src6->data; + + // Temporary buffer for h @ k computation + float * hk_buf = (float *) malloc(head_dim * sizeof(float)); + + static int debug_count = 0; + bool do_debug = false; // (ith == 0 && debug_count++ < 20); + + for (int i3 = 0; i3 < n_seqs; ++i3) { + // Get initial hidden state for this sequence + const float * h0 = (const float *) ((const char *) src0->data + ids[i3] * src0->nb[3]); + // Output hidden state location + float * h_out = (float *) ((char *) dst->data + i3 * 
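// Layout sketch for dst as used by this kernel (assumption: src0 is a contiguous F32
// state tensor, so src0->nb[3] == head_dim*head_dim*n_head*sizeof(float)): the first
// ggml_nelements(src1) floats hold the per-token outputs, the updated states follow.
// Helper names are illustrative only and mirror the pointer arithmetic below.
static inline int64_t kda_y_index(int64_t j, int64_t head, int64_t token, int64_t seq,
                                  int64_t head_dim, int64_t n_head, int64_t n_seq_tokens) {
    return ((seq * n_seq_tokens + token) * n_head + head) * head_dim + j;
}
static inline int64_t kda_state_index(int64_t i, int64_t j, int64_t head, int64_t seq,
                                      int64_t head_dim, int64_t n_head,
                                      int64_t n_y_floats /* = head_dim*n_head*n_seq_tokens*n_seqs */) {
    return n_y_floats + ((seq * n_head + head) * head_dim + i) * head_dim + j;
}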
src0->nb[3] + y_off); + + for (int ih = ih0; ih < ih1; ++ih) { + // Per-head hidden state: [head_dim, head_dim] + // Copy initial state to output (will be updated in place) + const float * h_in = h0 + ih * head_dim * head_dim; + float * h = h_out + ih * head_dim * head_dim; + + // Copy initial state, but check for invalid values and clear if needed + bool need_clear = false; + for (int i = 0; i < head_dim * head_dim && !need_clear; ++i) { + if (!isfinite(h_in[i]) || fabsf(h_in[i]) > 1e6f) { + need_clear = true; + } + } + for (int i = 0; i < head_dim * head_dim; ++i) { + h[i] = need_clear ? 0.0f : h_in[i]; + } + + for (int it = 0; it < n_seq_tokens; ++it) { + const float * q_raw = (const float *) ((const char *) src1->data + + it * src1->nb[2] + i3 * src1->nb[3]) + ih * head_dim; + const float * k_raw = (const float *) ((const char *) src2->data + + it * src2->nb[2] + i3 * src2->nb[3]) + ih * head_dim; + const float * v = (const float *) ((const char *) src3->data + + it * src3->nb[2] + i3 * src3->nb[3]) + ih * head_dim; + const float * g = (const float *) ((const char *) src4->data + + it * src4->nb[2] + i3 * src4->nb[3]) + ih * head_dim; + const float beta = ((const float *) ((const char *) src5->data + + it * src5->nb[1] + i3 * src5->nb[2]))[ih]; + + float * y = (float *) dst->data + + it * n_head * head_dim + i3 * n_seq_tokens * n_head * head_dim + ih * head_dim; + + // L2 normalize q and k (critical for KDA stability) + float q_norm = 0.0f, k_norm = 0.0f; + for (int i = 0; i < head_dim; ++i) { + q_norm += q_raw[i] * q_raw[i]; + k_norm += k_raw[i] * k_raw[i]; + } + q_norm = sqrtf(q_norm + 1e-6f); + k_norm = sqrtf(k_norm + 1e-6f); + + // Debug output + if (do_debug && ih == 0 && it == 0 && i3 == 0) { + fprintf(stderr, "DEBUG KDA: q_raw[0]=%f, k_raw[0]=%f, v[0]=%f, g[0]=%f, beta=%f\n", + q_raw[0], k_raw[0], v[0], g[0], beta); + fprintf(stderr, "DEBUG KDA: q_norm=%f, k_norm=%f, exp(g[0])=%f, scale=%f\n", + q_norm, k_norm, expf(g[0]), 1.0f / sqrtf((float)head_dim)); + } + + // Normalized q and k with scale = 1/sqrt(head_dim) + // Note: scale is applied only to q after L2 normalization + const float scale = 1.0f / sqrtf((float)head_dim); + float q[128], k[128]; // assume head_dim <= 128 + for (int i = 0; i < head_dim; ++i) { + // L2 normalize then scale q + q[i] = (q_raw[i] / q_norm) * scale; + // L2 normalize k (no scale) + k[i] = k_raw[i] / k_norm; + } + + // KDA recurrence: h[t] = exp(g[t]) * h[t-1] + k[t]^T * (beta[t] * (v[t] - h[t-1] @ k[t])) + // Note: Apply decay first, then compute retrieval and update + + // Step 1: Apply decay to h first: h = h * exp(g) + for (int i = 0; i < head_dim; ++i) { + const float exp_gi = expf(g[i]); + for (int j = 0; j < head_dim; ++j) { + h[i * head_dim + j] *= exp_gi; + } + } + + // Step 2: Compute h^T @ k -> hk_buf [head_dim] + // hk_buf[j] = sum_i (h[i,j] * k[i]) which is column j of h dotted with k + for (int j = 0; j < head_dim; ++j) { + float sum = 0.0f; + for (int i = 0; i < head_dim; ++i) { + sum += h[i * head_dim + j] * k[i]; + } + hk_buf[j] = sum; + } + + // Step 3: Compute delta = beta * (v - hk) and update h + // h = h + outer(k, delta) where outer(k,delta)[i,j] = k[i] * delta[j] + for (int i = 0; i < head_dim; ++i) { + for (int j = 0; j < head_dim; ++j) { + const float delta_j = beta * (v[j] - hk_buf[j]); + h[i * head_dim + j] += k[i] * delta_j; + } + } + + // Step 4: Compute output y = h^T @ q -> [head_dim] + // vLLM: b_o = tl.sum(b_h * b_q[:, None], 0) means o[j] = sum_i(h[i,j] * q[i]) + for (int j = 0; j < head_dim; ++j) { + float sum = 
0.0f; + for (int i = 0; i < head_dim; ++i) { + sum += h[i * head_dim + j] * q[i]; + } + y[j] = sum; + } + + // Debug output + if (do_debug && ih == 0 && it == 0 && i3 == 0) { + // Find max abs value in h for stability check + float h_max = 0.0f; + for (int i = 0; i < head_dim * head_dim; i++) { + if (fabsf(h[i]) > h_max) h_max = fabsf(h[i]); + } + fprintf(stderr, "DEBUG KDA: y[0]=%.6f, h_max=%.6f, exp(g[0])=%.6f\n", + y[0], h_max, expf(g[0])); + } + } + } + } + + free(hk_buf); +} + +void ggml_compute_forward_kda_scan( + const ggml_compute_params * params, + ggml_tensor * dst) { + switch (dst->src[0]->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_kda_scan_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + // ggml_compute_forward_win_part static void ggml_compute_forward_win_part_f32( diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h index 0fdfee7976..080cf6e090 100644 --- a/ggml/src/ggml-cpu/ops.h +++ b/ggml/src/ggml-cpu/ops.h @@ -92,6 +92,7 @@ void ggml_compute_forward_flash_attn_back( struct ggml_tensor * dst); void ggml_compute_forward_ssm_conv(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_ssm_scan(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_kda_scan(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_win_part(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_win_unpart(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_unary(const struct ggml_compute_params * params, struct ggml_tensor * dst); From bf42bc06069f67a520bb9c4d5793e93000df6df9 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Tue, 2 Dec 2025 11:24:37 +0800 Subject: [PATCH 06/58] Kimi Linear ggml-cuda --- ggml/src/ggml-cuda/ggml-cuda.cu | 9 ++ ggml/src/ggml-cuda/kda-scan.cu | 209 ++++++++++++++++++++++++++++++++ ggml/src/ggml-cuda/kda-scan.cuh | 3 + 3 files changed, 221 insertions(+) create mode 100644 ggml/src/ggml-cuda/kda-scan.cu create mode 100644 ggml/src/ggml-cuda/kda-scan.cuh diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index eb2e273110..8e0d7d916e 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -41,6 +41,7 @@ #include "ggml-cuda/softmax.cuh" #include "ggml-cuda/ssm-conv.cuh" #include "ggml-cuda/ssm-scan.cuh" +#include "ggml-cuda/kda-scan.cuh" #include "ggml-cuda/sum.cuh" #include "ggml-cuda/sumrows.cuh" #include "ggml-cuda/mean.cuh" @@ -2692,6 +2693,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_SSM_SCAN: ggml_cuda_op_ssm_scan(ctx, dst); break; + case GGML_OP_KDA_SCAN: + ggml_cuda_op_kda_scan(ctx, dst); + break; case GGML_OP_ARGSORT: ggml_cuda_op_argsort(ctx, dst); break; @@ -4503,6 +4507,11 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g return op->src[0]->ne[0] == 16 && op->src[0]->ne[1] == 1 && op->src[0]->ne[2] % 128 == 0 && op->src[4]->ne[1] == 1; } } + case GGML_OP_KDA_SCAN: { + // KDA scan kernel supports head_dim 64 or 128 + const int64_t head_dim = op->src[0]->ne[0]; + return head_dim == 64 || head_dim == 128; + } case GGML_OP_SSM_CONV: { // assumes d_inner % threads == 0 return op->src[0]->ne[1] % 128 == 0; diff --git a/ggml/src/ggml-cuda/kda-scan.cu b/ggml/src/ggml-cuda/kda-scan.cu new file mode 100644 index 0000000000..5763f1cc90 --- /dev/null +++ 
b/ggml/src/ggml-cuda/kda-scan.cu @@ -0,0 +1,209 @@ +#include "kda-scan.cuh" + +// KDA (Kimi Delta Attention) scan CUDA kernel +// Recurrence: +// h[t] = exp(g[t]) * h[t-1] + k[t]^T * (beta[t] * (v[t] - h[t-1] @ k[t])) +// o[t] = q[t]^T @ h[t] +// +// This kernel uses global memory for the hidden state to avoid shared memory limits. +// Each block processes one head for one sequence. + +__global__ void kda_scan_f32_kernel( + const float * __restrict__ src0, // h: [head_dim, head_dim, n_head, n_seqs+] + const float * __restrict__ src1, // q: [head_dim, n_head, n_seq_tokens, n_seqs] + const float * __restrict__ src2, // k: [head_dim, n_head, n_seq_tokens, n_seqs] + const float * __restrict__ src3, // v: [head_dim, n_head, n_seq_tokens, n_seqs] + const float * __restrict__ src4, // g: [head_dim, n_head, n_seq_tokens, n_seqs] + const float * __restrict__ src5, // beta: [n_head, n_seq_tokens, n_seqs] + const int32_t * __restrict__ src6, // ids: [n_seqs] + float * __restrict__ dst, + const int64_t head_dim, + const int64_t n_head, + const int64_t n_seq_tokens, + const int64_t n_seqs, + const int64_t y_off) // offset to state output in dst (in floats) +{ + // Each block handles one head for one sequence + const int seq_idx = blockIdx.x / n_head; + const int head_idx = blockIdx.x % n_head; + const int tid = threadIdx.x; + const int n_threads = blockDim.x; + + if (seq_idx >= n_seqs || head_idx >= n_head) return; + + // Get sequence ID for initial state + const int src_seq = src6[seq_idx]; + + // Shared memory for temporary buffers + extern __shared__ float smem[]; + float * hk_buf = smem; // [head_dim] - h @ k buffer + float * q_norm = smem + head_dim; // [head_dim] - normalized q + float * k_norm = q_norm + head_dim; // [head_dim] - normalized k + float * warp_sums = k_norm + head_dim; // [64] - for reductions + + // Pointers to input/output data for this head + const int64_t h_stride_head = head_dim * head_dim; + const int64_t h_stride_seq = h_stride_head * n_head; + const int64_t qkv_stride_head = head_dim; + const int64_t qkv_stride_token = head_dim * n_head; + const int64_t qkv_stride_seq = qkv_stride_token * n_seq_tokens; + const int64_t beta_stride_token = n_head; + const int64_t beta_stride_seq = beta_stride_token * n_seq_tokens; + + const float * h_in = src0 + src_seq * h_stride_seq + head_idx * h_stride_head; + float * h_out = dst + y_off + seq_idx * h_stride_seq + head_idx * h_stride_head; + float * y_out = dst + seq_idx * qkv_stride_seq + head_idx * qkv_stride_head; + + // Copy initial state to output (we'll update in place) + for (int i = tid; i < head_dim * head_dim; i += n_threads) { + float val = h_in[i]; + if (!isfinite(val) || fabsf(val) > 1e6f) { + val = 0.0f; + } + h_out[i] = val; + } + __syncthreads(); + + const float scale = 1.0f / sqrtf((float)head_dim); + + // Process each token sequentially + for (int t = 0; t < n_seq_tokens; ++t) { + const float * q_raw = src1 + t * qkv_stride_token + seq_idx * qkv_stride_seq + head_idx * qkv_stride_head; + const float * k_raw = src2 + t * qkv_stride_token + seq_idx * qkv_stride_seq + head_idx * qkv_stride_head; + const float * v = src3 + t * qkv_stride_token + seq_idx * qkv_stride_seq + head_idx * qkv_stride_head; + const float * g = src4 + t * qkv_stride_token + seq_idx * qkv_stride_seq + head_idx * qkv_stride_head; + const float beta = src5[t * beta_stride_token + seq_idx * beta_stride_seq + head_idx]; + float * y = y_out + t * qkv_stride_token; + + // Step 1: L2 normalize q and k + float q_sq_sum = 0.0f, k_sq_sum = 0.0f; + for (int i = 
tid; i < head_dim; i += n_threads) { + q_sq_sum += q_raw[i] * q_raw[i]; + k_sq_sum += k_raw[i] * k_raw[i]; + } + + // Warp reduction + for (int offset = warpSize/2; offset > 0; offset /= 2) { + q_sq_sum += __shfl_down_sync(0xffffffff, q_sq_sum, offset); + k_sq_sum += __shfl_down_sync(0xffffffff, k_sq_sum, offset); + } + + // Cross-warp reduction + int warp_id = tid / warpSize; + int lane_id = tid % warpSize; + if (lane_id == 0 && warp_id < 32) { + warp_sums[warp_id] = q_sq_sum; + warp_sums[32 + warp_id] = k_sq_sum; + } + __syncthreads(); + + if (tid == 0) { + float total_q = 0.0f, total_k = 0.0f; + for (int i = 0; i < (n_threads + warpSize - 1) / warpSize; ++i) { + total_q += warp_sums[i]; + total_k += warp_sums[32 + i]; + } + warp_sums[0] = rsqrtf(total_q + 1e-6f) * scale; + warp_sums[1] = rsqrtf(total_k + 1e-6f); + } + __syncthreads(); + + float q_norm_factor = warp_sums[0]; + float k_norm_factor = warp_sums[1]; + + // Store normalized q and k + for (int i = tid; i < head_dim; i += n_threads) { + q_norm[i] = q_raw[i] * q_norm_factor; + k_norm[i] = k_raw[i] * k_norm_factor; + } + __syncthreads(); + + // KDA recurrence: h[t] = exp(g[t]) * h[t-1] + k[t]^T * (beta[t] * (v[t] - h[t-1] @ k[t])) + // Apply decay first, then compute retrieval and update + + // Step 2: Apply decay to h: h = h * exp(g) + for (int idx = tid; idx < head_dim * head_dim; idx += n_threads) { + int i = idx / head_dim; + float exp_gi = expf(g[i]); + h_out[idx] *= exp_gi; + } + __syncthreads(); + + // Step 3: Compute h^T @ k -> hk_buf + for (int j = tid; j < head_dim; j += n_threads) { + float sum = 0.0f; + for (int i = 0; i < head_dim; ++i) { + sum += h_out[i * head_dim + j] * k_norm[i]; + } + hk_buf[j] = sum; + } + __syncthreads(); + + // Step 4: Update h: h = h + outer(k, beta * (v - hk)) + for (int idx = tid; idx < head_dim * head_dim; idx += n_threads) { + int i = idx / head_dim; + int j = idx % head_dim; + float delta_j = beta * (v[j] - hk_buf[j]); + h_out[idx] += k_norm[i] * delta_j; + } + __syncthreads(); + + // Step 5: Compute output y = h^T @ q + for (int j = tid; j < head_dim; j += n_threads) { + float sum = 0.0f; + for (int i = 0; i < head_dim; ++i) { + sum += h_out[i * head_dim + j] * q_norm[i]; + } + y[j] = sum; + } + __syncthreads(); + } +} + +void ggml_cuda_op_kda_scan(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; // h + const ggml_tensor * src1 = dst->src[1]; // q + const ggml_tensor * src2 = dst->src[2]; // k + const ggml_tensor * src3 = dst->src[3]; // v + const ggml_tensor * src4 = dst->src[4]; // g + const ggml_tensor * src5 = dst->src[5]; // beta + const ggml_tensor * src6 = dst->src[6]; // ids + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT(src2->type == GGML_TYPE_F32); + GGML_ASSERT(src3->type == GGML_TYPE_F32); + GGML_ASSERT(src4->type == GGML_TYPE_F32); + GGML_ASSERT(src5->type == GGML_TYPE_F32); + GGML_ASSERT(src6->type == GGML_TYPE_I32); + + const int64_t head_dim = src0->ne[0]; + const int64_t n_head = src1->ne[1]; + const int64_t n_seq_tokens = src1->ne[2]; + const int64_t n_seqs = src1->ne[3]; + + // Output offset for hidden state (after attention output) - in floats + const int64_t y_off = ggml_nelements(src1); + + const float * h_d = (const float *)src0->data; + const float * q_d = (const float *)src1->data; + const float * k_d = (const float *)src2->data; + const float * v_d = (const float *)src3->data; + const float * g_d = (const float *)src4->data; + const float * beta_d = (const 
float *)src5->data; + const int32_t * ids_d = (const int32_t *)src6->data; + float * dst_d = (float *)dst->data; + + cudaStream_t stream = ctx.stream(); + + // Launch kernel: one block per (sequence, head) pair + const int n_blocks = n_seqs * n_head; + const int n_threads = 128; + + // Shared memory: hk_buf[head_dim] + q_norm[head_dim] + k_norm[head_dim] + warp_sums[64] + size_t smem_size = (3 * head_dim + 64) * sizeof(float); + + kda_scan_f32_kernel<<<n_blocks, n_threads, smem_size, stream>>>( + h_d, q_d, k_d, v_d, g_d, beta_d, ids_d, dst_d, + head_dim, n_head, n_seq_tokens, n_seqs, y_off); +} diff --git a/ggml/src/ggml-cuda/kda-scan.cuh b/ggml/src/ggml-cuda/kda-scan.cuh new file mode 100644 index 0000000000..55783fb82b --- /dev/null +++ b/ggml/src/ggml-cuda/kda-scan.cuh @@ -0,0 +1,3 @@ +#include "common.cuh" + +void ggml_cuda_op_kda_scan(ggml_backend_cuda_context & ctx, ggml_tensor * dst); From d73d3e51a51d3a58e62471c058f58a1dd4dbae2b Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Tue, 2 Dec 2025 11:27:57 +0800 Subject: [PATCH 07/58] Kimi Linear ggml.c --- ggml/src/ggml.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 66 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 17cf4d84bb..8bf562e8b1 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -999,6 +999,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "FLASH_ATTN_BACK", "SSM_CONV", "SSM_SCAN", + "KDA_SCAN", "WIN_PART", "WIN_UNPART", "GET_REL_POS", @@ -1024,7 +1025,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "GLU", }; -static_assert(GGML_OP_COUNT == 95, "GGML_OP_COUNT != 95"); +static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -5434,6 +5435,70 @@ struct ggml_tensor * ggml_ssm_scan( return result; } +// ggml_kda_scan + +struct ggml_tensor * ggml_kda_scan( + struct ggml_context * ctx, + struct ggml_tensor * h, + struct ggml_tensor * q, + struct ggml_tensor * k, + struct ggml_tensor * v, + struct ggml_tensor * g, + struct ggml_tensor * beta, + struct ggml_tensor * ids) { + GGML_ASSERT(ggml_is_contiguous(h)); + GGML_ASSERT(ggml_is_contiguous(q)); + GGML_ASSERT(ggml_is_contiguous(k)); + GGML_ASSERT(ggml_is_contiguous(v)); + GGML_ASSERT(ggml_is_contiguous(g)); + GGML_ASSERT(ggml_is_contiguous(beta)); + GGML_ASSERT(ids->type == GGML_TYPE_I32); + + { + const int64_t head_dim = h->ne[0]; + const int64_t n_head = q->ne[1]; + const int64_t n_seq_tokens = q->ne[2]; + const int64_t n_seqs = q->ne[3]; + + GGML_ASSERT(h->ne[0] == head_dim); + GGML_ASSERT(h->ne[1] == head_dim); + GGML_ASSERT(h->ne[2] == n_head); + GGML_ASSERT(q->ne[0] == head_dim); + GGML_ASSERT(k->ne[0] == head_dim); + GGML_ASSERT(v->ne[0] == head_dim); + GGML_ASSERT(g->ne[0] == head_dim); + GGML_ASSERT(ggml_are_same_shape(q, k)); + GGML_ASSERT(ggml_are_same_shape(q, v)); + GGML_ASSERT(ggml_are_same_shape(q, g)); + GGML_ASSERT(beta->ne[0] == n_head); + GGML_ASSERT(beta->ne[1] == n_seq_tokens); + GGML_ASSERT(beta->ne[2] == n_seqs); + GGML_ASSERT(ids->ne[0] == n_seqs); + GGML_ASSERT(ggml_is_vector(ids)); + } + + // Output: y (attention output) + updated hidden states + // y: {head_dim, n_head, n_seq_tokens, n_seqs} + // h_new: {head_dim, head_dim, n_head, n_seqs} + const int64_t head_dim = h->ne[0]; + const int64_t n_head = q->ne[1]; + const int64_t n_seq_tokens = q->ne[2]; + const int64_t n_seqs = q->ne[3]; + struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, + ggml_nelements(q) + head_dim * head_dim * n_head * n_seqs); + + result->op = GGML_OP_KDA_SCAN;
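+ // the flat F32 result packs the attention output first, followed by the updated recurrent state:
+ //   y     : first ggml_nelements(q) floats, viewed as {head_dim, n_head, n_seq_tokens, n_seqs}
+ //   h_new : next head_dim*head_dim*n_head*n_seqs floats, starting at the float offset that the
+ //           CPU and CUDA backends call y_off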
+ result->src[0] = h; + result->src[1] = q; + result->src[2] = k; + result->src[3] = v; + result->src[4] = g; + result->src[5] = beta; + result->src[6] = ids; + + return result; +} + // ggml_win_part struct ggml_tensor * ggml_win_part( From e308026f647342f0553cf734d028b8be61a8ed48 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Tue, 2 Dec 2025 12:02:35 +0800 Subject: [PATCH 08/58] kimi linear src/llama --- src/llama-arch.cpp | 64 ++++++++++++++ src/llama-arch.h | 12 +++ src/llama-context.cpp | 2 +- src/llama-graph.cpp | 13 +-- src/llama-hparams.cpp | 14 +++ src/llama-hparams.h | 4 + src/llama-model.cpp | 195 ++++++++++++++++++++++++++++++++++++++++++ src/llama-model.h | 18 ++++ src/llama-quant.cpp | 4 +- src/llama-vocab.cpp | 41 +++++---- 10 files changed, 343 insertions(+), 24 deletions(-) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index e12c8b9250..ab09bb7eb7 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -112,6 +112,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_RND1, "rnd1" }, { LLM_ARCH_PANGU_EMBED, "pangu-embedded" }, { LLM_ARCH_MISTRAL3, "mistral3" }, + { LLM_ARCH_KIMI_LINEAR, "kimi-linear" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -2540,6 +2541,54 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, }, }, + { + LLM_ARCH_KIMI_LINEAR, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + // Dense FFN (layer 0 only) + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + // MoE FFN (layers 1+) + { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, + { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, + { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, + { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, + { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" }, + // Shared experts + { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" }, + { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" }, + { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, + // KDA (using SSM_ enum prefix, keeping GGUF names for backward compat) + { LLM_TENSOR_SSM_CONV1D_Q, "blk.%d.ssm_conv1d_q" }, + { LLM_TENSOR_SSM_CONV1D_K, "blk.%d.ssm_conv1d_k" }, + { LLM_TENSOR_SSM_CONV1D_V, "blk.%d.ssm_conv1d_v" }, + { LLM_TENSOR_SSM_F_A, "blk.%d.ssm_f_a" }, + { LLM_TENSOR_SSM_F_B, "blk.%d.ssm_f_b" }, + { LLM_TENSOR_SSM_BETA, "blk.%d.ssm_beta" }, + { LLM_TENSOR_SSM_A_LOG, "blk.%d.ssm_a" }, + { LLM_TENSOR_SSM_DT_B, "blk.%d.ssm_dt" }, + { LLM_TENSOR_SSM_G_A, "blk.%d.ssm_g_a" }, + { LLM_TENSOR_SSM_G_B, "blk.%d.ssm_g_b" }, + { LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" }, + // MLA + { LLM_TENSOR_ATTN_Q_A, "blk.%d.attn_q_a" }, + { LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" }, + { LLM_TENSOR_ATTN_Q_A_NORM, "blk.%d.attn_q_a_norm" }, + { LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" }, + { LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" }, + { LLM_TENSOR_ATTN_KV_A_NORM, "blk.%d.attn_kv_a_norm" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -2644,6 +2693,17 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_SSM_C_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_SSM_D, {LLM_TENSOR_LAYER_REPEATING, 
GGML_OP_MUL}}, {LLM_TENSOR_SSM_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + // Kimi KDA - Conv tensors are 4D [d_conv, 1, d_inner, 1], reshaped to 2D at runtime + {LLM_TENSOR_SSM_CONV1D_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_SSM_CONV1D_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_SSM_CONV1D_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_SSM_F_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_SSM_F_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_SSM_BETA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_SSM_A_LOG, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_SSM_DT_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, + {LLM_TENSOR_SSM_G_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_SSM_G_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_TIME_MIX_LERP_X, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_TIME_MIX_LN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_CHANNEL_MIX_LERP_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, @@ -2801,6 +2861,7 @@ bool llm_arch_is_recurrent(const llm_arch & arch) { case LLM_ARCH_RWKV6QWEN2: case LLM_ARCH_RWKV7: case LLM_ARCH_ARWKV7: + case LLM_ARCH_KIMI_LINEAR: // KDA layers use delta attention with recurrent state return true; default: return false; @@ -2817,6 +2878,9 @@ bool llm_arch_is_hybrid(const llm_arch & arch) { case LLM_ARCH_LFM2MOE: case LLM_ARCH_NEMOTRON_H: case LLM_ARCH_QWEN3NEXT: + // Kimi: Currently using recurrent-only mode since MLA doesn't use KV cache + // TODO: Enable hybrid when MLA KV caching is implemented + // case LLM_ARCH_KIMI_LINEAR: return true; default: return false; diff --git a/src/llama-arch.h b/src/llama-arch.h index 438963cef0..2b965850c5 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -116,6 +116,7 @@ enum llm_arch { LLM_ARCH_RND1, LLM_ARCH_PANGU_EMBED, LLM_ARCH_MISTRAL3, + LLM_ARCH_KIMI_LINEAR, LLM_ARCH_UNKNOWN, }; @@ -385,6 +386,17 @@ enum llm_tensor { LLM_TENSOR_SSM_NORM, LLM_TENSOR_SSM_OUT, LLM_TENSOR_SSM_BETA_ALPHA, // qwen3next + // Kimi Linear KDA (using SSM_ prefix for consistency) + LLM_TENSOR_SSM_CONV1D_Q, // kimi: Q conv1d weight + LLM_TENSOR_SSM_CONV1D_K, // kimi: K conv1d weight + LLM_TENSOR_SSM_CONV1D_V, // kimi: V conv1d weight + LLM_TENSOR_SSM_F_A, // kimi: forget gate projection A + LLM_TENSOR_SSM_F_B, // kimi: forget gate projection B + LLM_TENSOR_SSM_BETA, // kimi: beta mixing coefficient + LLM_TENSOR_SSM_A_LOG, // kimi: A_log (pre-converted in GGUF) + LLM_TENSOR_SSM_DT_B, // kimi: dt bias + LLM_TENSOR_SSM_G_A, // kimi: output gate projection A + LLM_TENSOR_SSM_G_B, // kimi: output gate projection B LLM_TENSOR_TIME_MIX_W0, LLM_TENSOR_TIME_MIX_W1, LLM_TENSOR_TIME_MIX_W2, diff --git a/src/llama-context.cpp b/src/llama-context.cpp index e04f0fc4f9..3278cf2ef8 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1387,7 +1387,7 @@ void llama_context::output_reorder() { // uint32_t llama_context::graph_max_nodes() const { - if (model.arch == LLM_ARCH_QWEN3NEXT) { + if (model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_KIMI_LINEAR) { return std::max(8192u, 32u*model.n_tensors()); } return std::max(1024u, 8u*model.n_tensors()); diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 42ccb5b76a..e41d65398f 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1816,11 +1816,14 @@ ggml_tensor * llm_graph_context::build_rs( ggml_build_forward_expand(gf, output_states); // copy extra states which won't be changed further 
(between n_seqs and n_rs) - ggml_tensor * states_extra = ggml_get_rows(ctx0, states, state_copy_extra); - ggml_build_forward_expand(gf, - ggml_cpy(ctx0, - states_extra, - ggml_view_1d(ctx0, s, state_size*(n_rs - n_seqs), (rs_head + n_seqs)*state_size*ggml_element_size(s)))); + // Skip if there are no extra states to copy (n_rs == n_seqs) + if (arch != LLM_ARCH_KIMI_LINEAR || n_rs > n_seqs) { // arch check for backward compat + ggml_tensor * states_extra = ggml_get_rows(ctx0, states, state_copy_extra); + ggml_build_forward_expand(gf, + ggml_cpy(ctx0, + states_extra, + ggml_view_1d(ctx0, s, state_size*(n_rs - n_seqs), (rs_head + n_seqs)*state_size*ggml_element_size(s)))); + } return output_states; } diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index 8cdbaf69fc..88d266b8da 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -133,6 +133,13 @@ uint32_t llama_hparams::n_embd_r() const { return n_embd * (n_shortconv_l_cache - 1); } + if (kda_head_dim != 0) { + // for Kimi KDA layers + // Conv state for Q, K, V: 3 * (d_conv - 1) * n_head * head_dim + const uint32_t d_inner = n_head() * kda_head_dim; // 32 * 128 = 4096 + return 3 * (kda_d_conv > 0 ? kda_d_conv - 1 : 3) * d_inner; + } + // TODO: maybe support other convolution strides than 1 // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed // Corresponds to Mamba's conv_states size @@ -145,6 +152,13 @@ uint32_t llama_hparams::n_embd_s() const { return n_embd * wkv_head_size; } + if (kda_head_dim != 0) { + // for Kimi KDA layers + // Full recurrent state: head_dim * head_dim * n_head + // h tensor shape for delta attention: [head_dim, head_dim, n_head] + return kda_head_dim * kda_head_dim * n_head(); // 128 * 128 * 32 = 524288 + } + // corresponds to Mamba's ssm_states size return ssm_d_state * ssm_d_inner; } diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 6eff334a5f..80170650eb 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -133,6 +133,10 @@ struct llama_hparams { uint32_t ssm_dt_rank = 0; uint32_t ssm_n_group = 0; + // for Kimi Delta Attention (KDA) + uint32_t kda_head_dim = 0; // head_dim for KDA layers (128 for Kimi) + uint32_t kda_d_conv = 0; // conv kernel size for KDA (4 for Kimi) + // for hybrid state space models std::array recurrent_layer_arr; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 584efbf3c8..763f0dfecb 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2283,6 +2283,54 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; + case LLM_ARCH_KIMI_LINEAR: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla, false); + ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false); + ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv, false); + ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false); + + // KDA (Delta Attention) parameters + hparams.kda_head_dim = 128; // linear_attn_config.head_dim + hparams.kda_d_conv = 4; // linear_attn_config.short_conv_kernel_size + + // MLA qk_rope_head_dim (for reference) + // qk_rope_head_dim = 64, qk_nope_head_dim = 128, qk_head_dim = 192 + + // Mark KDA layers as recurrent using n_head_kv pattern (like Jamba) + // MLA layers are at: 3, 7, 11, 15, 19, 23, 26 (7 MLA layers total) + // KDA layers are all others: 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20, 21, 22, 24, 25 (20 
KDA layers) + // Set n_head_kv = 0 for KDA layers (recurrent), n_head_kv = n_head for MLA layers (attention) + for (uint32_t i = 0; i < hparams.n_layer; ++i) { + bool is_mla = (i == 3 || i == 7 || i == 11 || i == 15 || i == 19 || i == 23 || i == 26); + hparams.n_head_kv_arr[i] = is_mla ? hparams.n_head() : 0; + hparams.recurrent_layer_arr[i] = !is_mla; // KDA layers are recurrent + } + + // MoE parameters - Kimi uses moe_intermediate_size = 1024 + ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); + ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false); + ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared, false); + ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false); + ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false); + ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false); + + // Default values if not in GGUF + if (hparams.n_ff_exp == 0) hparams.n_ff_exp = 1024; // moe_intermediate_size + if (hparams.n_ff_shexp == 0) hparams.n_ff_shexp = 9216; // shared_expert_intermediate_size = intermediate_size + if (hparams.n_expert_shared == 0) hparams.n_expert_shared = 1; // num_shared_experts + if (hparams.n_layer_dense_lead == 0) hparams.n_layer_dense_lead = 1; // first_k_dense_replace + if (hparams.expert_weights_scale == 0.0f) hparams.expert_weights_scale = 2.446f; // routed_scaling_factor + + // MoE gating function - Kimi uses sigmoid (moe_router_activation_func: sigmoid) + if (hparams.expert_gating_func == 0) hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID; + + switch (hparams.n_layer) { + case 27: type = LLM_TYPE_48B; break; // Kimi-Linear-48B-A3B + default: type = LLM_TYPE_UNKNOWN; + } + } break; default: throw std::runtime_error("unsupported model architecture"); } @@ -6395,6 +6443,148 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0); } } break; + case LLM_ARCH_KIMI_LINEAR: + { + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + + // output + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0); + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + + // Check for KDA specific tensors to determine layer type or if it's a mixed model + // Assuming KDA layer if KDA tensors are present + + // KDA uses head_dim = 128 (from linear_attn_config.head_dim) + const int64_t n_embd_head_k_kda = 128; + const int64_t n_embd_head_v_kda = 128; + const int64_t ssm_d_conv = hparams.ssm_d_conv > 0 ? 
hparams.ssm_d_conv : 4; + + // Try loading KDA specific tensors (using SSM_ prefix) + // Conv1d weights: try 4D first, then 3D (quantization may remove trailing 1) + // 4D: [d_conv, 1, d_inner, 1], 3D: [d_conv, 1, d_inner] + layer.ssm_q_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_Q, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head, 1}, TENSOR_NOT_REQUIRED); + if (!layer.ssm_q_conv) { + layer.ssm_q_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_Q, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head}, TENSOR_NOT_REQUIRED); + } + + if (layer.ssm_q_conv) { + // KDA Layer - Conv1d weights may be 3D or 4D + layer.ssm_k_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_K, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head, 1}, TENSOR_NOT_REQUIRED); + if (!layer.ssm_k_conv) { + layer.ssm_k_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_K, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head}, 0); + } + layer.ssm_v_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_V, "weight", i), {ssm_d_conv, 1, n_embd_head_v_kda * n_head, 1}, TENSOR_NOT_REQUIRED); + if (!layer.ssm_v_conv) { + layer.ssm_v_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_V, "weight", i), {ssm_d_conv, 1, n_embd_head_v_kda * n_head}, 0); + } + + // Conv bias may not exist in all models - make optional + layer.ssm_q_conv_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_Q, "bias", i), {n_embd_head_k_kda * n_head}, TENSOR_NOT_REQUIRED); + layer.ssm_k_conv_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_K, "bias", i), {n_embd_head_k_kda * n_head}, TENSOR_NOT_REQUIRED); + layer.ssm_v_conv_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_V, "bias", i), {n_embd_head_v_kda * n_head}, TENSOR_NOT_REQUIRED); + + // q, k, v projections + // Python: q_proj, k_proj, v_proj + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k_kda * n_head}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_head_k_kda * n_head}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_head_v_kda * n_head}, 0); + + // KDA specific projections + // f_a_proj, f_b_proj + layer.ssm_f_a = create_tensor(tn(LLM_TENSOR_SSM_F_A, "weight", i), {n_embd, n_embd_head_k_kda}, 0); // head_dim + layer.ssm_f_b = create_tensor(tn(LLM_TENSOR_SSM_F_B, "weight", i), {n_embd_head_k_kda, n_embd_head_k_kda * n_head}, 0); // projection_size + + // b_proj (beta mixing coefficient) + layer.ssm_beta = create_tensor(tn(LLM_TENSOR_SSM_BETA, "weight", i), {n_embd, n_head}, 0); + + // A_log - Shape in GGUF: [1, num_heads, 1, 1] (4D) or [1, num_heads] (2D after quantization) + layer.ssm_a_log = create_tensor(tn(LLM_TENSOR_SSM_A_LOG, i), {1, n_head, 1, 1}, TENSOR_NOT_REQUIRED); + if (!layer.ssm_a_log) { + layer.ssm_a_log = create_tensor(tn(LLM_TENSOR_SSM_A_LOG, i), {1, n_head}, 0); + } + + // dt_bias - shape [n_embd_head_k_kda * n_head] = [4096] + layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT_B, i), {n_embd_head_k_kda * n_head}, 0); + + // g_a_proj, g_b_proj (output gate) + layer.ssm_g_a = create_tensor(tn(LLM_TENSOR_SSM_G_A, "weight", i), {n_embd, n_embd_head_k_kda}, 0); + layer.ssm_g_b = create_tensor(tn(LLM_TENSOR_SSM_G_B, "weight", i), {n_embd_head_k_kda, n_embd_head_k_kda * n_head}, 0); + + // o_norm (reusing SSM_NORM) + layer.ssm_o_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {n_embd_head_k_kda}, 0); // FusedRMSNormGated + layer.ssm_o_norm_b = create_tensor(tn(LLM_TENSOR_SSM_NORM, "bias", i), {n_embd_head_k_kda}, TENSOR_NOT_REQUIRED); + + // o_proj + layer.wo = 
create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_v_kda * n_head, n_embd}, 0); + + } else { + // MLA Layer - use MLA-specific head dimensions + const int64_t q_lora_rank = hparams.n_lora_q; + const int64_t kv_lora_rank = hparams.n_lora_kv; + const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla > 0 ? hparams.n_embd_head_k_mla : 192; + const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla > 0 ? hparams.n_embd_head_v_mla : 128; + + layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, TENSOR_NOT_REQUIRED); + layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0); + + if (layer.attn_q_a_norm) { + layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0); + layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, 0); + } else { + // Kimi MLA without Q compression: wq = [n_embd, n_head * n_embd_head_k_mla] + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_embd_head_k_mla}, 0); + } + + // Kimi: qk_rope_head_dim = 64 (actual RoPE dimension for MLA) + // Note: hparams.n_rot may be 72 (from conversion) but actual is 64 + const int64_t qk_rope_head_dim = 64; // From config: qk_rope_head_dim + layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + qk_rope_head_dim}, 0); + layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_k_mla - qk_rope_head_dim + n_embd_head_v_mla)}, 0); + + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, 0); + } + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + + // MoE intermediate size (different from dense FFN) + const int64_t n_ff_exp = hparams.n_ff_exp > 0 ? hparams.n_ff_exp : 1024; + + // Kimi uses n_layer_dense_lead to determine which layers use dense FFN vs MoE + // first_k_dense_replace = 1 means layer 0 uses dense FFN, layers 1+ use MoE + if (i < (int) hparams.n_layer_dense_lead) { + // Dense FFN layer - use normal n_ff + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + } else { + // MoE layer - use n_ff_exp (1024) instead of n_ff (9216) + layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0); + layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0); + layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0); + layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0); + + // Shared experts use moe_intermediate_size * num_shared_experts + // Kimi: shared_expert_intermediate_size = 1024 * 1 = 1024 + // Tensors are 2D: [n_embd, n_ff_shexp] or [n_ff_shexp, n_embd] + const int64_t n_ff_shexp_actual = n_ff_exp * (hparams.n_expert_shared > 0 ? 
hparams.n_expert_shared : 1); + layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp_actual}, TENSOR_NOT_REQUIRED); + layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp_actual, n_embd}, TENSOR_NOT_REQUIRED); + layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp_actual}, TENSOR_NOT_REQUIRED); + + // exp_probs_b (e_score_correction_bias in vLLM) + // Try "bias" first (standard), then "weight" (for compatibility) + layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED); + if (!layer.ffn_exp_probs_b) { + layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "weight", i), {n_expert}, TENSOR_NOT_REQUIRED); + } + } + } + } break; case LLM_ARCH_COGVLM: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -7563,6 +7753,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { { llm = std::make_unique(*this, params); } break; + case LLM_ARCH_KIMI_LINEAR: + { + llm = std::make_unique(*this, params); + } break; default: GGML_ABORT("fatal error"); } @@ -7718,6 +7912,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_ARCTIC: case LLM_ARCH_DEEPSEEK: case LLM_ARCH_DEEPSEEK2: + case LLM_ARCH_KIMI_LINEAR: case LLM_ARCH_PLM: case LLM_ARCH_CHATGLM: case LLM_ARCH_GLM4: diff --git a/src/llama-model.h b/src/llama-model.h index f8342cf2cb..b067b686d2 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -84,6 +84,7 @@ enum llm_type { LLM_TYPE_35B, LLM_TYPE_36B, LLM_TYPE_40B, + LLM_TYPE_48B, LLM_TYPE_65B, LLM_TYPE_70B, LLM_TYPE_120B, @@ -404,6 +405,23 @@ struct llama_layer { struct ggml_tensor * ffn_act_beta = nullptr; struct ggml_tensor * ffn_act_eps = nullptr; + // Kimi Linear KDA (using ssm_ prefix for consistency) + // Note: ssm_dt_b already exists above (mamba bias), reused for Kimi dt_bias + struct ggml_tensor * ssm_q_conv = nullptr; + struct ggml_tensor * ssm_q_conv_b = nullptr; + struct ggml_tensor * ssm_k_conv = nullptr; + struct ggml_tensor * ssm_k_conv_b = nullptr; + struct ggml_tensor * ssm_v_conv = nullptr; + struct ggml_tensor * ssm_v_conv_b = nullptr; + struct ggml_tensor * ssm_f_a = nullptr; + struct ggml_tensor * ssm_f_b = nullptr; + struct ggml_tensor * ssm_beta = nullptr; + struct ggml_tensor * ssm_a_log = nullptr; + struct ggml_tensor * ssm_g_a = nullptr; + struct ggml_tensor * ssm_g_b = nullptr; + struct ggml_tensor * ssm_o_norm = nullptr; + struct ggml_tensor * ssm_o_norm_b = nullptr; + struct llama_layer_posnet posnet; struct llama_layer_convnext convnext; diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 0b23eaef3a..7b8bf6e524 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -724,7 +724,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer; // sanity checks for models that have attention layers - if (qs.n_attention_wv != 0 && !is_clip_model) + // Skip this check for Kimi models which have hybrid KDA+MLA architecture + // (only MLA layers have attn_kv_b weights, KDA layers don't) + if (qs.n_attention_wv != 0 && !is_clip_model && model.arch != LLM_ARCH_KIMI_LINEAR) { const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin(); // attention layers have a non-zero number of kv heads diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 
a73c4c448b..7af74b0218 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -1738,26 +1738,33 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { // read bpe merges and populate bpe ranks const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str()); + // Kimi-K2 uses custom tokenization without traditional BPE merges + const bool is_kimi_k2 = (tokenizer_pre == "kimi-k2"); + if (merges_keyidx == -1) { - throw std::runtime_error("cannot find tokenizer merges in model file\n"); - } - - const int n_merges = gguf_get_arr_n(ctx, merges_keyidx); - for (int i = 0; i < n_merges; i++) { - const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i); - //GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0); - - std::string first; - std::string second; - - const size_t pos = word.find(' ', 1); - - if (pos != std::string::npos) { - first = word.substr(0, pos); - second = word.substr(pos + 1); + if (!is_kimi_k2) { + throw std::runtime_error("cannot find tokenizer merges in model file\n"); } + // Kimi-K2 doesn't need merges, skip + LLAMA_LOG_INFO("%s: Kimi-K2 tokenizer detected, skipping BPE merges\n", __func__); + } else { + const int n_merges = gguf_get_arr_n(ctx, merges_keyidx); + for (int i = 0; i < n_merges; i++) { + const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i); + //GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0); - bpe_ranks.emplace(std::make_pair(first, second), i); + std::string first; + std::string second; + + const size_t pos = word.find(' ', 1); + + if (pos != std::string::npos) { + first = word.substr(0, pos); + second = word.substr(pos + 1); + } + + bpe_ranks.emplace(std::make_pair(first, second), i); + } } // default special tokens From 139548d07011c27a719c0ab24f79073fe1cd0e1f Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Tue, 2 Dec 2025 12:11:15 +0800 Subject: [PATCH 09/58] remove "const int64_t n_seq_tokens = q->ne[2];" to get rid of unused variable warning --- ggml/src/ggml.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 8bf562e8b1..1703cad5ab 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1134,7 +1134,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "glu(x)", }; -static_assert(GGML_OP_COUNT == 95, "GGML_OP_COUNT != 95"); +static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -5482,7 +5482,6 @@ struct ggml_tensor * ggml_kda_scan( // h_new: {head_dim, head_dim, n_head, n_seqs} const int64_t head_dim = h->ne[0]; const int64_t n_head = q->ne[1]; - const int64_t n_seq_tokens = q->ne[2]; const int64_t n_seqs = q->ne[3]; struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ggml_nelements(q) + head_dim * head_dim * n_head * n_seqs); From 83d328d0d38ce81abb92c77f89b8e1567833ad1f Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Tue, 2 Dec 2025 14:09:02 +0800 Subject: [PATCH 10/58] remove type mismatch warning --- src/llama-graph.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index e41d65398f..f877267289 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1817,7 +1817,7 @@ ggml_tensor * llm_graph_context::build_rs( // copy extra states which won't be changed further (between n_seqs and n_rs) // Skip if there are no extra states to copy (n_rs == n_seqs) - if (arch != LLM_ARCH_KIMI_LINEAR || n_rs > n_seqs) { // arch check for backward compat + if (arch != 
LLM_ARCH_KIMI_LINEAR || n_rs > (u_int32_t) n_seqs) { // arch check for backward compat ggml_tensor * states_extra = ggml_get_rows(ctx0, states, state_copy_extra); ggml_build_forward_expand(gf, ggml_cpy(ctx0, From 772ca88070eb6ae079641adf6e3e8eb289952104 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Tue, 2 Dec 2025 20:16:24 +0800 Subject: [PATCH 11/58] read MoE params --- convert_hf_to_gguf.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index ba21124d6f..2808b72d76 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5046,6 +5046,17 @@ class KimiLinearModel(TextModel): head_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(head_dim) + self.gguf_writer.add_rope_freq_base(self.hparams.get("rope_theta", 10000.0)) + + # MoE params + n_experts = self.hparams.get("num_local_experts", self.hparams.get("num_experts")) + if n_experts is not None: + self.gguf_writer.add_expert_count(n_experts) + # Support both num_experts_per_tok and num_experts_per_token + n_experts_used = self.hparams.get("num_experts_per_tok", self.hparams.get("num_experts_per_token")) + if n_experts_used is not None: + self.gguf_writer.add_expert_used_count(n_experts_used) + # moe_intermediate_size (1024 for Kimi) moe_intermediate_size = self.hparams.get("moe_intermediate_size") if moe_intermediate_size is not None: From 9f1265fec16598cc9c24ae31ae38c3ae7aaa3bde Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Fri, 5 Dec 2025 19:51:02 +0800 Subject: [PATCH 12/58] removed some hard coded code --- convert_hf_to_gguf.py | 25 +++++++++++++++-- src/llama-model.cpp | 21 +++----------- src/models/kimi-linear.cpp | 57 +++++++++++++++++++++++--------------- 3 files changed, 61 insertions(+), 42 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 2808b72d76..9c36c84189 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4990,7 +4990,9 @@ class KimiLinearModel(TextModel): def set_gguf_parameters(self): super().set_gguf_parameters() self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) - + self.gguf_writer.add_leading_dense_block_count(self.hparams["first_k_dense_replace"]) + self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) + # Use find_hparam for context length # Kimi uses model_max_length n_ctx = self.find_hparam(["max_position_embeddings", "model_max_length", "n_ctx", "n_positions"], optional=True) @@ -5004,6 +5006,18 @@ class KimiLinearModel(TextModel): # KDA & MLA params # Get ssm_d_conv from linear_attn_config.short_conv_kernel_size or ssm_d_conv linear_attn_config = self.hparams.get("linear_attn_config", {}) + # n_head == 0 for KDA layers, n_head > 0 for MLA layers + # full_attention_layers list will be used to distingush layer type + _num_kv_heads = list() + _full_attn_layers = linear_attn_config["full_attn_layers"] + for il in range(self.hparams["num_hidden_layers"]): + if il+1 in _full_attn_layers: + _num_kv_heads.append(linear_attn_config["num_heads"]) + else: + _num_kv_heads.append(0) + assert(len(_num_kv_heads) == self.hparams["num_hidden_layers"]) + self.gguf_writer.add_head_count_kv(_num_kv_heads) + ssm_d_conv = self.hparams.get("ssm_d_conv") or linear_attn_config.get("short_conv_kernel_size") if ssm_d_conv is not None: self.gguf_writer.add_ssm_conv_kernel(ssm_d_conv) @@ -5046,7 +5060,14 @@ class KimiLinearModel(TextModel): head_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] 
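+        # NOTE: this is hidden_size // num_attention_heads (2304 // 32 = 72 for Kimi-Linear-48B),
+        # i.e. the full attention head size, not the MLA qk_rope_head_dim (64)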
self.gguf_writer.add_rope_dimension_count(head_dim) - self.gguf_writer.add_rope_freq_base(self.hparams.get("rope_theta", 10000.0)) + # Copied from Qwen2Moe as this model inherits parts of it + # YaRN is not enabled by default + # To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts + rope_scaling = self.hparams.get("rope_scaling") or {} + if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) + self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) + self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) # MoE params n_experts = self.hparams.get("num_local_experts", self.hparams.get("num_experts")) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 763f0dfecb..0f162cdd7a 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -120,6 +120,7 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_16B_A1B: return "16B.A1B"; case LLM_TYPE_21B_A3B: return "21B.A3B"; case LLM_TYPE_30B_A3B: return "30B.A3B"; + case LLM_TYPE_48B_A3B: return "48B.A3B"; case LLM_TYPE_100B_A6B: return "100B.A6B"; case LLM_TYPE_106B_A12B: return "106B.A12B"; case LLM_TYPE_230B_A10B: return "230B.A10B"; @@ -2299,13 +2300,9 @@ void llama_model::load_hparams(llama_model_loader & ml) { // qk_rope_head_dim = 64, qk_nope_head_dim = 128, qk_head_dim = 192 // Mark KDA layers as recurrent using n_head_kv pattern (like Jamba) - // MLA layers are at: 3, 7, 11, 15, 19, 23, 26 (7 MLA layers total) - // KDA layers are all others: 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20, 21, 22, 24, 25 (20 KDA layers) // Set n_head_kv = 0 for KDA layers (recurrent), n_head_kv = n_head for MLA layers (attention) for (uint32_t i = 0; i < hparams.n_layer; ++i) { - bool is_mla = (i == 3 || i == 7 || i == 11 || i == 15 || i == 19 || i == 23 || i == 26); - hparams.n_head_kv_arr[i] = is_mla ? 
hparams.n_head() : 0; - hparams.recurrent_layer_arr[i] = !is_mla; // KDA layers are recurrent + hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0; // KDA layers are recurrent } // MoE parameters - Kimi uses moe_intermediate_size = 1024 @@ -2316,18 +2313,8 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false); ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false); - // Default values if not in GGUF - if (hparams.n_ff_exp == 0) hparams.n_ff_exp = 1024; // moe_intermediate_size - if (hparams.n_ff_shexp == 0) hparams.n_ff_shexp = 9216; // shared_expert_intermediate_size = intermediate_size - if (hparams.n_expert_shared == 0) hparams.n_expert_shared = 1; // num_shared_experts - if (hparams.n_layer_dense_lead == 0) hparams.n_layer_dense_lead = 1; // first_k_dense_replace - if (hparams.expert_weights_scale == 0.0f) hparams.expert_weights_scale = 2.446f; // routed_scaling_factor - - // MoE gating function - Kimi uses sigmoid (moe_router_activation_func: sigmoid) - if (hparams.expert_gating_func == 0) hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID; - switch (hparams.n_layer) { - case 27: type = LLM_TYPE_48B; break; // Kimi-Linear-48B-A3B + case 27: type = LLM_TYPE_48B_A3B; break; // Kimi-Linear-48B-A3B default: type = LLM_TYPE_UNKNOWN; } } break; @@ -7894,6 +7881,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_ARWKV7: case LLM_ARCH_WAVTOKENIZER_DEC: case LLM_ARCH_NEMOTRON_H: + case LLM_ARCH_KIMI_LINEAR: return LLAMA_ROPE_TYPE_NONE; // use what we call a normal RoPE, operating on pairs of consecutive head values @@ -7912,7 +7900,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_ARCTIC: case LLM_ARCH_DEEPSEEK: case LLM_ARCH_DEEPSEEK2: - case LLM_ARCH_KIMI_LINEAR: case LLM_ARCH_PLM: case LLM_ARCH_CHATGLM: case LLM_ARCH_GLM4: diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index 660cd06f0e..40fbe469b3 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -339,6 +339,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll ggml_row_size(kv->type, n_embd_head_qk_nope)); k_nope = ggml_cont(ctx0, k_nope); Vcur = ggml_cont(ctx0, Vcur); + cb(Vcur, "mla_V", il); // Concatenate k_nope + k_pe (broadcast k_pe to all heads) // K = [k_nope, k_pe] where k_nope is [qk_nope_head_dim, n_head, n_tokens] @@ -349,12 +350,11 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll ggml_tensor * k_pe_repeated = ggml_repeat(ctx0, k_pe, k_pe_target); ggml_tensor * Kcur = ggml_concat(ctx0, k_nope, k_pe_repeated, 0); cb(Kcur, "mla_K", il); - cb(Vcur, "mla_V", il); // Direct softmax attention (without KV cache) // Use build_attn with inp_no_cache for proper mask handling - cur = build_attn(inp_no_cache, layer.wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale_mla, il); - cb(cur, "mla_out", il); + cur = build_attn(inp_no_cache, layer.wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale_mla, il); +// cb(cur, "mla_out", il); } else { // Unknown layer type - this should not happen @@ -375,18 +375,33 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll cur = build_norm(ffn_inp, layer.ffn_norm, NULL, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - // FFN / MoE - if (layer.ffn_gate_inp) { + if ((uint32_t) il < hparams.n_layer_dense_lead) { + // Dense FFN layer + cur = 
build_ffn(cur, + layer.ffn_up, NULL, NULL, + layer.ffn_gate, NULL, NULL, + layer.ffn_down, NULL, NULL, + NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } else { // MoE layer // Kimi uses moe_renormalize=True and routed_scaling_factor (stored as expert_weights_scale) = 2.446 - ggml_tensor * moe_out = build_moe_ffn(cur, layer.ffn_gate_inp, layer.ffn_up_exps, layer.ffn_gate_exps, layer.ffn_down_exps, - layer.ffn_exp_probs_b, hparams.n_expert, hparams.n_expert_used, - LLM_FFN_SILU, true, true, hparams.expert_weights_scale, - (llama_expert_gating_func_type) hparams.expert_gating_func, il); + ggml_tensor * moe_out = build_moe_ffn(cur, + layer.ffn_gate_inp, + layer.ffn_up_exps, + layer.ffn_gate_exps, + layer.ffn_down_exps, + layer.ffn_exp_probs_b, + hparams.n_expert, + hparams.n_expert_used, + LLM_FFN_SILU, true, + true, hparams.expert_weights_scale, + (llama_expert_gating_func_type) hparams.expert_gating_func, + il); cb(moe_out, "ffn_moe_out", il); - // Shared expert (if present) - if (layer.ffn_gate_shexp) { + // Shared expert + { ggml_tensor * ffn_shexp = build_ffn(cur, layer.ffn_up_shexp, NULL, NULL, layer.ffn_gate_shexp, NULL, NULL, @@ -396,27 +411,23 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll cur = ggml_add(ctx0, moe_out, ffn_shexp); cb(cur, "ffn_out", il); - } else { - cur = moe_out; } - } else if (layer.ffn_gate) { - // Dense FFN layer - cur = build_ffn(cur, layer.ffn_up, NULL, NULL, layer.ffn_gate, NULL, NULL, - layer.ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); - cb(cur, "ffn_out", il); - } else { - // No FFN - this should not happen in Kimi - GGML_ABORT("Kimi layer missing FFN tensors"); } - // Residual cur = ggml_add(ctx0, cur, ffn_inp); + + cur = build_cvec(cur, il); + cb(cur, "l_out", il); + inpL = cur; } + cur = inpL; // Final Norm - cur = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1); + cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res->t_embd = cur; // Output cur = ggml_mul_mat(ctx0, model.output, cur); From a0269af2928ff17468f54ee837693d20f3d0647d Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Sat, 6 Dec 2025 11:51:16 +0800 Subject: [PATCH 13/58] removed all hard code --- convert_hf_to_gguf.py | 10 +++++++++- gguf-py/gguf/constants.py | 6 ++++++ gguf-py/gguf/gguf_writer.py | 3 +++ src/llama-arch.cpp | 2 ++ src/llama-arch.h | 2 ++ src/llama-hparams.cpp | 2 +- src/llama-hparams.h | 5 ++--- src/llama-model.cpp | 20 +++++++++----------- src/llama-model.h | 2 +- src/models/kimi-linear.cpp | 14 +++++++------- 10 files changed, 42 insertions(+), 24 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 9c36c84189..45538fcabb 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5021,6 +5021,13 @@ class KimiLinearModel(TextModel): ssm_d_conv = self.hparams.get("ssm_d_conv") or linear_attn_config.get("short_conv_kernel_size") if ssm_d_conv is not None: self.gguf_writer.add_ssm_conv_kernel(ssm_d_conv) + + kda_head_dim = self.hparams.get("kda_head_dim") or linear_attn_config.get("head_dim") + + if kda_head_dim is not None: + self.gguf_writer.add_kda_head_dim(kda_head_dim) + + # MLA params - use add_* methods that handle arch substitution # MLA params - use add_* methods that handle arch substitution # Support both HuggingFace naming (q_lora_rank, kv_lora_rank) and internal naming (n_lora_q, n_lora_kv) @@ -5035,8 +5042,9 @@ class KimiLinearModel(TextModel): # MLA head dimensions # Support HuggingFace naming: 
qk_nope_head_dim, qk_rope_head_dim, v_head_dim qk_nope_head_dim = self.hparams.get("qk_nope_head_dim") - qk_rope_head_dim = self.hparams.get("qk_rope_head_dim", self.hparams.get("n_rot")) + qk_rope_head_dim = self.hparams.get("qk_rope_head_dim") v_head_dim = self.hparams.get("v_head_dim") + self.gguf_writer.add_rope_dimension_count(self.hparams["qk_rope_head_dim"]) # Calculate n_embd_head_k_mla = qk_nope_head_dim + qk_rope_head_dim if "n_embd_head_k_mla" in self.hparams: diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 485c41abfb..fe9785918b 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -205,6 +205,9 @@ class Keys: GROUP_COUNT = "{arch}.ssm.group_count" DT_B_C_RMS = "{arch}.ssm.dt_b_c_rms" + class KDA: + HEAD_DIM = "{arch}.kda.head_dim" + class WKV: HEAD_SIZE = "{arch}.wkv.head_size" @@ -3475,6 +3478,9 @@ KEY_SSM_TIME_STEP_RANK = Keys.SSM.TIME_STEP_RANK KEY_SSM_GROUP_COUNT = Keys.SSM.GROUP_COUNT KEY_SSM_DT_B_C_RMS = Keys.SSM.DT_B_C_RMS +# KDA +KEY_KDA_HEAD_DIM = Keys.KDA.HEAD_DIM + # tokenization KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL KEY_TOKENIZER_PRE = Keys.Tokenizer.PRE diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 9e6ff3ac77..3b2dfef479 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -970,6 +970,9 @@ class GGUFWriter: def add_ssm_dt_b_c_rms(self, value: bool) -> None: self.add_bool(Keys.SSM.DT_B_C_RMS.format(arch=self.arch), value) + def add_kda_head_dim(self, value: int) -> None: + self.add_uint32(Keys.KDA.HEAD_DIM.format(arch=self.arch), value) + def add_tokenizer_model(self, model: str) -> None: self.add_string(Keys.Tokenizer.MODEL, model) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index ab09bb7eb7..6aabdb7f7d 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -236,6 +236,8 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_SSM_GROUP_COUNT, "%s.ssm.group_count" }, { LLM_KV_SSM_DT_B_C_RMS, "%s.ssm.dt_b_c_rms" }, + { LLM_KV_KDA_HEAD_DIM, "%s.kda.head_dim" }, + { LLM_KV_WKV_HEAD_SIZE, "%s.wkv.head_size" }, { LLM_KV_POSNET_EMBEDDING_LENGTH, "%s.posnet.embedding_length" }, diff --git a/src/llama-arch.h b/src/llama-arch.h index 2b965850c5..d68af214a7 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -240,6 +240,8 @@ enum llm_kv { LLM_KV_SSM_GROUP_COUNT, LLM_KV_SSM_DT_B_C_RMS, + LLM_KV_KDA_HEAD_DIM, + LLM_KV_WKV_HEAD_SIZE, LLM_KV_TOKENIZER_MODEL, diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index 88d266b8da..75ddeeba09 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -137,7 +137,7 @@ uint32_t llama_hparams::n_embd_r() const { // for Kimi KDA layers // Conv state for Q, K, V: 3 * (d_conv - 1) * n_head * head_dim const uint32_t d_inner = n_head() * kda_head_dim; // 32 * 128 = 4096 - return 3 * (kda_d_conv > 0 ? kda_d_conv - 1 : 3) * d_inner; + return 3 * (ssm_d_conv > 0 ? 
ssm_d_conv - 1 : 3) * d_inner; } // TODO: maybe support other convolution strides than 1 diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 80170650eb..c90ed12b90 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -133,9 +133,8 @@ struct llama_hparams { uint32_t ssm_dt_rank = 0; uint32_t ssm_n_group = 0; - // for Kimi Delta Attention (KDA) - uint32_t kda_head_dim = 0; // head_dim for KDA layers (128 for Kimi) - uint32_t kda_d_conv = 0; // conv kernel size for KDA (4 for Kimi) + // for Kimi Linear KDA + uint32_t kda_head_dim = 0; // for hybrid state space models std::array recurrent_layer_arr; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 0f162cdd7a..2e3cb9d78c 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2291,10 +2291,8 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false); ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv, false); ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false); - - // KDA (Delta Attention) parameters - hparams.kda_head_dim = 128; // linear_attn_config.head_dim - hparams.kda_d_conv = 4; // linear_attn_config.short_conv_kernel_size + ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv, false); + ml.get_key(LLM_KV_KDA_HEAD_DIM, hparams.kda_head_dim, false); // MLA qk_rope_head_dim (for reference) // qk_rope_head_dim = 64, qk_nope_head_dim = 128, qk_head_dim = 192 @@ -6447,9 +6445,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // Assuming KDA layer if KDA tensors are present // KDA uses head_dim = 128 (from linear_attn_config.head_dim) - const int64_t n_embd_head_k_kda = 128; - const int64_t n_embd_head_v_kda = 128; - const int64_t ssm_d_conv = hparams.ssm_d_conv > 0 ? hparams.ssm_d_conv : 4; + const int64_t n_embd_head_k_kda = hparams.kda_head_dim; + const int64_t n_embd_head_v_kda = hparams.kda_head_dim; + const int64_t ssm_d_conv = hparams.ssm_d_conv; // Try loading KDA specific tensors (using SSM_ prefix) // Conv1d weights: try 4D first, then 3D (quantization may remove trailing 1) @@ -6513,8 +6511,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // MLA Layer - use MLA-specific head dimensions const int64_t q_lora_rank = hparams.n_lora_q; const int64_t kv_lora_rank = hparams.n_lora_kv; - const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla > 0 ? hparams.n_embd_head_k_mla : 192; - const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla > 0 ? 
hparams.n_embd_head_v_mla : 128; + const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla; + const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla; layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, TENSOR_NOT_REQUIRED); layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0); @@ -6529,7 +6527,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // Kimi: qk_rope_head_dim = 64 (actual RoPE dimension for MLA) // Note: hparams.n_rot may be 72 (from conversion) but actual is 64 - const int64_t qk_rope_head_dim = 64; // From config: qk_rope_head_dim + const int64_t qk_rope_head_dim = hparams.n_rot; // From config: qk_rope_head_dim layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + qk_rope_head_dim}, 0); layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_k_mla - qk_rope_head_dim + n_embd_head_v_mla)}, 0); @@ -6539,7 +6537,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); // MoE intermediate size (different from dense FFN) - const int64_t n_ff_exp = hparams.n_ff_exp > 0 ? hparams.n_ff_exp : 1024; + const int64_t n_ff_exp = hparams.n_ff_exp; // Kimi uses n_layer_dense_lead to determine which layers use dense FFN vs MoE // first_k_dense_replace = 1 means layer 0 uses dense FFN, layers 1+ use MoE diff --git a/src/llama-model.h b/src/llama-model.h index b067b686d2..7081423588 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -84,7 +84,6 @@ enum llm_type { LLM_TYPE_35B, LLM_TYPE_36B, LLM_TYPE_40B, - LLM_TYPE_48B, LLM_TYPE_65B, LLM_TYPE_70B, LLM_TYPE_120B, @@ -114,6 +113,7 @@ enum llm_type { LLM_TYPE_16B_A1B, LLM_TYPE_21B_A3B, // Ernie MoE small LLM_TYPE_30B_A3B, + LLM_TYPE_48B_A3B, // Kimi Linear LLM_TYPE_80B_A3B, // Qwen3 Next LLM_TYPE_100B_A6B, LLM_TYPE_106B_A12B, // GLM-4.5-Air diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index 40fbe469b3..d025eab5f3 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -21,8 +21,8 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // Kimi dimension constants const int64_t n_head = hparams.n_head(); - const int64_t head_dim = hparams.kda_head_dim > 0 ? hparams.kda_head_dim : 128; - const int64_t d_conv = hparams.kda_d_conv > 0 ? hparams.kda_d_conv : 4; + const int64_t head_dim = hparams.kda_head_dim; + const int64_t d_conv = hparams.ssm_d_conv; const int64_t d_inner = n_head * head_dim; // 32 * 128 = 4096 const int64_t n_seqs = ubatch.n_seqs; const int64_t n_seq_tokens = ubatch.n_seq_tokens; @@ -33,12 +33,12 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); // MLA params - const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla > 0 ? hparams.n_embd_head_k_mla : 192; - const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla > 0 ? hparams.n_embd_head_v_mla : 128; - const int64_t kv_lora_rank = hparams.n_lora_kv > 0 ? 
hparams.n_lora_kv : 512; - // qk_rope_head_dim = 64 (from Kimi config), NOT hparams.n_rot (which is 72) + const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla; + const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla; + const int64_t kv_lora_rank = hparams.n_lora_kv; + // qk_rope_head_dim = 64 (from Kimi config) which is hparams.n_rot // Confirmed from tensor shape: wkv_a_mqa [2304, 576] = [n_embd, kv_lora_rank + qk_rope_head_dim] - const int64_t n_embd_head_qk_rope = 64; // config.qk_rope_head_dim + const int64_t n_embd_head_qk_rope = hparams.n_rot; // config.qk_rope_head_dim const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope; // 192 - 64 = 128 // Attention scale for KDA (1/sqrt(head_dim)) From ef5bc3054484c4696ead7289395e617893086050 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Sun, 14 Dec 2025 17:43:30 +0800 Subject: [PATCH 14/58] use DeepseekV2 tokenizer --- convert_hf_to_gguf.py | 104 +++++++++++++++++++----------------------- 1 file changed, 46 insertions(+), 58 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 45538fcabb..ac353c7dda 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5107,71 +5107,59 @@ class KimiLinearModel(TextModel): self.gguf_writer.add_expert_weights_scale(routed_scaling_factor) def set_vocab(self): - # Kimi uses TikToken tokenizer - load via transformers + try: + self._set_vocab_gpt2() + return + except Exception: + pass + from transformers import AutoTokenizer - - dir_model = self.dir_model - vocab_size = self.hparams["vocab_size"] - - logger.info(f"Loading TikToken tokenizer from {dir_model}") - tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) - - tokens: list[str] = [] - toktypes: list[int] = [] - - # Get tokenizer pre string + tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) tokpre = self.get_vocab_base_pre(tokenizer) - - # Build vocab from tokenizer - merges = [] - vocab = {} - - # TikToken stores vocab in mergeable_ranks - if hasattr(tokenizer, 'mergeable_ranks'): - mergeable_ranks = tokenizer.mergeable_ranks + + if tokpre == "kimi-k2": + # Build merges list using the approach similar to HunYuanMoE + merges = [] + vocab = {} + mergeable_ranks = tokenizer.model._mergeable_ranks for token, rank in mergeable_ranks.items(): - vocab[self._token_bytes_to_string(token)] = rank + vocab[QwenModel.token_bytes_to_string(token)] = rank if len(token) == 1: continue - # Build merges - merged = self._bpe(mergeable_ranks, token, max_rank=rank) + merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) if len(merged) == 2: - merges.append(' '.join(map(self._token_bytes_to_string, merged))) + merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) + + # Build token list + vocab_size = self.hparams["vocab_size"] + special_tokens = tokenizer.special_tokens + reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()} + tokens: list[str] = [] + toktypes: list[int] = [] + + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.UNUSED) + else: + token = reverse_vocab[i] + tokens.append(token) + if i in special_tokens.values(): + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.NORMAL) + + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + 
self.gguf_writer.add_token_merges(merges) + + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) + special_vocab.add_to_gguf(self.gguf_writer) else: - # Fallback: get vocab directly - vocab = {tok: idx for tok, idx in tokenizer.get_vocab().items()} - - # Get special tokens - added_vocab = {} - if hasattr(tokenizer, 'special_tokens'): - added_vocab = tokenizer.special_tokens - elif hasattr(tokenizer, 'added_tokens_encoder'): - added_vocab = tokenizer.added_tokens_encoder - - # Combine vocab - reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()} - - for i in range(vocab_size): - if i not in reverse_vocab: - tokens.append(f"[PAD{i}]") - toktypes.append(gguf.TokenType.UNUSED) - elif i in added_vocab.values() if added_vocab else False: - tokens.append(reverse_vocab[i]) - toktypes.append(gguf.TokenType.CONTROL) - else: - tokens.append(reverse_vocab[i]) - toktypes.append(gguf.TokenType.NORMAL) - - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(dir_model, load_merges=False) - special_vocab.merges = merges - special_vocab.add_to_gguf(self.gguf_writer) - logger.info(f"Loaded {len(tokens)} tokens, {len(merges)} merges") - + raise NotImplementedError(f"Deepseek pre-tokenizer {tokpre!r} is not supported yet!") + @staticmethod def _token_bytes_to_string(b: bytes) -> str: """Convert bytes to string representation for tokenizer""" From ae9771d1dca927ab4b6e37510d4dd36cd40fe77a Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Thu, 18 Dec 2025 08:14:15 +0800 Subject: [PATCH 15/58] removed unnecessary internal methods called by the old set_vocab of KimiLinear --- convert_hf_to_gguf.py | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index ac353c7dda..c6724f2ed5 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5160,28 +5160,6 @@ class KimiLinearModel(TextModel): else: raise NotImplementedError(f"Deepseek pre-tokenizer {tokpre!r} is not supported yet!") - @staticmethod - def _token_bytes_to_string(b: bytes) -> str: - """Convert bytes to string representation for tokenizer""" - return ''.join([chr(byte) if byte < 128 else f'<0x{byte:02X}>' for byte in b]) - - @staticmethod - def _bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]: - """BPE tokenization for merges extraction""" - parts = [bytes([b]) for b in token] - while True: - min_idx = None - min_rank = None - for i, pair in enumerate(zip(parts[:-1], parts[1:])): - rank = mergeable_ranks.get(pair[0] + pair[1]) - if rank is not None and (min_rank is None or rank < min_rank): - min_idx = i - min_rank = rank - if min_rank is None or (max_rank is not None and min_rank >= max_rank): - break - parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:] - return parts - def prepare_tensors(self): super().prepare_tensors() if self._experts is not None: From f9a11d7758924a3ab2bf59eab97f4379ef7086d7 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Thu, 18 Dec 2025 20:46:10 +0800 Subject: [PATCH 16/58] rewrite get_vocab for KimiLinear. 
Removed all kda_scan code --- ggml/src/ggml-cpu/ggml-cpu.c | 4 - ggml/src/ggml-cpu/ops.cpp | 187 ---------------------------- ggml/src/ggml-cpu/ops.h | 1 - ggml/src/ggml-cuda/ggml-cuda.cu | 4 - ggml/src/ggml-cuda/kda-scan.cu | 209 -------------------------------- ggml/src/ggml-cuda/kda-scan.cuh | 3 - ggml/src/ggml.c | 63 ---------- 7 files changed, 471 deletions(-) delete mode 100644 ggml/src/ggml-cuda/kda-scan.cu delete mode 100644 ggml/src/ggml-cuda/kda-scan.cuh diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 7b40f1e8c2..4cc15b0981 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -1962,10 +1962,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_ssm_scan(params, tensor); } break; - case GGML_OP_KDA_SCAN: - { - ggml_compute_forward_kda_scan(params, tensor); - } break; case GGML_OP_WIN_PART: { ggml_compute_forward_win_part(params, tensor); diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 9c93e0c101..a5ecd35e34 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -8686,7 +8686,6 @@ static void ggml_compute_forward_ssm_conv_f32( const int ir1 = MIN(ir0 + dr, nr); const int ir = ir1 - ir0; - static int conv_debug_count = 0; bool do_conv_debug = false; // (ith == 0 && conv_debug_count++ < 3); for (int i3 = 0; i3 < n_s; ++i3) { @@ -8966,192 +8965,6 @@ void ggml_compute_forward_ssm_scan( } } -// ggml_compute_forward_kda_scan -// KDA (Kimi Delta Attention) recurrence: -// h[t] = exp(g[t]) * h[t-1] + k[t]^T * (beta[t] * (v[t] - h[t-1] @ k[t])) -// o[t] = q[t]^T @ h[t] - -static void ggml_compute_forward_kda_scan_f32( - const ggml_compute_params * params, - ggml_tensor * dst) { - const ggml_tensor * src0 = dst->src[0]; // h {head_dim, head_dim, n_head, n_seqs+} - const ggml_tensor * src1 = dst->src[1]; // q {head_dim, n_head, n_seq_tokens, n_seqs} - const ggml_tensor * src2 = dst->src[2]; // k {head_dim, n_head, n_seq_tokens, n_seqs} - const ggml_tensor * src3 = dst->src[3]; // v {head_dim, n_head, n_seq_tokens, n_seqs} - const ggml_tensor * src4 = dst->src[4]; // g {head_dim, n_head, n_seq_tokens, n_seqs} - const ggml_tensor * src5 = dst->src[5]; // beta {n_head, n_seq_tokens, n_seqs} - const ggml_tensor * src6 = dst->src[6]; // ids {n_seqs} - - const int ith = params->ith; - const int nth = params->nth; - - const int64_t head_dim = src0->ne[0]; - const int64_t n_head = src1->ne[1]; - const int64_t n_seq_tokens = src1->ne[2]; - const int64_t n_seqs = src1->ne[3]; - - // Output offset for hidden state - const int64_t y_off = ggml_nelements(src1) * sizeof(float); - - GGML_ASSERT(src0->nb[0] == sizeof(float)); - GGML_ASSERT(src1->nb[0] == sizeof(float)); - GGML_ASSERT(src2->nb[0] == sizeof(float)); - GGML_ASSERT(src3->nb[0] == sizeof(float)); - GGML_ASSERT(src4->nb[0] == sizeof(float)); - GGML_ASSERT(src5->nb[0] == sizeof(float)); - GGML_ASSERT(src6->nb[0] == sizeof(int32_t)); - - // Parallelize over heads - const int dh = (n_head + nth - 1) / nth; - const int ih0 = dh * ith; - const int ih1 = MIN(ih0 + dh, (int)n_head); - - const int32_t * ids = (const int32_t *) src6->data; - - // Temporary buffer for h @ k computation - float * hk_buf = (float *) malloc(head_dim * sizeof(float)); - - static int debug_count = 0; - bool do_debug = false; // (ith == 0 && debug_count++ < 20); - - for (int i3 = 0; i3 < n_seqs; ++i3) { - // Get initial hidden state for this sequence - const float * h0 = (const float *) ((const char *) src0->data + ids[i3] * 
src0->nb[3]); - // Output hidden state location - float * h_out = (float *) ((char *) dst->data + i3 * src0->nb[3] + y_off); - - for (int ih = ih0; ih < ih1; ++ih) { - // Per-head hidden state: [head_dim, head_dim] - // Copy initial state to output (will be updated in place) - const float * h_in = h0 + ih * head_dim * head_dim; - float * h = h_out + ih * head_dim * head_dim; - - // Copy initial state, but check for invalid values and clear if needed - bool need_clear = false; - for (int i = 0; i < head_dim * head_dim && !need_clear; ++i) { - if (!isfinite(h_in[i]) || fabsf(h_in[i]) > 1e6f) { - need_clear = true; - } - } - for (int i = 0; i < head_dim * head_dim; ++i) { - h[i] = need_clear ? 0.0f : h_in[i]; - } - - for (int it = 0; it < n_seq_tokens; ++it) { - const float * q_raw = (const float *) ((const char *) src1->data + - it * src1->nb[2] + i3 * src1->nb[3]) + ih * head_dim; - const float * k_raw = (const float *) ((const char *) src2->data + - it * src2->nb[2] + i3 * src2->nb[3]) + ih * head_dim; - const float * v = (const float *) ((const char *) src3->data + - it * src3->nb[2] + i3 * src3->nb[3]) + ih * head_dim; - const float * g = (const float *) ((const char *) src4->data + - it * src4->nb[2] + i3 * src4->nb[3]) + ih * head_dim; - const float beta = ((const float *) ((const char *) src5->data + - it * src5->nb[1] + i3 * src5->nb[2]))[ih]; - - float * y = (float *) dst->data + - it * n_head * head_dim + i3 * n_seq_tokens * n_head * head_dim + ih * head_dim; - - // L2 normalize q and k (critical for KDA stability) - float q_norm = 0.0f, k_norm = 0.0f; - for (int i = 0; i < head_dim; ++i) { - q_norm += q_raw[i] * q_raw[i]; - k_norm += k_raw[i] * k_raw[i]; - } - q_norm = sqrtf(q_norm + 1e-6f); - k_norm = sqrtf(k_norm + 1e-6f); - - // Debug output - if (do_debug && ih == 0 && it == 0 && i3 == 0) { - fprintf(stderr, "DEBUG KDA: q_raw[0]=%f, k_raw[0]=%f, v[0]=%f, g[0]=%f, beta=%f\n", - q_raw[0], k_raw[0], v[0], g[0], beta); - fprintf(stderr, "DEBUG KDA: q_norm=%f, k_norm=%f, exp(g[0])=%f, scale=%f\n", - q_norm, k_norm, expf(g[0]), 1.0f / sqrtf((float)head_dim)); - } - - // Normalized q and k with scale = 1/sqrt(head_dim) - // Note: scale is applied only to q after L2 normalization - const float scale = 1.0f / sqrtf((float)head_dim); - float q[128], k[128]; // assume head_dim <= 128 - for (int i = 0; i < head_dim; ++i) { - // L2 normalize then scale q - q[i] = (q_raw[i] / q_norm) * scale; - // L2 normalize k (no scale) - k[i] = k_raw[i] / k_norm; - } - - // KDA recurrence: h[t] = exp(g[t]) * h[t-1] + k[t]^T * (beta[t] * (v[t] - h[t-1] @ k[t])) - // Note: Apply decay first, then compute retrieval and update - - // Step 1: Apply decay to h first: h = h * exp(g) - for (int i = 0; i < head_dim; ++i) { - const float exp_gi = expf(g[i]); - for (int j = 0; j < head_dim; ++j) { - h[i * head_dim + j] *= exp_gi; - } - } - - // Step 2: Compute h^T @ k -> hk_buf [head_dim] - // hk_buf[j] = sum_i (h[i,j] * k[i]) which is column j of h dotted with k - for (int j = 0; j < head_dim; ++j) { - float sum = 0.0f; - for (int i = 0; i < head_dim; ++i) { - sum += h[i * head_dim + j] * k[i]; - } - hk_buf[j] = sum; - } - - // Step 3: Compute delta = beta * (v - hk) and update h - // h = h + outer(k, delta) where outer(k,delta)[i,j] = k[i] * delta[j] - for (int i = 0; i < head_dim; ++i) { - for (int j = 0; j < head_dim; ++j) { - const float delta_j = beta * (v[j] - hk_buf[j]); - h[i * head_dim + j] += k[i] * delta_j; - } - } - - // Step 4: Compute output y = h^T @ q -> [head_dim] - // vLLM: b_o = tl.sum(b_h * 
b_q[:, None], 0) means o[j] = sum_i(h[i,j] * q[i]) - for (int j = 0; j < head_dim; ++j) { - float sum = 0.0f; - for (int i = 0; i < head_dim; ++i) { - sum += h[i * head_dim + j] * q[i]; - } - y[j] = sum; - } - - // Debug output - if (do_debug && ih == 0 && it == 0 && i3 == 0) { - // Find max abs value in h for stability check - float h_max = 0.0f; - for (int i = 0; i < head_dim * head_dim; i++) { - if (fabsf(h[i]) > h_max) h_max = fabsf(h[i]); - } - fprintf(stderr, "DEBUG KDA: y[0]=%.6f, h_max=%.6f, exp(g[0])=%.6f\n", - y[0], h_max, expf(g[0])); - } - } - } - } - - free(hk_buf); -} - -void ggml_compute_forward_kda_scan( - const ggml_compute_params * params, - ggml_tensor * dst) { - switch (dst->src[0]->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_kda_scan_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - // ggml_compute_forward_win_part static void ggml_compute_forward_win_part_f32( diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h index 080cf6e090..0fdfee7976 100644 --- a/ggml/src/ggml-cpu/ops.h +++ b/ggml/src/ggml-cpu/ops.h @@ -92,7 +92,6 @@ void ggml_compute_forward_flash_attn_back( struct ggml_tensor * dst); void ggml_compute_forward_ssm_conv(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_ssm_scan(const struct ggml_compute_params * params, struct ggml_tensor * dst); -void ggml_compute_forward_kda_scan(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_win_part(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_win_unpart(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_unary(const struct ggml_compute_params * params, struct ggml_tensor * dst); diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 8e0d7d916e..bbd7810dd2 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -41,7 +41,6 @@ #include "ggml-cuda/softmax.cuh" #include "ggml-cuda/ssm-conv.cuh" #include "ggml-cuda/ssm-scan.cuh" -#include "ggml-cuda/kda-scan.cuh" #include "ggml-cuda/sum.cuh" #include "ggml-cuda/sumrows.cuh" #include "ggml-cuda/mean.cuh" @@ -2693,9 +2692,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_SSM_SCAN: ggml_cuda_op_ssm_scan(ctx, dst); break; - case GGML_OP_KDA_SCAN: - ggml_cuda_op_kda_scan(ctx, dst); - break; case GGML_OP_ARGSORT: ggml_cuda_op_argsort(ctx, dst); break; diff --git a/ggml/src/ggml-cuda/kda-scan.cu b/ggml/src/ggml-cuda/kda-scan.cu deleted file mode 100644 index 5763f1cc90..0000000000 --- a/ggml/src/ggml-cuda/kda-scan.cu +++ /dev/null @@ -1,209 +0,0 @@ -#include "kda-scan.cuh" - -// KDA (Kimi Delta Attention) scan CUDA kernel -// Recurrence: -// h[t] = exp(g[t]) * h[t-1] + k[t]^T * (beta[t] * (v[t] - h[t-1] @ k[t])) -// o[t] = q[t]^T @ h[t] -// -// This kernel uses global memory for the hidden state to avoid shared memory limits. -// Each block processes one head for one sequence. 
- -__global__ void kda_scan_f32_kernel( - const float * __restrict__ src0, // h: [head_dim, head_dim, n_head, n_seqs+] - const float * __restrict__ src1, // q: [head_dim, n_head, n_seq_tokens, n_seqs] - const float * __restrict__ src2, // k: [head_dim, n_head, n_seq_tokens, n_seqs] - const float * __restrict__ src3, // v: [head_dim, n_head, n_seq_tokens, n_seqs] - const float * __restrict__ src4, // g: [head_dim, n_head, n_seq_tokens, n_seqs] - const float * __restrict__ src5, // beta: [n_head, n_seq_tokens, n_seqs] - const int32_t * __restrict__ src6, // ids: [n_seqs] - float * __restrict__ dst, - const int64_t head_dim, - const int64_t n_head, - const int64_t n_seq_tokens, - const int64_t n_seqs, - const int64_t y_off) // offset to state output in dst (in floats) -{ - // Each block handles one head for one sequence - const int seq_idx = blockIdx.x / n_head; - const int head_idx = blockIdx.x % n_head; - const int tid = threadIdx.x; - const int n_threads = blockDim.x; - - if (seq_idx >= n_seqs || head_idx >= n_head) return; - - // Get sequence ID for initial state - const int src_seq = src6[seq_idx]; - - // Shared memory for temporary buffers - extern __shared__ float smem[]; - float * hk_buf = smem; // [head_dim] - h @ k buffer - float * q_norm = smem + head_dim; // [head_dim] - normalized q - float * k_norm = q_norm + head_dim; // [head_dim] - normalized k - float * warp_sums = k_norm + head_dim; // [64] - for reductions - - // Pointers to input/output data for this head - const int64_t h_stride_head = head_dim * head_dim; - const int64_t h_stride_seq = h_stride_head * n_head; - const int64_t qkv_stride_head = head_dim; - const int64_t qkv_stride_token = head_dim * n_head; - const int64_t qkv_stride_seq = qkv_stride_token * n_seq_tokens; - const int64_t beta_stride_token = n_head; - const int64_t beta_stride_seq = beta_stride_token * n_seq_tokens; - - const float * h_in = src0 + src_seq * h_stride_seq + head_idx * h_stride_head; - float * h_out = dst + y_off + seq_idx * h_stride_seq + head_idx * h_stride_head; - float * y_out = dst + seq_idx * qkv_stride_seq + head_idx * qkv_stride_head; - - // Copy initial state to output (we'll update in place) - for (int i = tid; i < head_dim * head_dim; i += n_threads) { - float val = h_in[i]; - if (!isfinite(val) || fabsf(val) > 1e6f) { - val = 0.0f; - } - h_out[i] = val; - } - __syncthreads(); - - const float scale = 1.0f / sqrtf((float)head_dim); - - // Process each token sequentially - for (int t = 0; t < n_seq_tokens; ++t) { - const float * q_raw = src1 + t * qkv_stride_token + seq_idx * qkv_stride_seq + head_idx * qkv_stride_head; - const float * k_raw = src2 + t * qkv_stride_token + seq_idx * qkv_stride_seq + head_idx * qkv_stride_head; - const float * v = src3 + t * qkv_stride_token + seq_idx * qkv_stride_seq + head_idx * qkv_stride_head; - const float * g = src4 + t * qkv_stride_token + seq_idx * qkv_stride_seq + head_idx * qkv_stride_head; - const float beta = src5[t * beta_stride_token + seq_idx * beta_stride_seq + head_idx]; - float * y = y_out + t * qkv_stride_token; - - // Step 1: L2 normalize q and k - float q_sq_sum = 0.0f, k_sq_sum = 0.0f; - for (int i = tid; i < head_dim; i += n_threads) { - q_sq_sum += q_raw[i] * q_raw[i]; - k_sq_sum += k_raw[i] * k_raw[i]; - } - - // Warp reduction - for (int offset = warpSize/2; offset > 0; offset /= 2) { - q_sq_sum += __shfl_down_sync(0xffffffff, q_sq_sum, offset); - k_sq_sum += __shfl_down_sync(0xffffffff, k_sq_sum, offset); - } - - // Cross-warp reduction - int warp_id = tid / warpSize; - int 
lane_id = tid % warpSize; - if (lane_id == 0 && warp_id < 32) { - warp_sums[warp_id] = q_sq_sum; - warp_sums[32 + warp_id] = k_sq_sum; - } - __syncthreads(); - - if (tid == 0) { - float total_q = 0.0f, total_k = 0.0f; - for (int i = 0; i < (n_threads + warpSize - 1) / warpSize; ++i) { - total_q += warp_sums[i]; - total_k += warp_sums[32 + i]; - } - warp_sums[0] = rsqrtf(total_q + 1e-6f) * scale; - warp_sums[1] = rsqrtf(total_k + 1e-6f); - } - __syncthreads(); - - float q_norm_factor = warp_sums[0]; - float k_norm_factor = warp_sums[1]; - - // Store normalized q and k - for (int i = tid; i < head_dim; i += n_threads) { - q_norm[i] = q_raw[i] * q_norm_factor; - k_norm[i] = k_raw[i] * k_norm_factor; - } - __syncthreads(); - - // KDA recurrence: h[t] = exp(g[t]) * h[t-1] + k[t]^T * (beta[t] * (v[t] - h[t-1] @ k[t])) - // Apply decay first, then compute retrieval and update - - // Step 2: Apply decay to h: h = h * exp(g) - for (int idx = tid; idx < head_dim * head_dim; idx += n_threads) { - int i = idx / head_dim; - float exp_gi = expf(g[i]); - h_out[idx] *= exp_gi; - } - __syncthreads(); - - // Step 3: Compute h^T @ k -> hk_buf - for (int j = tid; j < head_dim; j += n_threads) { - float sum = 0.0f; - for (int i = 0; i < head_dim; ++i) { - sum += h_out[i * head_dim + j] * k_norm[i]; - } - hk_buf[j] = sum; - } - __syncthreads(); - - // Step 4: Update h: h = h + outer(k, beta * (v - hk)) - for (int idx = tid; idx < head_dim * head_dim; idx += n_threads) { - int i = idx / head_dim; - int j = idx % head_dim; - float delta_j = beta * (v[j] - hk_buf[j]); - h_out[idx] += k_norm[i] * delta_j; - } - __syncthreads(); - - // Step 5: Compute output y = h^T @ q - for (int j = tid; j < head_dim; j += n_threads) { - float sum = 0.0f; - for (int i = 0; i < head_dim; ++i) { - sum += h_out[i * head_dim + j] * q_norm[i]; - } - y[j] = sum; - } - __syncthreads(); - } -} - -void ggml_cuda_op_kda_scan(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - const ggml_tensor * src0 = dst->src[0]; // h - const ggml_tensor * src1 = dst->src[1]; // q - const ggml_tensor * src2 = dst->src[2]; // k - const ggml_tensor * src3 = dst->src[3]; // v - const ggml_tensor * src4 = dst->src[4]; // g - const ggml_tensor * src5 = dst->src[5]; // beta - const ggml_tensor * src6 = dst->src[6]; // ids - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT(src2->type == GGML_TYPE_F32); - GGML_ASSERT(src3->type == GGML_TYPE_F32); - GGML_ASSERT(src4->type == GGML_TYPE_F32); - GGML_ASSERT(src5->type == GGML_TYPE_F32); - GGML_ASSERT(src6->type == GGML_TYPE_I32); - - const int64_t head_dim = src0->ne[0]; - const int64_t n_head = src1->ne[1]; - const int64_t n_seq_tokens = src1->ne[2]; - const int64_t n_seqs = src1->ne[3]; - - // Output offset for hidden state (after attention output) - in floats - const int64_t y_off = ggml_nelements(src1); - - const float * h_d = (const float *)src0->data; - const float * q_d = (const float *)src1->data; - const float * k_d = (const float *)src2->data; - const float * v_d = (const float *)src3->data; - const float * g_d = (const float *)src4->data; - const float * beta_d = (const float *)src5->data; - const int32_t * ids_d = (const int32_t *)src6->data; - float * dst_d = (float *)dst->data; - - cudaStream_t stream = ctx.stream(); - - // Launch kernel: one block per (sequence, head) pair - const int n_blocks = n_seqs * n_head; - const int n_threads = 128; - - // Shared memory: hk_buf[head_dim] + q_norm[head_dim] + k_norm[head_dim] + warp_sums[64] - size_t 
smem_size = (3 * head_dim + 64) * sizeof(float); - - kda_scan_f32_kernel<<>>( - h_d, q_d, k_d, v_d, g_d, beta_d, ids_d, dst_d, - head_dim, n_head, n_seq_tokens, n_seqs, y_off); -} diff --git a/ggml/src/ggml-cuda/kda-scan.cuh b/ggml/src/ggml-cuda/kda-scan.cuh deleted file mode 100644 index 55783fb82b..0000000000 --- a/ggml/src/ggml-cuda/kda-scan.cuh +++ /dev/null @@ -1,3 +0,0 @@ -#include "common.cuh" - -void ggml_cuda_op_kda_scan(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 1703cad5ab..a167d6a574 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -5435,69 +5435,6 @@ struct ggml_tensor * ggml_ssm_scan( return result; } -// ggml_kda_scan - -struct ggml_tensor * ggml_kda_scan( - struct ggml_context * ctx, - struct ggml_tensor * h, - struct ggml_tensor * q, - struct ggml_tensor * k, - struct ggml_tensor * v, - struct ggml_tensor * g, - struct ggml_tensor * beta, - struct ggml_tensor * ids) { - GGML_ASSERT(ggml_is_contiguous(h)); - GGML_ASSERT(ggml_is_contiguous(q)); - GGML_ASSERT(ggml_is_contiguous(k)); - GGML_ASSERT(ggml_is_contiguous(v)); - GGML_ASSERT(ggml_is_contiguous(g)); - GGML_ASSERT(ggml_is_contiguous(beta)); - GGML_ASSERT(ids->type == GGML_TYPE_I32); - - { - const int64_t head_dim = h->ne[0]; - const int64_t n_head = q->ne[1]; - const int64_t n_seq_tokens = q->ne[2]; - const int64_t n_seqs = q->ne[3]; - - GGML_ASSERT(h->ne[0] == head_dim); - GGML_ASSERT(h->ne[1] == head_dim); - GGML_ASSERT(h->ne[2] == n_head); - GGML_ASSERT(q->ne[0] == head_dim); - GGML_ASSERT(k->ne[0] == head_dim); - GGML_ASSERT(v->ne[0] == head_dim); - GGML_ASSERT(g->ne[0] == head_dim); - GGML_ASSERT(ggml_are_same_shape(q, k)); - GGML_ASSERT(ggml_are_same_shape(q, v)); - GGML_ASSERT(ggml_are_same_shape(q, g)); - GGML_ASSERT(beta->ne[0] == n_head); - GGML_ASSERT(beta->ne[1] == n_seq_tokens); - GGML_ASSERT(beta->ne[2] == n_seqs); - GGML_ASSERT(ids->ne[0] == n_seqs); - GGML_ASSERT(ggml_is_vector(ids)); - } - - // Output: y (attention output) + updated hidden states - // y: {head_dim, n_head, n_seq_tokens, n_seqs} - // h_new: {head_dim, head_dim, n_head, n_seqs} - const int64_t head_dim = h->ne[0]; - const int64_t n_head = q->ne[1]; - const int64_t n_seqs = q->ne[3]; - struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, - ggml_nelements(q) + head_dim * head_dim * n_head * n_seqs); - - result->op = GGML_OP_KDA_SCAN; - result->src[0] = h; - result->src[1] = q; - result->src[2] = k; - result->src[3] = v; - result->src[4] = g; - result->src[5] = beta; - result->src[6] = ids; - - return result; -} - // ggml_win_part struct ggml_tensor * ggml_win_part( From 776294c04e506ce7d29ff1e0481ea3d371838668 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Fri, 19 Dec 2025 07:36:06 +0800 Subject: [PATCH 17/58] removed all traces of kda_scan --- ggml/include/ggml.h | 23 ----------------------- ggml/src/ggml-cpu/ggml-cpu.c | 1 - ggml/src/ggml-cuda/ggml-cuda.cu | 5 ----- ggml/src/ggml.c | 1 - 4 files changed, 30 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 888f00c2e8..48da68fe7e 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -539,7 +539,6 @@ extern "C" { GGML_OP_FLASH_ATTN_BACK, GGML_OP_SSM_CONV, GGML_OP_SSM_SCAN, - GGML_OP_KDA_SCAN, GGML_OP_WIN_PART, GGML_OP_WIN_UNPART, GGML_OP_GET_REL_POS, @@ -2338,28 +2337,6 @@ extern "C" { struct ggml_tensor * C, struct ggml_tensor * ids); - // KDA (Kimi Delta Attention) scan - // Delta attention recurrence: - // h[t] = exp(g[t]) * h[t-1] + k[t]^T * (beta[t] * (v[t] - h[t-1] @ 
k[t])) - // o[t] = q[t]^T @ h[t] - // Parameters: - // h: hidden state {head_dim, head_dim, n_head, n_seqs+} - // q: query {head_dim, n_head, n_seq_tokens, n_seqs} - // k: key {head_dim, n_head, n_seq_tokens, n_seqs} - // v: value {head_dim, n_head, n_seq_tokens, n_seqs} - // g: gate {head_dim, n_head, n_seq_tokens, n_seqs} - // beta: mixing {n_head, n_seq_tokens, n_seqs} - // ids: seq indices {n_seqs} - GGML_API struct ggml_tensor * ggml_kda_scan( - struct ggml_context * ctx, - struct ggml_tensor * h, - struct ggml_tensor * q, - struct ggml_tensor * k, - struct ggml_tensor * v, - struct ggml_tensor * g, - struct ggml_tensor * beta, - struct ggml_tensor * ids); - // partition into non-overlapping windows with padding if needed // example: // a: 768 64 64 1 diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 4cc15b0981..3247af8bb0 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -2320,7 +2320,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { case GGML_OP_FLASH_ATTN_BACK: case GGML_OP_SSM_CONV: case GGML_OP_SSM_SCAN: - case GGML_OP_KDA_SCAN: case GGML_OP_RWKV_WKV6: case GGML_OP_GATED_LINEAR_ATTN: case GGML_OP_RWKV_WKV7: diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index bbd7810dd2..eb2e273110 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -4503,11 +4503,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g return op->src[0]->ne[0] == 16 && op->src[0]->ne[1] == 1 && op->src[0]->ne[2] % 128 == 0 && op->src[4]->ne[1] == 1; } } - case GGML_OP_KDA_SCAN: { - // KDA scan kernel supports head_dim 64 or 128 - const int64_t head_dim = op->src[0]->ne[0]; - return head_dim == 64 || head_dim == 128; - } case GGML_OP_SSM_CONV: { // assumes d_inner % threads == 0 return op->src[0]->ne[1] % 128 == 0; diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index a167d6a574..173ec6b98f 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -999,7 +999,6 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "FLASH_ATTN_BACK", "SSM_CONV", "SSM_SCAN", - "KDA_SCAN", "WIN_PART", "WIN_UNPART", "GET_REL_POS", From f67a42d57297c93b742bc4a85886e367ccabbd09 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Fri, 19 Dec 2025 07:37:33 +0800 Subject: [PATCH 18/58] reduce OP count by 1 due to removal of kda_scan --- ggml/src/ggml.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 173ec6b98f..17cf4d84bb 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1024,7 +1024,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "GLU", }; -static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96"); +static_assert(GGML_OP_COUNT == 95, "GGML_OP_COUNT != 95"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -1133,7 +1133,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "glu(x)", }; -static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96"); +static_assert(GGML_OP_COUNT == 95, "GGML_OP_COUNT != 95"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); From f85e5c73b91370d605f4835c2f6112fb729006cd Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Fri, 2 Jan 2026 21:20:34 +0800 Subject: [PATCH 19/58] Move KIMI_LINEAR to llm_arch_is_hybrid to enable KV cache --- src/llama-arch.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 6aabdb7f7d..cf5ea1177f 100644 --- a/src/llama-arch.cpp +++ 
b/src/llama-arch.cpp @@ -2863,7 +2863,6 @@ bool llm_arch_is_recurrent(const llm_arch & arch) { case LLM_ARCH_RWKV6QWEN2: case LLM_ARCH_RWKV7: case LLM_ARCH_ARWKV7: - case LLM_ARCH_KIMI_LINEAR: // KDA layers use delta attention with recurrent state return true; default: return false; @@ -2880,9 +2879,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) { case LLM_ARCH_LFM2MOE: case LLM_ARCH_NEMOTRON_H: case LLM_ARCH_QWEN3NEXT: - // Kimi: Currently using recurrent-only mode since MLA doesn't use KV cache - // TODO: Enable hybrid when MLA KV caching is implemented - // case LLM_ARCH_KIMI_LINEAR: + case LLM_ARCH_KIMI_LINEAR: return true; default: return false; From 8bd617eb1c1afc0b267e0e8d72c7db3bc801db7d Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Sat, 3 Jan 2026 08:26:41 +0800 Subject: [PATCH 20/58] set n_embd_head_k/v to ensure kv cache works --- convert_hf_to_gguf.py | 114 +++++++++++++++++++++--------------------- 1 file changed, 58 insertions(+), 56 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index c6724f2ed5..0a59dc2c5f 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4987,10 +4987,65 @@ class KimiLinearModel(TextModel): _experts: list[dict[str, Tensor]] | None = None + def set_vocab(self): + try: + self._set_vocab_gpt2() + return + except Exception: + pass + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) + tokpre = self.get_vocab_base_pre(tokenizer) + + if tokpre == "kimi-k2": + # Build merges list using the approach similar to HunYuanMoE + merges = [] + vocab = {} + mergeable_ranks = tokenizer.model._mergeable_ranks + for token, rank in mergeable_ranks.items(): + vocab[QwenModel.token_bytes_to_string(token)] = rank + if len(token) == 1: + continue + merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) + if len(merged) == 2: + merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) + + # Build token list + vocab_size = self.hparams["vocab_size"] + special_tokens = tokenizer.special_tokens + reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()} + tokens: list[str] = [] + toktypes: list[int] = [] + + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.UNUSED) + else: + token = reverse_vocab[i] + tokens.append(token) + if i in special_tokens.values(): + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.NORMAL) + + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + self.gguf_writer.add_token_merges(merges) + + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) + special_vocab.add_to_gguf(self.gguf_writer) + # override eos id in config.json with tiktoken eos id + self.gguf_writer.add_eos_token_id(tokenizer.eos_id) + else: + raise NotImplementedError(f"Deepseek pre-tokenizer {tokpre!r} is not supported yet!") + def set_gguf_parameters(self): super().set_gguf_parameters() self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) - self.gguf_writer.add_leading_dense_block_count(self.hparams["first_k_dense_replace"]) self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) # Use find_hparam for context length @@ -5043,8 +5098,9 @@ class KimiLinearModel(TextModel): # Support HuggingFace naming: qk_nope_head_dim, qk_rope_head_dim, v_head_dim 
qk_nope_head_dim = self.hparams.get("qk_nope_head_dim") qk_rope_head_dim = self.hparams.get("qk_rope_head_dim") + self.gguf_writer.add_key_length(qk_nope_head_dim + qk_rope_head_dim) v_head_dim = self.hparams.get("v_head_dim") - self.gguf_writer.add_rope_dimension_count(self.hparams["qk_rope_head_dim"]) + self.gguf_writer.add_value_length(v_head_dim) # Calculate n_embd_head_k_mla = qk_nope_head_dim + qk_rope_head_dim if "n_embd_head_k_mla" in self.hparams: @@ -5106,60 +5162,6 @@ class KimiLinearModel(TextModel): if routed_scaling_factor is not None: self.gguf_writer.add_expert_weights_scale(routed_scaling_factor) - def set_vocab(self): - try: - self._set_vocab_gpt2() - return - except Exception: - pass - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) - tokpre = self.get_vocab_base_pre(tokenizer) - - if tokpre == "kimi-k2": - # Build merges list using the approach similar to HunYuanMoE - merges = [] - vocab = {} - mergeable_ranks = tokenizer.model._mergeable_ranks - for token, rank in mergeable_ranks.items(): - vocab[QwenModel.token_bytes_to_string(token)] = rank - if len(token) == 1: - continue - merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) - if len(merged) == 2: - merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) - - # Build token list - vocab_size = self.hparams["vocab_size"] - special_tokens = tokenizer.special_tokens - reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()} - tokens: list[str] = [] - toktypes: list[int] = [] - - for i in range(vocab_size): - if i not in reverse_vocab: - tokens.append(f"[PAD{i}]") - toktypes.append(gguf.TokenType.UNUSED) - else: - token = reverse_vocab[i] - tokens.append(token) - if i in special_tokens.values(): - toktypes.append(gguf.TokenType.CONTROL) - else: - toktypes.append(gguf.TokenType.NORMAL) - - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - self.gguf_writer.add_token_merges(merges) - - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) - special_vocab.add_to_gguf(self.gguf_writer) - else: - raise NotImplementedError(f"Deepseek pre-tokenizer {tokpre!r} is not supported yet!") - def prepare_tensors(self): super().prepare_tensors() if self._experts is not None: From a4020d867f55bf4721ddd1c5df713f4ade14ad49 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Sat, 3 Jan 2026 08:27:29 +0800 Subject: [PATCH 21/58] don't quantize conv1d of Kimi Linear --- src/llama-quant.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 7b8bf6e524..bae907f92c 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -869,9 +869,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight"); quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight"); - // do not quantize Mamba's small yet 2D weights + // do not quantize Mamba /Kimi's small conv1d weights // NOTE: can't use LLM_TN here because the layer number is not known - quantize &= name.find("ssm_conv1d.weight") == std::string::npos; + quantize &= name.find("ssm_conv1d") == std::string::npos; quantize &= name.find("shortconv.conv.weight") == std::string::npos; // do not quantize RWKV's small yet 2D weights From 
66c0c5d8d400296357b8a4df4438c0acd74c3a05 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Mon, 5 Jan 2026 16:35:19 +0800 Subject: [PATCH 22/58] Kimi Linear backend agnostic --- src/models/kimi-linear.cpp | 493 ++++++++++++++++++++++++++++++++----- src/models/models.h | 21 ++ 2 files changed, 450 insertions(+), 64 deletions(-) diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index d025eab5f3..32a723b80a 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -1,24 +1,35 @@ #include "models.h" +#include "ggml.h" +#include "llama-impl.h" + +#define CHUNK_SIZE 64 llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params), model(model) { ggml_tensor * cur; ggml_tensor * inpL; inpL = build_inp_embd(model.tok_embd); - + cb(inpL, "model.embed_tokens", -1); + // Note: Kimi MLA does NOT use RoPE (rotary_emb=None in vLLM) // So we don't need inp_pos - // Only use recurrent state input for KDA layers - // MLA layers use direct softmax attention without KV cache - auto * inp_rs = build_rs_inp(); - - // Input for MLA layers (no KV cache) - auto * inp_no_cache = build_attn_inp_no_cache(); + auto * inp = build_inp_mem_hybrid(); + auto * inp_rs = inp->get_recr(); + auto * inp_attn = inp->get_attn(); // Output ids for selecting which tokens to output ggml_tensor * inp_out_ids = build_inp_out_ids(); + ggml_tensor * causal_mask = + ggml_tri(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, ubatch.n_seq_tokens, ubatch.n_seq_tokens), 1.0f), + GGML_TRI_TYPE_LOWER); + + ggml_tensor * identity = ggml_diag(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, ubatch.n_seq_tokens), 1.0f)); + + ggml_build_forward_expand(gf, causal_mask); + ggml_build_forward_expand(gf, identity); + // Kimi dimension constants const int64_t n_head = hparams.n_head(); const int64_t head_dim = hparams.kda_head_dim; @@ -40,10 +51,6 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // Confirmed from tensor shape: wkv_a_mqa [2304, 576] = [n_embd, kv_lora_rank + qk_rope_head_dim] const int64_t n_embd_head_qk_rope = hparams.n_rot; // config.qk_rope_head_dim const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope; // 192 - 64 = 128 - - // Attention scale for KDA (1/sqrt(head_dim)) - const float kq_scale_kda = 1.0f / sqrtf((float)head_dim); - // Attention scale for MLA const float kq_scale_mla = 1.0f / sqrtf((float)n_embd_head_k_mla); @@ -51,6 +58,8 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll const auto & layer = model.layers[il]; ggml_tensor * inpSA = inpL; + if (!layer.attn_norm) + LLAMA_LOG_INFO("Empty attn_norm at layer %d\n", il); // Attention Norm cur = build_norm(inpL, layer.attn_norm, NULL, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); @@ -69,6 +78,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // Get conv states from r_l tensor (Q, K, V each have separate state) ggml_tensor * conv_states_all = mctx_cur->get_r_l(il); + cb(conv_states_all, "conv_states_all", il); const int64_t conv_state_size = (d_conv - 1) * d_inner; const int64_t n_embd_r_total = 3 * conv_state_size; // Q + K + V ggml_tensor * conv_state_all = build_rs(inp_rs, conv_states_all, hparams.n_embd_r(), n_seqs); @@ -143,12 +153,14 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // ggml_ssm_conv output: {d_inner, n_seq_tokens, n_seqs} Qcur = 
ggml_ssm_conv(ctx0, conv_q, conv_weight); + cb(Qcur, "Q conv1d", il); // Reshape to 2D for bias add: {d_inner, n_tokens} Qcur = ggml_reshape_2d(ctx0, Qcur, d_inner, n_tokens); if (layer.ssm_q_conv_b) { Qcur = ggml_add(ctx0, Qcur, layer.ssm_q_conv_b); } Qcur = ggml_silu(ctx0, Qcur); + cb(Qcur, "Q conv1d b", il); } else { GGML_ABORT("KDA layer missing Q conv weight"); } @@ -173,11 +185,13 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll } ggml_tensor * k_conv_weight = ggml_reshape_2d(ctx0, k_conv_f32, d_conv, d_inner); Kcur = ggml_ssm_conv(ctx0, conv_k, k_conv_weight); + cb(Kcur, "K conv1d", il); Kcur = ggml_reshape_2d(ctx0, Kcur, d_inner, n_tokens); if (layer.ssm_k_conv_b) { Kcur = ggml_add(ctx0, Kcur, layer.ssm_k_conv_b); } Kcur = ggml_silu(ctx0, Kcur); + cb(Kcur, "K conv1d b", il); } else { GGML_ABORT("KDA layer missing K conv weight"); } @@ -202,11 +216,13 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll } ggml_tensor * v_conv_weight = ggml_reshape_2d(ctx0, v_conv_f32, d_conv, d_inner); Vcur = ggml_ssm_conv(ctx0, conv_v, v_conv_weight); + cb(Vcur, "V conv1d", il); Vcur = ggml_reshape_2d(ctx0, Vcur, d_inner, n_tokens); if (layer.ssm_v_conv_b) { Vcur = ggml_add(ctx0, Vcur, layer.ssm_v_conv_b); } Vcur = ggml_silu(ctx0, Vcur); + cb(Vcur, "V conv1d b", il); } else { GGML_ABORT("KDA layer missing V conv weight"); } @@ -215,6 +231,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // g1 = -exp(A_log) * softplus(f_b(f_a(x)) + dt_bias) ggml_tensor * f_a = ggml_mul_mat(ctx0, layer.ssm_f_a, cur); ggml_tensor * g1 = ggml_mul_mat(ctx0, layer.ssm_f_b, f_a); + cb(g1, "g1 f_b(f_a(cur))", il); g1 = ggml_add(ctx0, g1, layer.ssm_dt_b); g1 = ggml_softplus(ctx0, g1); g1 = ggml_reshape_3d(ctx0, g1, head_dim, n_head, n_tokens); @@ -229,7 +246,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // Step 4: Compute beta (mixing coefficient) ggml_tensor * beta = ggml_mul_mat(ctx0, layer.ssm_beta, cur); - beta = ggml_sigmoid(ctx0, beta); + beta = ggml_cont_4d(ctx0, beta, n_head, 1, n_seq_tokens, n_seqs); cb(beta, "kda_beta", il); // Step 5: Reshape for KDA recurrence @@ -240,49 +257,56 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll Kcur = ggml_cont(ctx0, ggml_reshape_4d(ctx0, Kcur, head_dim, n_head, n_seq_tokens, n_seqs)); Vcur = ggml_cont(ctx0, ggml_reshape_4d(ctx0, Vcur, head_dim, n_head, n_seq_tokens, n_seqs)); g1 = ggml_cont(ctx0, ggml_reshape_4d(ctx0, g1, head_dim, n_head, n_seq_tokens, n_seqs)); - beta = ggml_cont(ctx0, ggml_reshape_3d(ctx0, beta, n_head, n_seq_tokens, n_seqs)); - cb(Qcur, "kda_Q", il); cb(Kcur, "kda_K", il); cb(Vcur, "kda_V", il); - + // Step 6: Get SSM state and compute KDA recurrence using ggml_kda_scan ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il); - - // Use build_rs with lambda pattern (like Mamba SSM scan) - auto get_kda_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) { - ggml_tensor * h_state = ggml_reshape_4d(ctx, states, head_dim, head_dim, n_head, mctx_cur->get_size()); - // Call ggml_kda_scan which implements the correct KDA recurrence - return ggml_kda_scan(ctx, h_state, Qcur, Kcur, Vcur, g1, beta, ids); - }; - - ggml_tensor * y_kda = build_rs(inp_rs, ssm_states_all, hparams.n_embd_s(), n_seqs, get_kda_rows); - cb(y_kda, "kda_scan_out", il); - - // Store updated state back - // y_kda contains: [attention_output (head_dim * n_head * n_seq_tokens * n_seqs), new_state (head_dim * 
head_dim * n_head * n_seqs)] - const int64_t attn_out_size = head_dim * n_head * n_seq_tokens * n_seqs; - const int64_t state_size = head_dim * head_dim * n_head; - ggml_build_forward_expand(gf, - ggml_cpy(ctx0, - ggml_view_1d(ctx0, y_kda, state_size * n_seqs, attn_out_size * ggml_element_size(y_kda)), - ggml_view_1d(ctx0, ssm_states_all, state_size * n_seqs, kv_head * state_size * ggml_element_size(ssm_states_all)))); - - // Extract attention output - ggml_tensor * attn_out = ggml_view_1d(ctx0, y_kda, attn_out_size, 0); - attn_out = ggml_reshape_3d(ctx0, attn_out, head_dim, n_head, n_seq_tokens * n_seqs); - cb(attn_out, "kda_attn_out", il); - + ggml_tensor * state = build_rs(inp_rs, ssm_states_all, hparams.n_embd_s(), n_seqs); + state = ggml_reshape_4d(ctx0, state, head_dim, head_dim, n_head, n_seqs); + // Choose between build_kda_chunking and build_kda_recurrent based on n_tokens + // TODO: Currently only build_kda_recurrent is implemented + ggml_tensor * attn_out = n_seq_tokens > CHUNK_SIZE ? + build_kda_recurrent(Qcur, Kcur, Vcur, g1, beta, state, causal_mask, identity, il) : + build_kda_recurrent(Qcur, Kcur, Vcur, g1, beta, state, causal_mask, identity, il); + cb(attn_out, "attn_out", il); + + // The tensors were concatenated 1d, so we need to extract them 1d as well + const int64_t output_flat_size = head_dim * n_head * n_seq_tokens * n_seqs; + ggml_tensor * attn_out_1d = ggml_view_1d(ctx0, attn_out, output_flat_size, 0); + cb(attn_out_1d, "attn_out_1d", il); + + ggml_tensor * attn_out_final = ggml_reshape_3d(ctx0, attn_out_1d, head_dim, n_head, n_seq_tokens * n_seqs); + cb(attn_out_final, "attn_out_reshaped", il); + // Extract the state part (second part of the concatenated tensor) + // State starts after n_tokens elements along dimension 1 + const int64_t state_flat_size = head_dim * head_dim * n_head * n_seqs; + + ggml_tensor * state_1d = + ggml_view_1d(ctx0, attn_out, state_flat_size, output_flat_size * ggml_element_size(attn_out)); + cb(state_1d, "state_1d", il); + + // Update the recurrent states + ggml_build_forward_expand(gf, + ggml_cpy(ctx0, state_1d, + ggml_view_1d(ctx0, ssm_states_all, hparams.n_embd_s() * n_seqs, + kv_head * hparams.n_embd_s() * ggml_element_size(ssm_states_all)))); + + GGML_ASSERT(ggml_nelements(attn_out_1d) + ggml_nelements(state_1d) == ggml_nelements(attn_out)); + // Step 7: Output gating g2 = g_b(g_a(x)) ggml_tensor * cur_2d = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs); ggml_tensor * g_a = ggml_mul_mat(ctx0, layer.ssm_g_a, cur_2d); ggml_tensor * g2 = ggml_mul_mat(ctx0, layer.ssm_g_b, g_a); + cb(g2, "g2 g_b(g_a(cur_2d))", il); g2 = ggml_reshape_3d(ctx0, g2, head_dim, n_head, n_seq_tokens * n_seqs); // Step 8: Apply o_norm with sigmoid gating // Note: Kimi model uses sigmoid gating, not SiLU (despite FusedRMSNormGated default being swish) // Formula: output = RMSNorm(x) * sigmoid(g) - ggml_tensor * normed = build_norm(attn_out, layer.ssm_o_norm, layer.ssm_o_norm_b, LLM_NORM_RMS, il); + ggml_tensor * normed = build_norm(attn_out_final, layer.ssm_o_norm, layer.ssm_o_norm_b, LLM_NORM_RMS, il); + cb(normed, "kda_normed", il); ggml_tensor * gate = ggml_sigmoid(ctx0, g2); ggml_tensor * gated = ggml_mul(ctx0, normed, gate); @@ -290,11 +314,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll gated = ggml_cont_2d(ctx0, gated, d_inner, n_tokens); cur = ggml_mul_mat(ctx0, layer.wo, gated); cb(cur, "kda_out", il); - - - GGML_UNUSED(d_conv); - GGML_UNUSED(kq_scale_kda); - + } else if (is_mla) { // === MLA Layer 
(Multi-head Latent Attention) without KV Cache === // Reference: vLLM mla.py @@ -308,25 +328,25 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll cb(Qcur, "mla_Q", il); // Step 2: KV compression - // kv_lora = kv_a_proj_with_mqa(hidden_states) -> [kv_lora_rank + qk_rope_head_dim, n_tokens] - ggml_tensor * kv_lora = ggml_mul_mat(ctx0, layer.wkv_a_mqa, cur); + // kv_cmpr_pe = kv_a_proj_with_mqa(hidden_states) -> [kv_lora_rank + qk_rope_head_dim, n_tokens] + ggml_tensor * kv_cmpr_pe = ggml_mul_mat(ctx0, layer.wkv_a_mqa, cur); - // Split: kv_c = kv_lora[:kv_lora_rank], k_pe = kv_lora[kv_lora_rank:] - ggml_tensor * kv_c = ggml_view_2d(ctx0, kv_lora, kv_lora_rank, n_tokens, - ggml_row_size(kv_lora->type, kv_lora_rank + n_embd_head_qk_rope), 0); - ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_lora, n_embd_head_qk_rope, 1, n_tokens, - ggml_row_size(kv_lora->type, kv_lora_rank + n_embd_head_qk_rope), - ggml_row_size(kv_lora->type, kv_lora_rank + n_embd_head_qk_rope), - ggml_row_size(kv_lora->type, kv_lora_rank)); + // Split: kv_cmpr = kv_lora[:kv_lora_rank], k_pe = kv_lora[kv_lora_rank:] + ggml_tensor * kv_cmpr = ggml_view_2d(ctx0, kv_cmpr_pe, kv_lora_rank, n_tokens, + ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope), 0); + ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_cmpr_pe, n_embd_head_qk_rope, 1, n_tokens, + ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope), + ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope), + ggml_row_size(kv_cmpr_pe->type, kv_lora_rank)); // Note: Kimi MLA does NOT apply RoPE (rotary_emb=None in vLLM) // k_pe is used directly without RoPE // Normalize kv_c - kv_c = build_norm(kv_c, layer.attn_kv_a_norm, nullptr, LLM_NORM_RMS, il); + kv_cmpr = build_norm(kv_cmpr, layer.attn_kv_a_norm, nullptr, LLM_NORM_RMS, il); // KV decompression: kv = kv_b_proj(kv_c_normed) - ggml_tensor * kv = ggml_mul_mat(ctx0, layer.wkv_b, kv_c); + ggml_tensor * kv = ggml_mul_mat(ctx0, layer.wkv_b, kv_cmpr); const int64_t kv_per_head = n_embd_head_qk_nope + n_embd_head_v_mla; // Split kv into k_nope and v @@ -344,17 +364,16 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // Concatenate k_nope + k_pe (broadcast k_pe to all heads) // K = [k_nope, k_pe] where k_nope is [qk_nope_head_dim, n_head, n_tokens] // and k_pe is [qk_rope_head_dim, 1, n_tokens] broadcast to all heads - k_pe = ggml_cont(ctx0, k_pe); // Need to broadcast k_pe from [qk_rope, 1, n_tokens] to [qk_rope, n_head, n_tokens] ggml_tensor * k_pe_target = ggml_new_tensor_3d(ctx0, k_pe->type, n_embd_head_qk_rope, n_head, n_tokens); ggml_tensor * k_pe_repeated = ggml_repeat(ctx0, k_pe, k_pe_target); ggml_tensor * Kcur = ggml_concat(ctx0, k_nope, k_pe_repeated, 0); cb(Kcur, "mla_K", il); - // Direct softmax attention (without KV cache) - // Use build_attn with inp_no_cache for proper mask handling - cur = build_attn(inp_no_cache, layer.wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale_mla, il); -// cb(cur, "mla_out", il); + // Direct softmax attention (with KV cache) + // Use build_attn with inp_attn for proper mask handling + cur = build_attn(inp_attn, layer.wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale_mla, il); + cb(cur, "mla_out", il); } else { // Unknown layer type - this should not happen @@ -435,6 +454,352 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll res->t_logits = cur; ggml_build_forward_expand(gf, cur); - - GGML_UNUSED(n_embd_head_qk_nope); } + +/* + 
IMPORTANT: Currently build_kda_chunking is not implemented nor called +*/ +ggml_tensor * llm_build_kimi_linear::build_kda_chunking( + ggml_tensor * q, + ggml_tensor * k, + ggml_tensor * v, + ggml_tensor * gk, + ggml_tensor * beta, + ggml_tensor * state, + ggml_tensor * causal_mask, + ggml_tensor * identity, + int il) { + GGML_ASSERT(ggml_is_contiguous(q)); + GGML_ASSERT(ggml_is_contiguous(k)); + GGML_ASSERT(ggml_is_contiguous(v)); + GGML_ASSERT(ggml_is_contiguous(gk)); + GGML_ASSERT(ggml_is_contiguous(beta)); + GGML_ASSERT(ggml_is_contiguous(state)); + + const int64_t S_k = q->ne[0]; + const int64_t H_k = q->ne[1]; + const int64_t n_tokens = q->ne[2]; + const int64_t n_seqs = q->ne[3]; + + const int64_t S_v = v->ne[0]; + const int64_t H_v = v->ne[1]; + + GGML_ASSERT(v->ne[2] == n_tokens); + GGML_ASSERT(k->ne[2] == n_tokens); + GGML_ASSERT(gk->ne[0] == S_v && gk->ne[1] == H_v && gk->ne[2] == n_tokens && gk->ne[3] == n_seqs); + GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs); + GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v * H_v && state->ne[2] == 1 && state->ne[3] == n_seqs); + + GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs); + GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs); + + GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case + + // TODO: can this ever be false? + const bool use_qk_l2norm = true; + + if (use_qk_l2norm) { + const float eps_norm = hparams.f_norm_rms_eps; + + q = ggml_l2_norm(ctx0, q, eps_norm); + k = ggml_l2_norm(ctx0, k, eps_norm); + } + + const float scale = 1.0f / sqrtf(S_v); + + q = ggml_scale(ctx0, q, scale); + + beta = ggml_sigmoid(ctx0, beta); + + cb(q, "q_in", il); + cb(k, "k_in", il); + cb(v, "v_in", il); + cb(beta, "beta_in", il); + cb(gk, "gk_in", il); + + q = ggml_cont_4d(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); + k = ggml_cont_4d(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); + v = ggml_cont_4d(ctx0, ggml_permute(ctx0, v, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); + gk = ggml_cont_4d(ctx0, ggml_permute(ctx0, gk, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); + + beta = ggml_cont(ctx0, ggml_permute(ctx0, beta, 2, 0, 1, 3)); + state = ggml_reshape_4d(ctx0, state, S_v, S_v, H_v, n_seqs); + + ggml_tensor * causal_diag_mask = ggml_add(ctx0, causal_mask, identity); + + cb(q, "q_perm", il); + cb(k, "k_perm", il); + cb(v, "v_perm", il); + cb(beta, "beta_perm", il); + cb(gk, "gk_perm", il); + cb(state, "state_in", il); + cb(causal_diag_mask, "causal_diag_mask", il); + + GGML_ASSERT(q->ne[1] == n_tokens && q->ne[0] == S_k && q->ne[2] == H_k && q->ne[3] == n_seqs); + GGML_ASSERT(k->ne[1] == n_tokens && k->ne[0] == S_k && k->ne[2] == H_k && k->ne[3] == n_seqs); + GGML_ASSERT(v->ne[1] == n_tokens && v->ne[0] == S_v && v->ne[2] == H_k && v->ne[3] == n_seqs); + GGML_ASSERT(beta->ne[1] == n_tokens && beta->ne[2] == H_k && beta->ne[0] == 1 && beta->ne[3] == n_seqs); + + ggml_tensor * v_beta = ggml_mul(ctx0, v, beta); + ggml_tensor * k_beta = ggml_mul(ctx0, k, beta); + + cb(k_beta, "k_beta", il); + cb(v_beta, "v_beta", il); + + return nullptr; +} + +ggml_tensor * llm_build_kimi_linear::build_kda_recurrent( + ggml_tensor * q, + ggml_tensor * k, + ggml_tensor * v, + ggml_tensor * gk, + ggml_tensor * beta, + ggml_tensor * state, + ggml_tensor * causal_mask, + ggml_tensor * identity, + int il) { + GGML_ASSERT(ggml_is_contiguous(q)); + GGML_ASSERT(ggml_is_contiguous(k)); + 
GGML_ASSERT(ggml_is_contiguous(v)); + GGML_ASSERT(ggml_is_contiguous(gk)); + GGML_ASSERT(ggml_is_contiguous(beta)); + GGML_ASSERT(ggml_is_contiguous(state)); + + const int64_t S_k = q->ne[0]; + const int64_t H_k = q->ne[1]; + const int64_t n_tokens = q->ne[2]; + const int64_t n_seqs = q->ne[3]; + + const int64_t S_v = v->ne[0]; + const int64_t H_v = v->ne[1]; + + GGML_ASSERT(v->ne[2] == n_tokens); + GGML_ASSERT(k->ne[2] == n_tokens); + GGML_ASSERT(gk->ne[0] == S_k && gk->ne[1] == H_v && gk->ne[2] == n_tokens && gk->ne[3] == n_seqs); + GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs); + GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v && state->ne[2] == H_v && state->ne[3] == n_seqs); + + GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs); + GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs); + + GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case + + // TODO: can this ever be false? + const bool use_qk_l2norm = true; + + if (use_qk_l2norm) { + const float eps_norm = hparams.f_norm_rms_eps; + + q = ggml_l2_norm(ctx0, q, eps_norm); + k = ggml_l2_norm(ctx0, k, eps_norm); + } + + const float scale = 1.0f / sqrtf(S_v); + + beta = ggml_sigmoid(ctx0, beta); + + ggml_tensor * causal_diag_mask = ggml_add(ctx0, causal_mask, identity); + + cb(q, "q_in", il); + cb(k, "k_in", il); + cb(v, "v_in", il); + cb(beta, "beta_in", il); + cb(gk, "gk_in", il); + + q = ggml_cont_4d(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); + k = ggml_cont_4d(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); + v = ggml_cont_4d(ctx0, ggml_permute(ctx0, v, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); + gk = ggml_cont_4d(ctx0, ggml_permute(ctx0, gk, 1, 2, 0, 3), n_tokens, S_k, H_k, n_seqs); + + beta = ggml_cont(ctx0, ggml_permute(ctx0, beta, 2, 0, 1, 3)); + state = ggml_reshape_4d(ctx0, state, S_v, S_v, H_v, n_seqs); + + cb(q, "q_perm", il); + cb(k, "k_perm", il); + cb(v, "v_perm", il); + cb(beta, "beta_perm", il); + cb(gk, "gk_perm", il); + cb(state, "state_in", il); + + GGML_ASSERT(q->ne[1] == n_tokens && q->ne[0] == S_k && q->ne[2] == H_k && q->ne[3] == n_seqs); + GGML_ASSERT(k->ne[1] == n_tokens && k->ne[0] == S_k && k->ne[2] == H_k && k->ne[3] == n_seqs); + GGML_ASSERT(v->ne[1] == n_tokens && v->ne[0] == S_v && v->ne[2] == H_k && v->ne[3] == n_seqs); + GGML_ASSERT(beta->ne[1] == n_tokens && beta->ne[2] == H_k && beta->ne[0] == 1 && beta->ne[3] == n_seqs); + + // ========================================================================= + // Compute cumulative sum of gk per key dimension + // gk_cumsum: [S_k, n_tokens, H_k, n_seqs] - cumsum along dim 1 (tokens) + // ========================================================================= + ggml_tensor * gk_cumsum = ggml_cumsum(ctx0, gk); + cb(gk_cumsum, "gk_cumsum", il); + + // Scale k and k_beta + + ggml_tensor * k_beta = ggml_mul(ctx0, k, beta); + ggml_tensor * v_beta = ggml_mul(ctx0, v, beta); + + cb(k_beta, "k_beta", il); + cb(v_beta, "v_beta", il); + + +/* + https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/kda/naive.py + + for i in range(T): + k_i = k[..., i, :] + g_i = g[..., i:i+1, :] + A[..., i] = torch.einsum('... c d, ... d -> ... 
c', k * (g - g_i).exp(), k_i) +*/ + const int64_t HB = H_k * n_seqs; + ggml_tensor * k_per = ggml_cont(ctx0, ggml_permute(ctx0, k, 1, 0, 2, 3)); + ggml_tensor * k_i = ggml_reshape_4d(ctx0, k_per, n_tokens, 1, S_k, HB); + ggml_tensor * k_i_bc = ggml_repeat_4d(ctx0, k_i, n_tokens, n_tokens, S_k, HB); + ggml_tensor * g_i = ggml_reshape_4d(ctx0, gk_cumsum, n_tokens, 1, S_k, HB); + ggml_tensor * g_i_bc = ggml_repeat_4d(ctx0, g_i, n_tokens, n_tokens, S_k, HB); // [S_k, chunk_size, 1, HB] -> [S_k, chunk_size, chunk_size, HB] + + ggml_tensor * k_j = ggml_reshape_4d(ctx0, k_per, 1, n_tokens, S_k, HB); + ggml_tensor * k_j_bc = ggml_repeat_4d(ctx0, k_j, n_tokens, n_tokens, S_k, HB); + + ggml_tensor * g_j = ggml_reshape_4d(ctx0, gk_cumsum, 1, n_tokens, S_k, HB); + ggml_tensor * g_j_bc = ggml_repeat_4d(ctx0, g_j, n_tokens, n_tokens, S_k, HB); // [S_k, 1, chunk_size, HB] -> [S_k, chunk_size, chunk_size, HB] + + ggml_tensor * decay_mask = ggml_sub(ctx0, g_j_bc, g_i_bc); + cb(decay_mask, "decay_mask", il); + decay_mask = ggml_mul(ctx0, decay_mask, causal_diag_mask); + decay_mask = ggml_exp(ctx0, decay_mask); + decay_mask = ggml_mul(ctx0, decay_mask, causal_diag_mask); + cb(decay_mask, "decay_mask_exp", il); + + ggml_tensor * Akk = ggml_mul(ctx0, decay_mask, k_j_bc); + Akk = ggml_mul(ctx0, Akk, k_i_bc); + + Akk = ggml_cont(ctx0, ggml_permute(ctx0, Akk, 1, 2, 0, 3)); + Akk = ggml_sum_rows(ctx0, Akk); + + Akk = ggml_reshape_4d(ctx0, Akk, n_tokens, n_tokens, H_k, n_seqs); + + Akk = ggml_mul(ctx0, Akk, beta); + Akk = ggml_neg(ctx0, ggml_mul(ctx0, Akk, causal_mask)); + + cb(Akk, "attn_pre_rec", il); + + // for i in range(1, chunk_size): + // row = attn[..., i, :i].clone() + // sub = attn[..., :i, :i].clone() + // attn[..., i, :i] = row + (row.unsqueeze(-1) * sub).sum(-2) + // attn = attn + torch.eye(chunk_size, dtype=attn.dtype, device=attn.device) + // + // We reduce this to a linear triangular solve: AX = B, where B = attn, A = I - tril(A) + ggml_tensor * attn_lower = ggml_mul(ctx0, Akk, causal_mask); + ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, identity, attn_lower), attn_lower); + + ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, Akk, true, true, false); + Akk = ggml_mul(ctx0, lin_solve, causal_mask); + Akk = ggml_add(ctx0, Akk, identity); + + gk_cumsum = ggml_cont(ctx0, ggml_permute(ctx0, gk_cumsum, 1, 0, 2, 3)); // back to [S_k, n_tokens, H_k, n_seqs] + + // u = (A*beta[..., None, :]) @ v aka U_[t] + ggml_tensor * vb = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_beta)), Akk); + cb(vb, "value_beta", il); + + // k_cumdecay = attn @ (k_beta * g.exp().unsqueeze(-1)) or W_[t] + ggml_tensor * gkexp = ggml_exp(ctx0, gk_cumsum); // [S,T,H,B] + + ggml_tensor * kbeta_gkexp = ggml_mul(ctx0, k_beta, gkexp); + cb(kbeta_gkexp, "kbeta_gkexp", il); + + ggml_tensor * k_cumdecay = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, kbeta_gkexp)), Akk); + cb(k_cumdecay, "k_cumdecay", il); + +/* + https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/kda/naive.py + + for j in range(BT): + k_j = k[:, :, i, j] + g_j = g[:, :, i, j:j+1, :] + A[..., j] = torch.einsum('... c d, ... d -> ... 
c', q_i * (g_i - g_j).exp(), k_j) +*/ + ggml_tensor * q_per = ggml_cont(ctx0, ggml_permute(ctx0, q, 1, 0, 2, 3)); + ggml_tensor * q_j = ggml_reshape_4d(ctx0, q_per, 1, n_tokens, S_k, HB); + ggml_tensor * q_j_bc = ggml_repeat_4d(ctx0, q_j, n_tokens, n_tokens, S_k, HB); + ggml_tensor * kq = ggml_mul(ctx0, decay_mask, q_j_bc); + kq = ggml_mul(ctx0, kq, k_i_bc); + kq = ggml_cont(ctx0, ggml_permute(ctx0, kq, 1, 2, 0, 3)); + + ggml_tensor * Aqk = ggml_sum_rows(ctx0, kq); + Aqk = ggml_cont(ctx0, ggml_reshape_4d(ctx0, Aqk, n_tokens, n_tokens, H_k, n_seqs)); + Aqk = ggml_mul(ctx0, Aqk, ggml_add(ctx0, identity, causal_mask)); + Aqk = ggml_scale(ctx0, Aqk, scale); // scale q + cb(Aqk, "attn_decay_key", il); + + ggml_tensor * state_t = ggml_cont(ctx0, ggml_transpose(ctx0, state)); + + // v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state or W_[t] @ S_[t] + ggml_tensor * v_prime = ggml_mul_mat(ctx0, state_t, k_cumdecay); + + cb(v_prime, "v_prime", il); + + // v_new = v_i - v_prime or U_[t] - W_[t]*S_[t] + ggml_tensor * v_new = ggml_sub(ctx0, ggml_repeat(ctx0, vb, v_prime), v_prime); + + // v_new_t [T.S.H,B] + ggml_tensor * v_new_t = ggml_cont(ctx0, ggml_transpose(ctx0, v_new)); + + cb(v_new, "v_new", il); + + // attn_inter = (q_i * g[:, :, i, :, None].exp()) @ last_recurrent_state + // or Gamma_[t]*Q_]t] @ S + ggml_tensor * q_gk_exp = ggml_mul(ctx0, q, gkexp); + ggml_tensor * attn_inter = ggml_mul_mat(ctx0, state_t, q_gk_exp); + // scale q at attn_inter as suggested in chunk_gla_fwd_kernel_o of + // github.com/fla-org/flash-linear-attention/fla/ops/gla/chunk.py + attn_inter = ggml_scale(ctx0, attn_inter, scale); // scale q + + cb(attn_inter, "attn_inter", il); + + // core_attn_out[:, :, i] = attn_inter + attn @ v_new or A' @ (U_[t] - W_[t]*S_[t]) + ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, Aqk); + + cb(v_attn, "v_attn", il); + + // o[:, :, i] = (q_i * g_i.exp()) @ S + A @ v_i + ggml_tensor * core_attn_out = ggml_add(ctx0, attn_inter, v_attn); + + cb(core_attn_out, "core_attn_out", il); + + ggml_tensor * gk_cum_last = + ggml_cont(ctx0, ggml_view_4d(ctx0, gk_cumsum, gk_cumsum->ne[0], 1, gk_cumsum->ne[2], gk_cumsum->ne[3], + gk_cumsum->nb[1], gk_cumsum->nb[2], gk_cumsum->nb[3], + gk_cumsum->nb[1] * (gk_cumsum->ne[1] - 1))); + cb(gk_cum_last, "gk_cum_last", il); + + ggml_tensor * gkexp_last = ggml_exp(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, gk_cum_last))); + cb(gkexp_last, "gkexp_last", il); + + ggml_tensor * gk_diff = ggml_neg(ctx0, ggml_sub(ctx0, gk_cumsum, gk_cum_last)); + cb(gk_diff, "gk_diff", il); + + ggml_tensor * gk_diff_exp = ggml_exp(ctx0, gk_diff); + cb(gk_diff_exp, "gk_diff_exp", il); + + ggml_tensor * key_gkdiff = ggml_mul(ctx0, k, gk_diff_exp); + cb(key_gkdiff, "key_gkdiff", il); + + // rearrange((g_i[:,:,-1:] - g_i).exp()*k_i, 'b h c k -> b h k c') @ (U_[t] - W_[t] @ S) + ggml_tensor * kgkdmulvnew = ggml_mul_mat(ctx0, v_new_t, ggml_cont(ctx0, ggml_transpose(ctx0, key_gkdiff))); + cb(kgkdmulvnew, "kgkdmulvnew", il); + + state = ggml_add(ctx0, ggml_mul(ctx0, state, gkexp_last), kgkdmulvnew); + cb(state, "new_state", il); + + // flatten output + ggml_tensor * flat_output = + ggml_cont_1d(ctx0, ggml_permute(ctx0, core_attn_out, 0, 2, 1, 3), S_v * H_v * n_tokens * n_seqs); + + ggml_tensor * flat_state = ggml_cont_1d(ctx0, state, S_v * S_v * H_v * n_seqs); + + return ggml_concat(ctx0, flat_output, flat_state, 0); +} + diff --git a/src/models/models.h b/src/models/models.h index 8b7af8d7bc..bec9d39391 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -287,6 +287,27 @@ 
struct llm_build_kimi_linear : public llm_graph_context_mamba { llm_build_kimi_linear(const llama_model & model, const llm_graph_params & params); private: const llama_model & model; + ggml_tensor * build_kda_recurrent( + ggml_tensor * q, + ggml_tensor * k, + ggml_tensor * v, + ggml_tensor * g, + ggml_tensor * beta, + ggml_tensor * state, + ggml_tensor * causal_mask, + ggml_tensor * identity, + int il); + + ggml_tensor * build_kda_chunking( + ggml_tensor * q, + ggml_tensor * k, + ggml_tensor * v, + ggml_tensor * g, + ggml_tensor * beta, + ggml_tensor * state, + ggml_tensor * causal_mask, + ggml_tensor * identity, + int il); }; struct llm_build_lfm2 : public llm_graph_context { From aba181ebadd1c860201eff4fe702a89b3c9b8a1c Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Mon, 5 Jan 2026 19:21:06 +0800 Subject: [PATCH 23/58] removed LOG_INFO --- src/models/kimi-linear.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index 32a723b80a..a943dd1dce 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -1,6 +1,5 @@ #include "models.h" #include "ggml.h" -#include "llama-impl.h" #define CHUNK_SIZE 64 @@ -58,8 +57,6 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll const auto & layer = model.layers[il]; ggml_tensor * inpSA = inpL; - if (!layer.attn_norm) - LLAMA_LOG_INFO("Empty attn_norm at layer %d\n", il); // Attention Norm cur = build_norm(inpL, layer.attn_norm, NULL, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); From cfed14e31bd02c4c4dc971f9c900b9c2d39ca6fe Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Tue, 6 Jan 2026 11:23:53 +0800 Subject: [PATCH 24/58] naive chunking form implemented --- src/models/kimi-linear.cpp | 223 +++++++++++++++++++++++++++++++++++-- 1 file changed, 214 insertions(+), 9 deletions(-) diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index a943dd1dce..3fb40471a1 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -265,7 +265,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // Choose between build_kda_chunking and build_kda_recurrent based on n_tokens // TODO: Currently only build_kda_recurrent is implemented ggml_tensor * attn_out = n_seq_tokens > CHUNK_SIZE ? 
- build_kda_recurrent(Qcur, Kcur, Vcur, g1, beta, state, causal_mask, identity, il) : + build_kda_chunking(Qcur, Kcur, Vcur, g1, beta, state, causal_mask, identity, il) : build_kda_recurrent(Qcur, Kcur, Vcur, g1, beta, state, causal_mask, identity, il); cb(attn_out, "attn_out", il); @@ -485,7 +485,7 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( GGML_ASSERT(k->ne[2] == n_tokens); GGML_ASSERT(gk->ne[0] == S_v && gk->ne[1] == H_v && gk->ne[2] == n_tokens && gk->ne[3] == n_seqs); GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs); - GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v * H_v && state->ne[2] == 1 && state->ne[3] == n_seqs); + GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v && state->ne[2] == H_v && state->ne[3] == n_seqs); GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs); GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs); @@ -504,8 +504,6 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( const float scale = 1.0f / sqrtf(S_v); - q = ggml_scale(ctx0, q, scale); - beta = ggml_sigmoid(ctx0, beta); cb(q, "q_in", il); @@ -514,8 +512,8 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( cb(beta, "beta_in", il); cb(gk, "gk_in", il); - q = ggml_cont_4d(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); - k = ggml_cont_4d(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); + q = ggml_cont_4d(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3), S_k, n_tokens, H_k, n_seqs); + k = ggml_cont_4d(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3), S_k, n_tokens, H_k, n_seqs); v = ggml_cont_4d(ctx0, ggml_permute(ctx0, v, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); gk = ggml_cont_4d(ctx0, ggml_permute(ctx0, gk, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); @@ -530,20 +528,227 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( cb(beta, "beta_perm", il); cb(gk, "gk_perm", il); cb(state, "state_in", il); - cb(causal_diag_mask, "causal_diag_mask", il); GGML_ASSERT(q->ne[1] == n_tokens && q->ne[0] == S_k && q->ne[2] == H_k && q->ne[3] == n_seqs); GGML_ASSERT(k->ne[1] == n_tokens && k->ne[0] == S_k && k->ne[2] == H_k && k->ne[3] == n_seqs); GGML_ASSERT(v->ne[1] == n_tokens && v->ne[0] == S_v && v->ne[2] == H_k && v->ne[3] == n_seqs); GGML_ASSERT(beta->ne[1] == n_tokens && beta->ne[2] == H_k && beta->ne[0] == 1 && beta->ne[3] == n_seqs); + // Do padding + const int64_t chunk_size = CHUNK_SIZE; + + const int64_t pad = (chunk_size - n_tokens % chunk_size) % chunk_size; + const int64_t n_chunks = (n_tokens + pad) / chunk_size; + + q = ggml_pad(ctx0, q, 0, pad, 0, 0); + k = ggml_pad(ctx0, k, 0, pad, 0, 0); + v = ggml_pad(ctx0, v, 0, pad, 0, 0); + gk = ggml_pad(ctx0, gk, 0, pad, 0, 0); + beta = ggml_pad(ctx0, beta, 0, pad, 0, 0); + + cb(q, "q_pad", il); + cb(k, "k_pad", il); + cb(v, "v_pad", il); + cb(beta, "beta_pad", il); + cb(gk, "gk_pad", il); + ggml_tensor * v_beta = ggml_mul(ctx0, v, beta); ggml_tensor * k_beta = ggml_mul(ctx0, k, beta); - cb(k_beta, "k_beta", il); cb(v_beta, "v_beta", il); + cb(k_beta, "k_beta", il); - return nullptr; + ggml_tensor * chunked_mask = + ggml_view_4d(ctx0, causal_mask, chunk_size, + chunk_size, causal_mask->ne[2], causal_mask->ne[3], + causal_mask->nb[1], causal_mask->nb[2], causal_mask->nb[3], 0); + + ggml_tensor * chunked_diag_mask = + ggml_view_4d(ctx0, causal_diag_mask, chunk_size, + chunk_size, causal_diag_mask->ne[2], causal_diag_mask->ne[3], + causal_diag_mask->nb[1], 
causal_diag_mask->nb[2], causal_diag_mask->nb[3], 0); + + ggml_tensor * chunked_identity = + ggml_view_4d(ctx0, identity, chunk_size, + chunk_size, identity->ne[2], identity->ne[3], + identity->nb[1], identity->nb[2], identity->nb[3], 0); + + const int64_t HB = H_k * n_seqs; + + q = ggml_cont_4d(ctx0, q, S_k, chunk_size, n_chunks, HB); + k = ggml_cont_4d(ctx0, k, S_k, chunk_size, n_chunks, HB); + k_beta = ggml_cont_4d(ctx0, k_beta, S_k, chunk_size, n_chunks, HB); + v = ggml_cont_4d(ctx0, v, S_v, chunk_size, n_chunks, HB); + v_beta = ggml_cont_4d(ctx0, v_beta, S_v, chunk_size, n_chunks, HB); + + gk = ggml_cont_4d(ctx0, gk, S_k, chunk_size, n_chunks, HB); + beta = ggml_cont_4d(ctx0, beta, 1, chunk_size, n_chunks, HB); + + // switch for cumsum + gk = ggml_cont_4d(ctx0, ggml_permute(ctx0, gk, 1, 0, 2, 3), chunk_size, S_k, n_chunks, HB); + ggml_tensor * gk_cumsum = ggml_cumsum(ctx0, gk); + cb(gk_cumsum, "gk_cumsum", il); + + const int64_t CHB = n_chunks * H_v * n_seqs; + + ggml_tensor * g_i = ggml_reshape_4d(ctx0, gk_cumsum, chunk_size, 1, S_k, CHB); + ggml_tensor * g_j = ggml_reshape_4d(ctx0, gk_cumsum, 1, chunk_size, S_k, CHB); + + ggml_tensor * g_j_bc = ggml_repeat_4d(ctx0, g_j, chunk_size, chunk_size, S_k, CHB); + + ggml_tensor * decay_mask = ggml_sub(ctx0, g_j_bc, g_i); + + cb(decay_mask, "decay_mask", il); + + decay_mask = ggml_mul(ctx0, decay_mask, chunked_diag_mask); + decay_mask = ggml_exp(ctx0, decay_mask); + decay_mask = ggml_mul(ctx0, decay_mask, chunked_diag_mask); + cb(decay_mask, "decay_mask_exp", il); + +// k [S,BT,NT,H*B] k_per [BT,S,NT,H*B] + ggml_tensor * k_per = ggml_cont(ctx0, ggml_permute(ctx0, k, 1, 0, 2, 3)); + ggml_tensor * k_i = ggml_reshape_4d(ctx0, k_per, chunk_size, 1, S_k, CHB); + ggml_tensor * k_i_bc = ggml_repeat_4d(ctx0, k_i, chunk_size, chunk_size, S_k, CHB); + ggml_tensor * k_j = ggml_reshape_4d(ctx0, k_per, 1, chunk_size, S_k, CHB); + ggml_tensor * k_j_bc = ggml_repeat_4d(ctx0, k_j, chunk_size, chunk_size, S_k, CHB); + + ggml_tensor * Akk = ggml_mul(ctx0, decay_mask, k_j_bc); + Akk = ggml_mul(ctx0, Akk, k_i_bc); + + Akk = ggml_cont(ctx0, ggml_permute(ctx0, Akk, 1, 2, 0, 3)); + Akk = ggml_sum_rows(ctx0, Akk); + + Akk = ggml_reshape_4d(ctx0, Akk, chunk_size, chunk_size, n_chunks, H_k * n_seqs); + + Akk = ggml_mul(ctx0, Akk, beta); + Akk = ggml_neg(ctx0, ggml_mul(ctx0, Akk, chunked_mask)); + + cb(Akk, "attn_pre_solve", il); + + ggml_tensor * attn_lower = ggml_mul(ctx0, Akk, chunked_mask); + ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, chunked_identity, attn_lower), attn_lower); + + ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, Akk, true, true, false); + Akk = ggml_mul(ctx0, lin_solve, chunked_mask); + Akk = ggml_add(ctx0, Akk, chunked_identity); + + cb(Akk, "attn_solved", il); + + ggml_tensor * vb = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_beta)), Akk); + + gk_cumsum = ggml_cont_4d(ctx0, ggml_permute(ctx0, gk_cumsum, 1, 0, 2, 3), S_k, chunk_size, n_chunks, HB); + ggml_tensor * gkexp = ggml_exp(ctx0, gk_cumsum); + + ggml_tensor * kbeta_gkexp = ggml_mul(ctx0, k_beta, gkexp); + cb(kbeta_gkexp, "kbeta_gkexp", il); + + ggml_tensor * k_cumdecay = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, kbeta_gkexp)), Akk); + cb(k_cumdecay, "k_cumdecay", il); + + ggml_tensor * core_attn_out = nullptr; + ggml_tensor * new_state = ggml_dup(ctx0, state); + + cb(new_state, "new_state", il); + + for (int64_t chunk = 0; chunk < n_chunks; chunk++) { +// for (int64_t chunk = 0; chunk < 1; chunk++) { +// extract one chunk worth of data + auto 
chunkify = [=](ggml_tensor * t) { + return ggml_cont(ctx0, ggml_view_4d(ctx0, t, t->ne[0], chunk_size, 1, t->ne[3], + t->nb[1], t->nb[2], t->nb[3], t->nb[2] * chunk)); + }; + +// k [S,BT,NT,H*B] => k_chunk [S,BT,1,H*B] + ggml_tensor * k_chunk = chunkify(k); + ggml_tensor * q_chunk = chunkify(q); + ggml_tensor * vb_chunk = chunkify(vb); + + // Since decay_mask now has dimension of [BT,BT,S,NT*H*B], it can't be chunkified + // decay_mask_chunk needs to be recomputed +// gk_cumsum [S,BT,NT,H*B] => gk_cs_chunk [S,BT,1,H*B] + ggml_tensor * gk_cs_chunk = chunkify(gk_cumsum); + ggml_tensor * gk_cs_chunk_i = ggml_cont(ctx0, ggml_permute(ctx0, gk_cs_chunk, 2, 0, 1, 3)); + ggml_tensor * gk_cs_chunk_j = ggml_cont(ctx0, ggml_permute(ctx0, gk_cs_chunk, 2, 1, 0, 3)); + + ggml_tensor * gk_cs_chunk_j_bc = ggml_repeat_4d(ctx0, gk_cs_chunk_j, chunk_size, chunk_size, S_k, HB); + ggml_tensor * decay_mask_chunk = ggml_sub(ctx0, gk_cs_chunk_j_bc, gk_cs_chunk_i); + cb(decay_mask_chunk, "decay_mask_chunk", il); + decay_mask_chunk = ggml_mul(ctx0, decay_mask_chunk, chunked_diag_mask); + decay_mask_chunk = ggml_exp(ctx0, decay_mask_chunk); + decay_mask_chunk = ggml_mul(ctx0, decay_mask_chunk, chunked_diag_mask); + cb(decay_mask_chunk, "decay_mask_chunk_exp", il); + + ggml_tensor * k_cumdecay_chunk = chunkify(k_cumdecay); + + ggml_tensor * gkexp_chunk = ggml_exp(ctx0, gk_cs_chunk); + + ggml_tensor * k_chunk_i = ggml_cont(ctx0, ggml_permute(ctx0, k_chunk, 2, 0, 1, 3)); + ggml_tensor * k_chunk_i_bc = ggml_repeat_4d(ctx0, k_chunk_i, chunk_size, chunk_size, S_k, HB); + ggml_tensor * q_chunk_j = ggml_cont(ctx0, ggml_permute(ctx0, q_chunk, 2, 1, 0, 3)); + ggml_tensor * q_chunk_j_bc = ggml_repeat_4d(ctx0, q_chunk_j, chunk_size, chunk_size, S_k, HB); + ggml_tensor * kq = ggml_mul(ctx0, decay_mask_chunk, q_chunk_j_bc); + kq = ggml_mul(ctx0, kq, k_chunk_i_bc); + + ggml_tensor * Aqk = ggml_mul(ctx0, kq, decay_mask_chunk); + Aqk = ggml_mul(ctx0, Aqk, ggml_add(ctx0, chunked_identity, chunked_mask)); + Aqk = ggml_cont(ctx0, ggml_permute(ctx0, Aqk, 1, 2, 0, 3)); + Aqk = ggml_sum_rows(ctx0, Aqk); + Aqk = ggml_scale(ctx0, Aqk, scale); // scale q + Aqk = ggml_reshape_4d(ctx0, Aqk, chunk_size, chunk_size, 1, HB); + + ggml_tensor * state_t = ggml_cont_4d(ctx0, ggml_permute(ctx0, new_state, 1, 0, 2, 3), S_v, S_v, 1, H_v * n_seqs); + +// new_state [S,S,1,H*B] k_cumdecay_chunk [S,BT,1,H*B] + ggml_tensor * v_prime = ggml_mul_mat(ctx0, state_t, k_cumdecay_chunk); + + ggml_tensor * v_new = ggml_sub(ctx0, ggml_repeat(ctx0, vb_chunk, v_prime), v_prime); + ggml_tensor * v_new_t = ggml_cont(ctx0, ggml_transpose(ctx0, v_new)); + +// q_chunk [S,BT,1,H*B] gkexp_chunk [S,BT,1,H*B] + ggml_tensor * q_gk_exp = ggml_mul(ctx0, q_chunk, gkexp_chunk); + ggml_tensor * attn_inter = ggml_mul_mat(ctx0, state_t, q_gk_exp); + attn_inter = ggml_scale(ctx0, attn_inter, scale); // scale q + +// v_new_t [S,BT,1,H*B] Aqk [BT,BT,1,H*B] + ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, Aqk); + + ggml_tensor * core_attn_out_chunk = ggml_add(ctx0, attn_inter, v_attn); + + core_attn_out = core_attn_out == nullptr ? 
core_attn_out_chunk : ggml_concat(ctx0, core_attn_out, core_attn_out_chunk, 1); + + ggml_tensor * gk_cum_last = + ggml_cont(ctx0, ggml_view_4d(ctx0, gk_cs_chunk, gk_cs_chunk->ne[0], 1, gk_cs_chunk->ne[2], gk_cs_chunk->ne[3], + gk_cs_chunk->nb[1], gk_cs_chunk->nb[2], gk_cs_chunk->nb[3], + gk_cs_chunk->nb[1] * (gk_cs_chunk->ne[1] - 1))); + + ggml_tensor * gkexp_last = ggml_exp(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, gk_cum_last))); + + ggml_tensor * gk_diff = ggml_neg(ctx0, ggml_sub(ctx0, gk_cs_chunk, gk_cum_last)); + + ggml_tensor * gk_diff_exp = ggml_exp(ctx0, gk_diff); + + ggml_tensor * key_gkdiff = ggml_mul(ctx0, k_chunk, gk_diff_exp); + + ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, v_new_t, ggml_cont(ctx0, ggml_transpose(ctx0, key_gkdiff))); + + new_state = ggml_add(ctx0, + ggml_mul(ctx0, new_state, ggml_reshape_4d(ctx0, gkexp_last, gkexp_last->ne[0], gkexp_last->ne[1], H_v, n_seqs)), + ggml_reshape_4d(ctx0, kgdmulvnew, kgdmulvnew->ne[0], kgdmulvnew->ne[1], H_v, n_seqs)); + } + + core_attn_out = ggml_cont_4d(ctx0, core_attn_out, S_v, chunk_size * n_chunks, H_v, n_seqs); + + ggml_tensor * output_tokens = ggml_view_4d(ctx0, core_attn_out, S_v, n_tokens, H_v, n_seqs, core_attn_out->nb[1], core_attn_out->nb[2], core_attn_out->nb[3], 0); + cb(output_tokens, "output_tokens", il); + + // flatten output + ggml_tensor * flat_output = + ggml_cont_1d(ctx0, ggml_permute(ctx0, output_tokens, 0, 2, 1, 3), S_v * H_v * n_tokens * n_seqs); + + ggml_tensor * flat_state = ggml_cont_1d(ctx0, new_state, S_v * S_v * H_v * n_seqs); + cb(new_state, "output_state", il); + + return ggml_concat(ctx0, flat_output, flat_state, 0); } ggml_tensor * llm_build_kimi_linear::build_kda_recurrent( From e3542ff8a27384dc6e25d519f0336aedab9a046b Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Tue, 6 Jan 2026 11:35:25 +0800 Subject: [PATCH 25/58] fixed some comments --- src/models/kimi-linear.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index 3fb40471a1..013926e544 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -263,7 +263,6 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll ggml_tensor * state = build_rs(inp_rs, ssm_states_all, hparams.n_embd_s(), n_seqs); state = ggml_reshape_4d(ctx0, state, head_dim, head_dim, n_head, n_seqs); // Choose between build_kda_chunking and build_kda_recurrent based on n_tokens - // TODO: Currently only build_kda_recurrent is implemented ggml_tensor * attn_out = n_seq_tokens > CHUNK_SIZE ? 
build_kda_chunking(Qcur, Kcur, Vcur, g1, beta, state, causal_mask, identity, il) : build_kda_recurrent(Qcur, Kcur, Vcur, g1, beta, state, causal_mask, identity, il); @@ -315,7 +314,6 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll } else if (is_mla) { // === MLA Layer (Multi-head Latent Attention) without KV Cache === // Reference: vLLM mla.py - // TODO: Implement proper KV caching for MLA (requires custom cache format) // Step 1: Q projection and reshape // vLLM Kimi: q = q_proj(hidden_states), then view as [n_tokens, n_head, qk_head_dim] @@ -454,7 +452,8 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll } /* - IMPORTANT: Currently build_kda_chunking is not implemented nor called + This is a ggml implementation of the naive_chunk_kda function of + https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/kda/naive.py */ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( ggml_tensor * q, From 67bee56013ae0c1c68200ddbbea80f91742828b6 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Tue, 6 Jan 2026 21:15:12 +0800 Subject: [PATCH 26/58] add Kimi-K2 specific tokens to be recognized as EOG --- src/llama-vocab.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 7af74b0218..08d559e90f 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -2203,6 +2203,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { || t.first == "<|end_of_text|>" // granite || t.first == "" || t.first == "_" + || t.first == "_" + || t.first == "[EOT]" // Kimi-K2 || t.first == "<|end▁of▁sentence|>" // DeepSeek || t.first == "" // smoldocling ) { @@ -2296,6 +2298,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { || t.first == "" || t.first == "" // Granite || t.first == "" + || t.first == "[PAD]" // Kimi-K2 ) { special_fim_pad_id = t.second; if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { @@ -2368,6 +2371,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { || t.first == "<|eom_id|>" || t.first == "" || t.first == "_" + || t.first == "[EOT]" // Kimi-K2 + || t.first == "[EOS]" // Kimi-K2 || t.first == "<|end_of_text|>" || t.first == "" // smoldocling ) { From 1099cbf694a8d5d85b6ebd0852c21b53bad2ccce Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Wed, 7 Jan 2026 18:42:31 +0800 Subject: [PATCH 27/58] build_kda_autoregressive is implemented to replace build_kda_recurrent for faster inference. 
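For reference, my reading of the single-token path (a sketch in informal notation, not lifted verbatim from the fla kda kernels; S stands for the per-head recurrent state, subscript t for the current token) is that, with q/k already l2-normalized and beta passed through a sigmoid, each decode step reduces to:

    S   = exp(g_t) * S                              (per-key decay of the state)
    S   = S + outer(beta_t * k_t, v_t - k_t @ S)    (delta-rule correction)
    o_t = (q_t / sqrt(S_v)) @ S                     (readout, scaled as in the chunked path)

so when n_seq_tokens == 1 this needs only a few small mat-muls per step and skips the padding to CHUNK_SIZE and the per-chunk loop entirely.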
sync'd to b7682 --- src/models/kimi-linear.cpp | 355 +++++++++++-------------------------- src/models/models.h | 9 +- 2 files changed, 110 insertions(+), 254 deletions(-) diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index 013926e544..270f9e6e6b 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -20,14 +20,16 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // Output ids for selecting which tokens to output ggml_tensor * inp_out_ids = build_inp_out_ids(); - ggml_tensor * causal_mask = - ggml_tri(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, ubatch.n_seq_tokens, ubatch.n_seq_tokens), 1.0f), + ggml_tensor * chunked_causal_mask = + ggml_tri(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, CHUNK_SIZE, CHUNK_SIZE), 1.0f), GGML_TRI_TYPE_LOWER); - ggml_tensor * identity = ggml_diag(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, ubatch.n_seq_tokens), 1.0f)); + ggml_tensor * chunked_identity = ggml_diag(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, CHUNK_SIZE), 1.0f)); + ggml_tensor * chunked_diag_mask = ggml_add(ctx0, chunked_causal_mask, chunked_identity); - ggml_build_forward_expand(gf, causal_mask); - ggml_build_forward_expand(gf, identity); + ggml_build_forward_expand(gf, chunked_causal_mask); + ggml_build_forward_expand(gf, chunked_identity); + ggml_build_forward_expand(gf, chunked_diag_mask); // Kimi dimension constants const int64_t n_head = hparams.n_head(); @@ -263,9 +265,9 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll ggml_tensor * state = build_rs(inp_rs, ssm_states_all, hparams.n_embd_s(), n_seqs); state = ggml_reshape_4d(ctx0, state, head_dim, head_dim, n_head, n_seqs); // Choose between build_kda_chunking and build_kda_recurrent based on n_tokens - ggml_tensor * attn_out = n_seq_tokens > CHUNK_SIZE ? - build_kda_chunking(Qcur, Kcur, Vcur, g1, beta, state, causal_mask, identity, il) : - build_kda_recurrent(Qcur, Kcur, Vcur, g1, beta, state, causal_mask, identity, il); + ggml_tensor * attn_out = n_seq_tokens == 1 ? 
+ build_kda_autoregressive(Qcur, Kcur, Vcur, g1, beta, state, il) : + build_kda_chunking(Qcur, Kcur, Vcur, g1, beta, state, chunked_causal_mask, chunked_identity, chunked_diag_mask, il); cb(attn_out, "attn_out", il); // The tensors were concatenated 1d, so we need to extract them 1d as well @@ -464,6 +466,7 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( ggml_tensor * state, ggml_tensor * causal_mask, ggml_tensor * identity, + ggml_tensor * diag_mask, int il) { GGML_ASSERT(ggml_is_contiguous(q)); GGML_ASSERT(ggml_is_contiguous(k)); @@ -519,8 +522,6 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( beta = ggml_cont(ctx0, ggml_permute(ctx0, beta, 2, 0, 1, 3)); state = ggml_reshape_4d(ctx0, state, S_v, S_v, H_v, n_seqs); - ggml_tensor * causal_diag_mask = ggml_add(ctx0, causal_mask, identity); - cb(q, "q_perm", il); cb(k, "k_perm", il); cb(v, "v_perm", il); @@ -557,21 +558,6 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( cb(v_beta, "v_beta", il); cb(k_beta, "k_beta", il); - ggml_tensor * chunked_mask = - ggml_view_4d(ctx0, causal_mask, chunk_size, - chunk_size, causal_mask->ne[2], causal_mask->ne[3], - causal_mask->nb[1], causal_mask->nb[2], causal_mask->nb[3], 0); - - ggml_tensor * chunked_diag_mask = - ggml_view_4d(ctx0, causal_diag_mask, chunk_size, - chunk_size, causal_diag_mask->ne[2], causal_diag_mask->ne[3], - causal_diag_mask->nb[1], causal_diag_mask->nb[2], causal_diag_mask->nb[3], 0); - - ggml_tensor * chunked_identity = - ggml_view_4d(ctx0, identity, chunk_size, - chunk_size, identity->ne[2], identity->ne[3], - identity->nb[1], identity->nb[2], identity->nb[3], 0); - const int64_t HB = H_k * n_seqs; q = ggml_cont_4d(ctx0, q, S_k, chunk_size, n_chunks, HB); @@ -588,6 +574,14 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( ggml_tensor * gk_cumsum = ggml_cumsum(ctx0, gk); cb(gk_cumsum, "gk_cumsum", il); +/* + https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/kda/naive.py + + for i in range(T): + k_i = k[..., i, :] + g_i = g[..., i:i+1, :] + A[..., i] = torch.einsum('... c d, ... d -> ... 
c', k * (g - g_i).exp(), k_i) +*/ const int64_t CHB = n_chunks * H_v * n_seqs; ggml_tensor * g_i = ggml_reshape_4d(ctx0, gk_cumsum, chunk_size, 1, S_k, CHB); @@ -599,9 +593,9 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( cb(decay_mask, "decay_mask", il); - decay_mask = ggml_mul(ctx0, decay_mask, chunked_diag_mask); + decay_mask = ggml_mul(ctx0, decay_mask, diag_mask); decay_mask = ggml_exp(ctx0, decay_mask); - decay_mask = ggml_mul(ctx0, decay_mask, chunked_diag_mask); + decay_mask = ggml_mul(ctx0, decay_mask, diag_mask); cb(decay_mask, "decay_mask_exp", il); // k [S,BT,NT,H*B] k_per [BT,S,NT,H*B] @@ -620,19 +614,27 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( Akk = ggml_reshape_4d(ctx0, Akk, chunk_size, chunk_size, n_chunks, H_k * n_seqs); Akk = ggml_mul(ctx0, Akk, beta); - Akk = ggml_neg(ctx0, ggml_mul(ctx0, Akk, chunked_mask)); + Akk = ggml_neg(ctx0, ggml_mul(ctx0, Akk, causal_mask)); cb(Akk, "attn_pre_solve", il); - ggml_tensor * attn_lower = ggml_mul(ctx0, Akk, chunked_mask); - ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, chunked_identity, attn_lower), attn_lower); + // for i in range(1, chunk_size): + // row = attn[..., i, :i].clone() + // sub = attn[..., :i, :i].clone() + // attn[..., i, :i] = row + (row.unsqueeze(-1) * sub).sum(-2) + // attn = attn + torch.eye(chunk_size, dtype=attn.dtype, device=attn.device) + // + // We reduce this to a linear triangular solve: AX = B, where B = attn, A = I - tril(A) + ggml_tensor * attn_lower = ggml_mul(ctx0, Akk, causal_mask); + ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, identity, attn_lower), attn_lower); ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, Akk, true, true, false); - Akk = ggml_mul(ctx0, lin_solve, chunked_mask); - Akk = ggml_add(ctx0, Akk, chunked_identity); + Akk = ggml_mul(ctx0, lin_solve, causal_mask); + Akk = ggml_add(ctx0, Akk, identity); cb(Akk, "attn_solved", il); + // u = (A*beta[..., None, :]) @ v aka U_[t] ggml_tensor * vb = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_beta)), Akk); gk_cumsum = ggml_cont_4d(ctx0, ggml_permute(ctx0, gk_cumsum, 1, 0, 2, 3), S_k, chunk_size, n_chunks, HB); @@ -650,7 +652,6 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( cb(new_state, "new_state", il); for (int64_t chunk = 0; chunk < n_chunks; chunk++) { -// for (int64_t chunk = 0; chunk < 1; chunk++) { // extract one chunk worth of data auto chunkify = [=](ggml_tensor * t) { return ggml_cont(ctx0, ggml_view_4d(ctx0, t, t->ne[0], chunk_size, 1, t->ne[3], @@ -672,15 +673,22 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( ggml_tensor * gk_cs_chunk_j_bc = ggml_repeat_4d(ctx0, gk_cs_chunk_j, chunk_size, chunk_size, S_k, HB); ggml_tensor * decay_mask_chunk = ggml_sub(ctx0, gk_cs_chunk_j_bc, gk_cs_chunk_i); cb(decay_mask_chunk, "decay_mask_chunk", il); - decay_mask_chunk = ggml_mul(ctx0, decay_mask_chunk, chunked_diag_mask); + decay_mask_chunk = ggml_mul(ctx0, decay_mask_chunk, diag_mask); decay_mask_chunk = ggml_exp(ctx0, decay_mask_chunk); - decay_mask_chunk = ggml_mul(ctx0, decay_mask_chunk, chunked_diag_mask); + decay_mask_chunk = ggml_mul(ctx0, decay_mask_chunk, diag_mask); cb(decay_mask_chunk, "decay_mask_chunk_exp", il); ggml_tensor * k_cumdecay_chunk = chunkify(k_cumdecay); ggml_tensor * gkexp_chunk = ggml_exp(ctx0, gk_cs_chunk); +/* + https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/kda/naive.py + for j in range(BT): + k_j = k[:, :, i, j] + g_j = g[:, :, i, j:j+1, :] + A[..., j] = torch.einsum('... c d, ... d -> ... 
c', q_i * (g_i - g_j).exp(), k_j) +*/ ggml_tensor * k_chunk_i = ggml_cont(ctx0, ggml_permute(ctx0, k_chunk, 2, 0, 1, 3)); ggml_tensor * k_chunk_i_bc = ggml_repeat_4d(ctx0, k_chunk_i, chunk_size, chunk_size, S_k, HB); ggml_tensor * q_chunk_j = ggml_cont(ctx0, ggml_permute(ctx0, q_chunk, 2, 1, 0, 3)); @@ -689,7 +697,7 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( kq = ggml_mul(ctx0, kq, k_chunk_i_bc); ggml_tensor * Aqk = ggml_mul(ctx0, kq, decay_mask_chunk); - Aqk = ggml_mul(ctx0, Aqk, ggml_add(ctx0, chunked_identity, chunked_mask)); + Aqk = ggml_mul(ctx0, Aqk, ggml_add(ctx0, identity, causal_mask)); Aqk = ggml_cont(ctx0, ggml_permute(ctx0, Aqk, 1, 2, 0, 3)); Aqk = ggml_sum_rows(ctx0, Aqk); Aqk = ggml_scale(ctx0, Aqk, scale); // scale q @@ -697,20 +705,26 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( ggml_tensor * state_t = ggml_cont_4d(ctx0, ggml_permute(ctx0, new_state, 1, 0, 2, 3), S_v, S_v, 1, H_v * n_seqs); -// new_state [S,S,1,H*B] k_cumdecay_chunk [S,BT,1,H*B] + // new_state [S,S,1,H*B] k_cumdecay_chunk [S,BT,1,H*B] + // v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state or W_[t] @ S_[t] ggml_tensor * v_prime = ggml_mul_mat(ctx0, state_t, k_cumdecay_chunk); + // v_new = v_i - v_prime or U_[t] - W_[t]*S_[t] ggml_tensor * v_new = ggml_sub(ctx0, ggml_repeat(ctx0, vb_chunk, v_prime), v_prime); ggml_tensor * v_new_t = ggml_cont(ctx0, ggml_transpose(ctx0, v_new)); -// q_chunk [S,BT,1,H*B] gkexp_chunk [S,BT,1,H*B] + // q_chunk [S,BT,1,H*B] gkexp_chunk [S,BT,1,H*B] + // attn_inter = (q_i * g[:, :, i, :, None].exp()) @ last_recurrent_state + // or Gamma_[t]*Q_]t] @ S ggml_tensor * q_gk_exp = ggml_mul(ctx0, q_chunk, gkexp_chunk); ggml_tensor * attn_inter = ggml_mul_mat(ctx0, state_t, q_gk_exp); attn_inter = ggml_scale(ctx0, attn_inter, scale); // scale q -// v_new_t [S,BT,1,H*B] Aqk [BT,BT,1,H*B] + // v_new_t [S,BT,1,H*B] Aqk [BT,BT,1,H*B] + // core_attn_out[:, :, i] = attn_inter + attn @ v_new or A' @ (U_[t] - W_[t]*S_[t]) ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, Aqk); + // o[:, :, i] = (q_i * g_i.exp()) @ S + A @ v_i ggml_tensor * core_attn_out_chunk = ggml_add(ctx0, attn_inter, v_attn); core_attn_out = core_attn_out == nullptr ? 
core_attn_out_chunk : ggml_concat(ctx0, core_attn_out, core_attn_out_chunk, 1); @@ -728,6 +742,7 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( ggml_tensor * key_gkdiff = ggml_mul(ctx0, k_chunk, gk_diff_exp); + // rearrange((g_i[:,:,-1:] - g_i).exp()*k_i, 'b h c k -> b h k c') @ (U_[t] - W_[t] @ S) ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, v_new_t, ggml_cont(ctx0, ggml_transpose(ctx0, key_gkdiff))); new_state = ggml_add(ctx0, @@ -750,256 +765,98 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( return ggml_concat(ctx0, flat_output, flat_state, 0); } -ggml_tensor * llm_build_kimi_linear::build_kda_recurrent( +ggml_tensor * llm_build_kimi_linear::build_kda_autoregressive( ggml_tensor * q, ggml_tensor * k, ggml_tensor * v, ggml_tensor * gk, ggml_tensor * beta, - ggml_tensor * state, - ggml_tensor * causal_mask, - ggml_tensor * identity, + ggml_tensor * state, int il) { GGML_ASSERT(ggml_is_contiguous(q)); GGML_ASSERT(ggml_is_contiguous(k)); - GGML_ASSERT(ggml_is_contiguous(v)); + GGML_ASSERT(ggml_is_contiguous(v)); GGML_ASSERT(ggml_is_contiguous(gk)); GGML_ASSERT(ggml_is_contiguous(beta)); GGML_ASSERT(ggml_is_contiguous(state)); - + const int64_t S_k = q->ne[0]; const int64_t H_k = q->ne[1]; const int64_t n_tokens = q->ne[2]; const int64_t n_seqs = q->ne[3]; - + const int64_t S_v = v->ne[0]; const int64_t H_v = v->ne[1]; - + + GGML_ASSERT(n_tokens == 1); GGML_ASSERT(v->ne[2] == n_tokens); GGML_ASSERT(k->ne[2] == n_tokens); - GGML_ASSERT(gk->ne[0] == S_k && gk->ne[1] == H_v && gk->ne[2] == n_tokens && gk->ne[3] == n_seqs); + GGML_ASSERT(gk->ne[0] == S_k && gk->ne[1] == H_k && gk->ne[2] == n_tokens && gk->ne[3] == n_seqs); GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs); - GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v && state->ne[2] == H_v && state->ne[3] == n_seqs); - + GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_k && state->ne[2] == H_v && state->ne[3] == n_seqs); + GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs); GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs); - - GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case - - // TODO: can this ever be false? 
- const bool use_qk_l2norm = true; - - if (use_qk_l2norm) { - const float eps_norm = hparams.f_norm_rms_eps; - - q = ggml_l2_norm(ctx0, q, eps_norm); - k = ggml_l2_norm(ctx0, k, eps_norm); - } - - const float scale = 1.0f / sqrtf(S_v); - - beta = ggml_sigmoid(ctx0, beta); - ggml_tensor * causal_diag_mask = ggml_add(ctx0, causal_mask, identity); + GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case + + const float eps_norm = hparams.f_norm_rms_eps; + + q = ggml_l2_norm(ctx0, q, eps_norm); + k = ggml_l2_norm(ctx0, k, eps_norm); + + const float scale = 1.0f / sqrtf(S_v); + + q = ggml_scale(ctx0, q, scale); + beta = ggml_sigmoid(ctx0, beta); cb(q, "q_in", il); cb(k, "k_in", il); cb(v, "v_in", il); cb(beta, "beta_in", il); cb(gk, "gk_in", il); - - q = ggml_cont_4d(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); - k = ggml_cont_4d(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); - v = ggml_cont_4d(ctx0, ggml_permute(ctx0, v, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); - gk = ggml_cont_4d(ctx0, ggml_permute(ctx0, gk, 1, 2, 0, 3), n_tokens, S_k, H_k, n_seqs); - - beta = ggml_cont(ctx0, ggml_permute(ctx0, beta, 2, 0, 1, 3)); - state = ggml_reshape_4d(ctx0, state, S_v, S_v, H_v, n_seqs); - - cb(q, "q_perm", il); - cb(k, "k_perm", il); - cb(v, "v_perm", il); - cb(beta, "beta_perm", il); - cb(gk, "gk_perm", il); - cb(state, "state_in", il); - - GGML_ASSERT(q->ne[1] == n_tokens && q->ne[0] == S_k && q->ne[2] == H_k && q->ne[3] == n_seqs); - GGML_ASSERT(k->ne[1] == n_tokens && k->ne[0] == S_k && k->ne[2] == H_k && k->ne[3] == n_seqs); - GGML_ASSERT(v->ne[1] == n_tokens && v->ne[0] == S_v && v->ne[2] == H_k && v->ne[3] == n_seqs); - GGML_ASSERT(beta->ne[1] == n_tokens && beta->ne[2] == H_k && beta->ne[0] == 1 && beta->ne[3] == n_seqs); - // ========================================================================= - // Compute cumulative sum of gk per key dimension - // gk_cumsum: [S_k, n_tokens, H_k, n_seqs] - cumsum along dim 1 (tokens) - // ========================================================================= - ggml_tensor * gk_cumsum = ggml_cumsum(ctx0, gk); - cb(gk_cumsum, "gk_cumsum", il); +// g [H,1,B,1] g_t [1,H,B,1] => [1,1,H,B] +// gk [S,H,1,B] => [S,1,H,B] gk_t [1,S,H,B] +// beta [H,1,1,B] beta_t [1,H,1,B] => [1,1,H,B] + gk = ggml_reshape_4d(ctx0, gk, S_k, 1, H_k, n_seqs); + ggml_tensor * gk_t = ggml_cont(ctx0, ggml_transpose(ctx0, gk)); + ggml_tensor * beta_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, beta), 1, 1, H_k, n_seqs); - // Scale k and k_beta - - ggml_tensor * k_beta = ggml_mul(ctx0, k, beta); - ggml_tensor * v_beta = ggml_mul(ctx0, v, beta); - - cb(k_beta, "k_beta", il); - cb(v_beta, "v_beta", il); + // Apply exponential to gk_t + gk_t = ggml_exp(ctx0, gk_t); + // Apply the gated delta rule for the single timestep + // last_recurrent_state = last_recurrent_state * gk_t + // S = S * g_i[..., None].exp() + state = ggml_mul(ctx0, state, gk_t); - -/* - https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/kda/naive.py - - for i in range(T): - k_i = k[..., i, :] - g_i = g[..., i:i+1, :] - A[..., i] = torch.einsum('... c d, ... d -> ... 
c', k * (g - g_i).exp(), k_i) -*/ - const int64_t HB = H_k * n_seqs; - ggml_tensor * k_per = ggml_cont(ctx0, ggml_permute(ctx0, k, 1, 0, 2, 3)); - ggml_tensor * k_i = ggml_reshape_4d(ctx0, k_per, n_tokens, 1, S_k, HB); - ggml_tensor * k_i_bc = ggml_repeat_4d(ctx0, k_i, n_tokens, n_tokens, S_k, HB); - ggml_tensor * g_i = ggml_reshape_4d(ctx0, gk_cumsum, n_tokens, 1, S_k, HB); - ggml_tensor * g_i_bc = ggml_repeat_4d(ctx0, g_i, n_tokens, n_tokens, S_k, HB); // [S_k, chunk_size, 1, HB] -> [S_k, chunk_size, chunk_size, HB] - - ggml_tensor * k_j = ggml_reshape_4d(ctx0, k_per, 1, n_tokens, S_k, HB); - ggml_tensor * k_j_bc = ggml_repeat_4d(ctx0, k_j, n_tokens, n_tokens, S_k, HB); - - ggml_tensor * g_j = ggml_reshape_4d(ctx0, gk_cumsum, 1, n_tokens, S_k, HB); - ggml_tensor * g_j_bc = ggml_repeat_4d(ctx0, g_j, n_tokens, n_tokens, S_k, HB); // [S_k, 1, chunk_size, HB] -> [S_k, chunk_size, chunk_size, HB] - - ggml_tensor * decay_mask = ggml_sub(ctx0, g_j_bc, g_i_bc); - cb(decay_mask, "decay_mask", il); - decay_mask = ggml_mul(ctx0, decay_mask, causal_diag_mask); - decay_mask = ggml_exp(ctx0, decay_mask); - decay_mask = ggml_mul(ctx0, decay_mask, causal_diag_mask); - cb(decay_mask, "decay_mask_exp", il); - - ggml_tensor * Akk = ggml_mul(ctx0, decay_mask, k_j_bc); - Akk = ggml_mul(ctx0, Akk, k_i_bc); - - Akk = ggml_cont(ctx0, ggml_permute(ctx0, Akk, 1, 2, 0, 3)); - Akk = ggml_sum_rows(ctx0, Akk); - - Akk = ggml_reshape_4d(ctx0, Akk, n_tokens, n_tokens, H_k, n_seqs); - - Akk = ggml_mul(ctx0, Akk, beta); - Akk = ggml_neg(ctx0, ggml_mul(ctx0, Akk, causal_mask)); - - cb(Akk, "attn_pre_rec", il); - - // for i in range(1, chunk_size): - // row = attn[..., i, :i].clone() - // sub = attn[..., :i, :i].clone() - // attn[..., i, :i] = row + (row.unsqueeze(-1) * sub).sum(-2) - // attn = attn + torch.eye(chunk_size, dtype=attn.dtype, device=attn.device) - // - // We reduce this to a linear triangular solve: AX = B, where B = attn, A = I - tril(A) - ggml_tensor * attn_lower = ggml_mul(ctx0, Akk, causal_mask); - ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, identity, attn_lower), attn_lower); - - ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, Akk, true, true, false); - Akk = ggml_mul(ctx0, lin_solve, causal_mask); - Akk = ggml_add(ctx0, Akk, identity); - - gk_cumsum = ggml_cont(ctx0, ggml_permute(ctx0, gk_cumsum, 1, 0, 2, 3)); // back to [S_k, n_tokens, H_k, n_seqs] - - // u = (A*beta[..., None, :]) @ v aka U_[t] - ggml_tensor * vb = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_beta)), Akk); - cb(vb, "value_beta", il); - - // k_cumdecay = attn @ (k_beta * g.exp().unsqueeze(-1)) or W_[t] - ggml_tensor * gkexp = ggml_exp(ctx0, gk_cumsum); // [S,T,H,B] - - ggml_tensor * kbeta_gkexp = ggml_mul(ctx0, k_beta, gkexp); - cb(kbeta_gkexp, "kbeta_gkexp", il); - - ggml_tensor * k_cumdecay = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, kbeta_gkexp)), Akk); - cb(k_cumdecay, "k_cumdecay", il); - -/* - https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/kda/naive.py - - for j in range(BT): - k_j = k[:, :, i, j] - g_j = g[:, :, i, j:j+1, :] - A[..., j] = torch.einsum('... c d, ... d -> ... 
c', q_i * (g_i - g_j).exp(), k_j) -*/ - ggml_tensor * q_per = ggml_cont(ctx0, ggml_permute(ctx0, q, 1, 0, 2, 3)); - ggml_tensor * q_j = ggml_reshape_4d(ctx0, q_per, 1, n_tokens, S_k, HB); - ggml_tensor * q_j_bc = ggml_repeat_4d(ctx0, q_j, n_tokens, n_tokens, S_k, HB); - ggml_tensor * kq = ggml_mul(ctx0, decay_mask, q_j_bc); - kq = ggml_mul(ctx0, kq, k_i_bc); - kq = ggml_cont(ctx0, ggml_permute(ctx0, kq, 1, 2, 0, 3)); - - ggml_tensor * Aqk = ggml_sum_rows(ctx0, kq); - Aqk = ggml_cont(ctx0, ggml_reshape_4d(ctx0, Aqk, n_tokens, n_tokens, H_k, n_seqs)); - Aqk = ggml_mul(ctx0, Aqk, ggml_add(ctx0, identity, causal_mask)); - Aqk = ggml_scale(ctx0, Aqk, scale); // scale q - cb(Aqk, "attn_decay_key", il); - ggml_tensor * state_t = ggml_cont(ctx0, ggml_transpose(ctx0, state)); - - // v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state or W_[t] @ S_[t] - ggml_tensor * v_prime = ggml_mul_mat(ctx0, state_t, k_cumdecay); - cb(v_prime, "v_prime", il); +// state [S,S,H,B] k [S,1,H,B] k_state [S_v,1,H,B] + k = ggml_reshape_4d(ctx0, k, S_k, 1, H_k, n_seqs); + ggml_tensor * k_state = ggml_mul_mat(ctx0, state_t, k); - // v_new = v_i - v_prime or U_[t] - W_[t]*S_[t] - ggml_tensor * v_new = ggml_sub(ctx0, ggml_repeat(ctx0, vb, v_prime), v_prime); + // v_i - (k_i[..., None] * S).sum(-2) + v = ggml_reshape_4d(ctx0, v, S_v, 1, H_v, n_seqs); + ggml_tensor * v_diff = ggml_sub(ctx0, v, k_state); - // v_new_t [T.S.H,B] - ggml_tensor * v_new_t = ggml_cont(ctx0, ggml_transpose(ctx0, v_new)); + // b_i[..., None] * k_i + ggml_tensor * k_beta = ggml_mul(ctx0, k, beta_t); - cb(v_new, "v_new", il); + // S = S + torch.einsum('b h k, b h v -> b h k v', b_i[..., None] * k_i, v_i - (k_i[..., None] * S).sum(-2)) + // v_diff_t [1,S_v,H,B] k_beta_t [1,S_k,H,B] state [S_v,S_k,H,B] + state = ggml_add(ctx0, state, ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_diff)), ggml_cont(ctx0, ggml_transpose(ctx0, k_beta)))); - // attn_inter = (q_i * g[:, :, i, :, None].exp()) @ last_recurrent_state - // or Gamma_[t]*Q_]t] @ S - ggml_tensor * q_gk_exp = ggml_mul(ctx0, q, gkexp); - ggml_tensor * attn_inter = ggml_mul_mat(ctx0, state_t, q_gk_exp); - // scale q at attn_inter as suggested in chunk_gla_fwd_kernel_o of - // github.com/fla-org/flash-linear-attention/fla/ops/gla/chunk.py - attn_inter = ggml_scale(ctx0, attn_inter, scale); // scale q - - cb(attn_inter, "attn_inter", il); - - // core_attn_out[:, :, i] = attn_inter + attn @ v_new or A' @ (U_[t] - W_[t]*S_[t]) - ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, Aqk); - - cb(v_attn, "v_attn", il); - - // o[:, :, i] = (q_i * g_i.exp()) @ S + A @ v_i - ggml_tensor * core_attn_out = ggml_add(ctx0, attn_inter, v_attn); - - cb(core_attn_out, "core_attn_out", il); - - ggml_tensor * gk_cum_last = - ggml_cont(ctx0, ggml_view_4d(ctx0, gk_cumsum, gk_cumsum->ne[0], 1, gk_cumsum->ne[2], gk_cumsum->ne[3], - gk_cumsum->nb[1], gk_cumsum->nb[2], gk_cumsum->nb[3], - gk_cumsum->nb[1] * (gk_cumsum->ne[1] - 1))); - cb(gk_cum_last, "gk_cum_last", il); - - ggml_tensor * gkexp_last = ggml_exp(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, gk_cum_last))); - cb(gkexp_last, "gkexp_last", il); - - ggml_tensor * gk_diff = ggml_neg(ctx0, ggml_sub(ctx0, gk_cumsum, gk_cum_last)); - cb(gk_diff, "gk_diff", il); - - ggml_tensor * gk_diff_exp = ggml_exp(ctx0, gk_diff); - cb(gk_diff_exp, "gk_diff_exp", il); - - ggml_tensor * key_gkdiff = ggml_mul(ctx0, k, gk_diff_exp); - cb(key_gkdiff, "key_gkdiff", il); - - // rearrange((g_i[:,:,-1:] - g_i).exp()*k_i, 'b h c k -> b h k c') @ (U_[t] - W_[t] @ S) - ggml_tensor * 
kgkdmulvnew = ggml_mul_mat(ctx0, v_new_t, ggml_cont(ctx0, ggml_transpose(ctx0, key_gkdiff))); - cb(kgkdmulvnew, "kgkdmulvnew", il); - - state = ggml_add(ctx0, ggml_mul(ctx0, state, gkexp_last), kgkdmulvnew); + q = ggml_reshape_4d(ctx0, q, S_k, 1, H_k, n_seqs); + state_t = ggml_cont(ctx0, ggml_transpose(ctx0, state)); + ggml_tensor * core_attn_out = ggml_mul_mat(ctx0, state_t, q); + // core_attn_out should be [S_v, 1, H_v, n_seqs] after this + cb(core_attn_out, "output_tokens", il); cb(state, "new_state", il); - // flatten output - ggml_tensor * flat_output = - ggml_cont_1d(ctx0, ggml_permute(ctx0, core_attn_out, 0, 2, 1, 3), S_v * H_v * n_tokens * n_seqs); - - ggml_tensor * flat_state = ggml_cont_1d(ctx0, state, S_v * S_v * H_v * n_seqs); + // flatten output, no need to permute since n_tokens is 1 so [S_v, 1, H_v, n_seqs] and [S_v, H_v, 1, n_seqs] are equivalent memory-layout wise + ggml_tensor * flat_output = ggml_reshape_1d(ctx0, core_attn_out, S_v * H_v * n_tokens * n_seqs); + ggml_tensor * flat_state = ggml_reshape_1d(ctx0, state, S_v * S_v * H_v * n_seqs); return ggml_concat(ctx0, flat_output, flat_state, 0); } diff --git a/src/models/models.h b/src/models/models.h index ba2b905c5e..3ed00aae32 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -288,26 +288,25 @@ struct llm_build_kimi_linear : public llm_graph_context_mamba { llm_build_kimi_linear(const llama_model & model, const llm_graph_params & params); private: const llama_model & model; - ggml_tensor * build_kda_recurrent( + ggml_tensor * build_kda_autoregressive( ggml_tensor * q, ggml_tensor * k, ggml_tensor * v, - ggml_tensor * g, + ggml_tensor * gk, ggml_tensor * beta, ggml_tensor * state, - ggml_tensor * causal_mask, - ggml_tensor * identity, int il); ggml_tensor * build_kda_chunking( ggml_tensor * q, ggml_tensor * k, ggml_tensor * v, - ggml_tensor * g, + ggml_tensor * gk, ggml_tensor * beta, ggml_tensor * state, ggml_tensor * causal_mask, ggml_tensor * identity, + ggml_tensor * diag_mask, int il); }; From f99913dd5fb15027839d7986bd13b1e5dc4cb60f Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Thu, 8 Jan 2026 13:40:17 +0800 Subject: [PATCH 28/58] replaced Akk and Aqk with mul_mat and clamp --- src/models/kimi-linear.cpp | 105 +++++++++++++++---------------------- 1 file changed, 43 insertions(+), 62 deletions(-) diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index 270f9e6e6b..b229d31165 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -571,48 +571,40 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( // switch for cumsum gk = ggml_cont_4d(ctx0, ggml_permute(ctx0, gk, 1, 0, 2, 3), chunk_size, S_k, n_chunks, HB); + cb(gk, "gk", il); ggml_tensor * gk_cumsum = ggml_cumsum(ctx0, gk); cb(gk_cumsum, "gk_cumsum", il); + // switch back for downstream + gk_cumsum = ggml_cont_4d(ctx0, ggml_permute(ctx0, gk_cumsum, 1, 0, 2, 3), S_k, chunk_size, n_chunks, HB); + ggml_tensor * gkexp = ggml_exp(ctx0, gk_cumsum); + + cb(gk_cumsum, "gk_cumsum", il); /* - https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/kda/naive.py - - for i in range(T): - k_i = k[..., i, :] - g_i = g[..., i:i+1, :] + for i in range(BT): + k_i = k[..., i, :] # k_i [B,H,NT,S] + g_i = g[..., i:i+1, :] # g_i [B,H,NT,1,S] A[..., i] = torch.einsum('... c d, ... d -> ... 
c', k * (g - g_i).exp(), k_i) */ - const int64_t CHB = n_chunks * H_v * n_seqs; + // gk_ref: [S, 1, C, HB] - first token of i_block + ggml_tensor * gk_ref = ggml_view_4d(ctx0, gk_cumsum, + S_k, 1, n_chunks, HB, + gk_cumsum->nb[1], gk_cumsum->nb[2], gk_cumsum->nb[3], + 0); + cb(gk_ref, "gk_ref", il); - ggml_tensor * g_i = ggml_reshape_4d(ctx0, gk_cumsum, chunk_size, 1, S_k, CHB); - ggml_tensor * g_j = ggml_reshape_4d(ctx0, gk_cumsum, 1, chunk_size, S_k, CHB); - - ggml_tensor * g_j_bc = ggml_repeat_4d(ctx0, g_j, chunk_size, chunk_size, S_k, CHB); - - ggml_tensor * decay_mask = ggml_sub(ctx0, g_j_bc, g_i); - - cb(decay_mask, "decay_mask", il); - - decay_mask = ggml_mul(ctx0, decay_mask, diag_mask); - decay_mask = ggml_exp(ctx0, decay_mask); - decay_mask = ggml_mul(ctx0, decay_mask, diag_mask); - cb(decay_mask, "decay_mask_exp", il); - -// k [S,BT,NT,H*B] k_per [BT,S,NT,H*B] - ggml_tensor * k_per = ggml_cont(ctx0, ggml_permute(ctx0, k, 1, 0, 2, 3)); - ggml_tensor * k_i = ggml_reshape_4d(ctx0, k_per, chunk_size, 1, S_k, CHB); - ggml_tensor * k_i_bc = ggml_repeat_4d(ctx0, k_i, chunk_size, chunk_size, S_k, CHB); - ggml_tensor * k_j = ggml_reshape_4d(ctx0, k_per, 1, chunk_size, S_k, CHB); - ggml_tensor * k_j_bc = ggml_repeat_4d(ctx0, k_j, chunk_size, chunk_size, S_k, CHB); - - ggml_tensor * Akk = ggml_mul(ctx0, decay_mask, k_j_bc); - Akk = ggml_mul(ctx0, Akk, k_i_bc); - - Akk = ggml_cont(ctx0, ggml_permute(ctx0, Akk, 1, 2, 0, 3)); - Akk = ggml_sum_rows(ctx0, Akk); - - Akk = ggml_reshape_4d(ctx0, Akk, chunk_size, chunk_size, n_chunks, H_k * n_seqs); + // Compute gk_diff + ggml_tensor * gk_diff_j = ggml_sub(ctx0, gk_cumsum, ggml_repeat(ctx0, gk_ref, gk_cumsum)); + ggml_tensor * gk_diff_i = ggml_clamp(ctx0, ggml_neg(ctx0, gk_diff_j), 0.0f, 88.0f); + cb(gk_diff_j, "gk_diff_j", il); + cb(gk_diff_i, "gk_diff_i", il); + // Decay k + ggml_tensor * k_exp_j = ggml_mul(ctx0, k, ggml_exp(ctx0, gk_diff_j)); + ggml_tensor * k_exp_i = ggml_mul(ctx0, k, ggml_exp(ctx0, gk_diff_i)); + ggml_tensor * Akk = ggml_mul_mat(ctx0, k_exp_i, k_exp_j); + cb(Akk, "Akk", il); + Akk = ggml_mul(ctx0, Akk, beta); Akk = ggml_neg(ctx0, ggml_mul(ctx0, Akk, causal_mask)); @@ -637,9 +629,6 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( // u = (A*beta[..., None, :]) @ v aka U_[t] ggml_tensor * vb = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_beta)), Akk); - gk_cumsum = ggml_cont_4d(ctx0, ggml_permute(ctx0, gk_cumsum, 1, 0, 2, 3), S_k, chunk_size, n_chunks, HB); - ggml_tensor * gkexp = ggml_exp(ctx0, gk_cumsum); - ggml_tensor * kbeta_gkexp = ggml_mul(ctx0, k_beta, gkexp); cb(kbeta_gkexp, "kbeta_gkexp", il); @@ -663,23 +652,9 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( ggml_tensor * q_chunk = chunkify(q); ggml_tensor * vb_chunk = chunkify(vb); - // Since decay_mask now has dimension of [BT,BT,S,NT*H*B], it can't be chunkified - // decay_mask_chunk needs to be recomputed // gk_cumsum [S,BT,NT,H*B] => gk_cs_chunk [S,BT,1,H*B] ggml_tensor * gk_cs_chunk = chunkify(gk_cumsum); - ggml_tensor * gk_cs_chunk_i = ggml_cont(ctx0, ggml_permute(ctx0, gk_cs_chunk, 2, 0, 1, 3)); - ggml_tensor * gk_cs_chunk_j = ggml_cont(ctx0, ggml_permute(ctx0, gk_cs_chunk, 2, 1, 0, 3)); - - ggml_tensor * gk_cs_chunk_j_bc = ggml_repeat_4d(ctx0, gk_cs_chunk_j, chunk_size, chunk_size, S_k, HB); - ggml_tensor * decay_mask_chunk = ggml_sub(ctx0, gk_cs_chunk_j_bc, gk_cs_chunk_i); - cb(decay_mask_chunk, "decay_mask_chunk", il); - decay_mask_chunk = ggml_mul(ctx0, decay_mask_chunk, diag_mask); - decay_mask_chunk = ggml_exp(ctx0, 
decay_mask_chunk); - decay_mask_chunk = ggml_mul(ctx0, decay_mask_chunk, diag_mask); - cb(decay_mask_chunk, "decay_mask_chunk_exp", il); - ggml_tensor * k_cumdecay_chunk = chunkify(k_cumdecay); - ggml_tensor * gkexp_chunk = ggml_exp(ctx0, gk_cs_chunk); /* https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/kda/naive.py @@ -689,19 +664,25 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( g_j = g[:, :, i, j:j+1, :] A[..., j] = torch.einsum('... c d, ... d -> ... c', q_i * (g_i - g_j).exp(), k_j) */ - ggml_tensor * k_chunk_i = ggml_cont(ctx0, ggml_permute(ctx0, k_chunk, 2, 0, 1, 3)); - ggml_tensor * k_chunk_i_bc = ggml_repeat_4d(ctx0, k_chunk_i, chunk_size, chunk_size, S_k, HB); - ggml_tensor * q_chunk_j = ggml_cont(ctx0, ggml_permute(ctx0, q_chunk, 2, 1, 0, 3)); - ggml_tensor * q_chunk_j_bc = ggml_repeat_4d(ctx0, q_chunk_j, chunk_size, chunk_size, S_k, HB); - ggml_tensor * kq = ggml_mul(ctx0, decay_mask_chunk, q_chunk_j_bc); - kq = ggml_mul(ctx0, kq, k_chunk_i_bc); + ggml_tensor * gk_ref_chunk = ggml_view_4d(ctx0, gk_cs_chunk, + S_k, 1, 1, HB, + gk_cs_chunk->nb[1], gk_cs_chunk->nb[2], gk_cs_chunk->nb[3], + 0); + // Compute gk_diff + ggml_tensor * gk_diff_chunk_j = ggml_sub(ctx0, gk_cs_chunk, ggml_repeat(ctx0, gk_ref_chunk, gk_cs_chunk)); + ggml_tensor * gk_diff_chunk_i = ggml_clamp(ctx0, ggml_neg(ctx0, gk_diff_chunk_j), 0.0f, 88.0f); + cb(gk_diff_chunk_j, "gk_diff_chunk_j", il); + cb(gk_diff_chunk_i, "gk_diff_chunk_i", il); - ggml_tensor * Aqk = ggml_mul(ctx0, kq, decay_mask_chunk); - Aqk = ggml_mul(ctx0, Aqk, ggml_add(ctx0, identity, causal_mask)); - Aqk = ggml_cont(ctx0, ggml_permute(ctx0, Aqk, 1, 2, 0, 3)); - Aqk = ggml_sum_rows(ctx0, Aqk); + // Decay q and k + ggml_tensor * q_exp_chunk = ggml_mul(ctx0, q_chunk, ggml_exp(ctx0, gk_diff_chunk_j)); + ggml_tensor * k_exp_chunk = ggml_mul(ctx0, k_chunk, ggml_exp(ctx0, gk_diff_chunk_i)); + + ggml_tensor * Aqk = ggml_mul_mat(ctx0, k_exp_chunk, q_exp_chunk); + cb(Aqk, "Aqk", il); + Aqk = ggml_mul(ctx0, Aqk, diag_mask); Aqk = ggml_scale(ctx0, Aqk, scale); // scale q - Aqk = ggml_reshape_4d(ctx0, Aqk, chunk_size, chunk_size, 1, HB); + cb(Aqk, "Aqk_masked", il); ggml_tensor * state_t = ggml_cont_4d(ctx0, ggml_permute(ctx0, new_state, 1, 0, 2, 3), S_v, S_v, 1, H_v * n_seqs); From 6150bb7b17fe15a7b8cac2c26f3da9e38dc72b5d Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Fri, 9 Jan 2026 20:11:45 +0800 Subject: [PATCH 29/58] no clamp version --- src/models/kimi-linear.cpp | 77 +++++++++++++++++++++----------------- 1 file changed, 43 insertions(+), 34 deletions(-) diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index b229d31165..93a4983c66 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -574,11 +574,6 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( cb(gk, "gk", il); ggml_tensor * gk_cumsum = ggml_cumsum(ctx0, gk); cb(gk_cumsum, "gk_cumsum", il); - // switch back for downstream - gk_cumsum = ggml_cont_4d(ctx0, ggml_permute(ctx0, gk_cumsum, 1, 0, 2, 3), S_k, chunk_size, n_chunks, HB); - ggml_tensor * gkexp = ggml_exp(ctx0, gk_cumsum); - - cb(gk_cumsum, "gk_cumsum", il); /* for i in range(BT): @@ -586,23 +581,31 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( g_i = g[..., i:i+1, :] # g_i [B,H,NT,1,S] A[..., i] = torch.einsum('... c d, ... d -> ... 
c', k * (g - g_i).exp(), k_i) */ - // gk_ref: [S, 1, C, HB] - first token of i_block - ggml_tensor * gk_ref = ggml_view_4d(ctx0, gk_cumsum, - S_k, 1, n_chunks, HB, - gk_cumsum->nb[1], gk_cumsum->nb[2], gk_cumsum->nb[3], - 0); - cb(gk_ref, "gk_ref", il); + const int64_t CHB = n_chunks * H_k * n_seqs; + ggml_tensor * gkcs_i = ggml_reshape_4d(ctx0, gk_cumsum, chunk_size, 1, S_k, CHB); // [chunk_size, 1, S_k, CHB] + ggml_tensor * gkcs_j = ggml_reshape_4d(ctx0, gkcs_i, 1, chunk_size, S_k, CHB); // [1, chunk_size, S_k, CHB] - // Compute gk_diff - ggml_tensor * gk_diff_j = ggml_sub(ctx0, gk_cumsum, ggml_repeat(ctx0, gk_ref, gk_cumsum)); - ggml_tensor * gk_diff_i = ggml_clamp(ctx0, ggml_neg(ctx0, gk_diff_j), 0.0f, 88.0f); - cb(gk_diff_j, "gk_diff_j", il); - cb(gk_diff_i, "gk_diff_i", il); + ggml_tensor * gkcs_j_bc = ggml_repeat_4d(ctx0, gkcs_j, chunk_size, chunk_size, S_k, CHB); // [1, chunk_size, S_k, CHB] -> [chunk_size, chunk_size, S_k, CHB] + // decay_mask [chunk_size,chunk_size,S_k,CHB] + ggml_tensor * decay_mask = ggml_sub(ctx0, gkcs_j_bc, gkcs_i); + cb(decay_mask, "decay_mask", il); - // Decay k - ggml_tensor * k_exp_j = ggml_mul(ctx0, k, ggml_exp(ctx0, gk_diff_j)); - ggml_tensor * k_exp_i = ggml_mul(ctx0, k, ggml_exp(ctx0, gk_diff_i)); - ggml_tensor * Akk = ggml_mul_mat(ctx0, k_exp_i, k_exp_j); + decay_mask = ggml_mul(ctx0, decay_mask, diag_mask); + cb(decay_mask, "decay_masked", il); + decay_mask = ggml_exp(ctx0, decay_mask); + decay_mask = ggml_mul(ctx0, decay_mask, diag_mask); + + // decay_mask [S_k,BT_j,BT_i,CHB] *Note* second and third chunk_sizes are switched + decay_mask = ggml_cont_4d(ctx0, ggml_permute(ctx0, decay_mask, 2, 1, 0, 3), S_k, chunk_size, chunk_size, CHB); + + ggml_tensor * k_i = ggml_cont(ctx0, ggml_reshape_4d(ctx0, k, S_k, chunk_size, 1, CHB)); + ggml_tensor * k_j = ggml_cont(ctx0, ggml_reshape_4d(ctx0, k, S_k, 1, chunk_size, CHB)); + + ggml_tensor * decay_k_i = ggml_mul(ctx0, decay_mask, k_i); + + // decay_k_i [S.BT,BT,CHB] @ k_j [S,1,BT,CHB] = Akk [BT,1,BT,CHB] + ggml_tensor * Akk = ggml_mul_mat(ctx0, k_j, decay_k_i); + Akk = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_4d(ctx0, Akk, chunk_size, chunk_size, n_chunks, HB))); cb(Akk, "Akk", il); Akk = ggml_mul(ctx0, Akk, beta); @@ -626,6 +629,11 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( cb(Akk, "attn_solved", il); + // switch back for downstream + gk_cumsum = ggml_cont_4d(ctx0, ggml_permute(ctx0, gk_cumsum, 1, 0, 2, 3), S_k, chunk_size, n_chunks, HB); + ggml_tensor * gkexp = ggml_exp(ctx0, gk_cumsum); + cb(gk_cumsum, "gk_cumsum", il); + // u = (A*beta[..., None, :]) @ v aka U_[t] ggml_tensor * vb = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_beta)), Akk); @@ -640,12 +648,19 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( cb(new_state, "new_state", il); + // switch for chunkify_mask + decay_mask = ggml_cont(ctx0, ggml_reshape_4d(ctx0, decay_mask, S_k, chunk_size * chunk_size, n_chunks, HB)); for (int64_t chunk = 0; chunk < n_chunks; chunk++) { // extract one chunk worth of data auto chunkify = [=](ggml_tensor * t) { return ggml_cont(ctx0, ggml_view_4d(ctx0, t, t->ne[0], chunk_size, 1, t->ne[3], t->nb[1], t->nb[2], t->nb[3], t->nb[2] * chunk)); }; + auto chunkify_mask = [=](ggml_tensor * t) { + return ggml_cont(ctx0, ggml_view_4d(ctx0, t, t->ne[0], chunk_size*chunk_size, 1, t->ne[3], + t->nb[1], t->nb[2], t->nb[3], t->nb[2] * chunk)); + }; + // k [S,BT,NT,H*B] => k_chunk [S,BT,1,H*B] ggml_tensor * k_chunk = chunkify(k); @@ -656,6 +671,8 @@ ggml_tensor * 
llm_build_kimi_linear::build_kda_chunking( ggml_tensor * gk_cs_chunk = chunkify(gk_cumsum); ggml_tensor * k_cumdecay_chunk = chunkify(k_cumdecay); ggml_tensor * gkexp_chunk = ggml_exp(ctx0, gk_cs_chunk); + ggml_tensor * decay_mask_chunk = chunkify_mask(decay_mask); + decay_mask_chunk = ggml_cont(ctx0, ggml_reshape_4d(ctx0, decay_mask_chunk, S_k, chunk_size, chunk_size, HB)); /* https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/kda/naive.py @@ -664,22 +681,14 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( g_j = g[:, :, i, j:j+1, :] A[..., j] = torch.einsum('... c d, ... d -> ... c', q_i * (g_i - g_j).exp(), k_j) */ - ggml_tensor * gk_ref_chunk = ggml_view_4d(ctx0, gk_cs_chunk, - S_k, 1, 1, HB, - gk_cs_chunk->nb[1], gk_cs_chunk->nb[2], gk_cs_chunk->nb[3], - 0); - // Compute gk_diff - ggml_tensor * gk_diff_chunk_j = ggml_sub(ctx0, gk_cs_chunk, ggml_repeat(ctx0, gk_ref_chunk, gk_cs_chunk)); - ggml_tensor * gk_diff_chunk_i = ggml_clamp(ctx0, ggml_neg(ctx0, gk_diff_chunk_j), 0.0f, 88.0f); - cb(gk_diff_chunk_j, "gk_diff_chunk_j", il); - cb(gk_diff_chunk_i, "gk_diff_chunk_i", il); + ggml_tensor * k_j_chunk = ggml_cont(ctx0, ggml_reshape_4d(ctx0, k_chunk, S_k, 1, chunk_size, HB)); + ggml_tensor * q_i_chunk = ggml_cont(ctx0, ggml_reshape_4d(ctx0, q_chunk, S_k, chunk_size, 1, HB)); + ggml_tensor * decay_q_i_chunk = ggml_mul(ctx0, decay_mask_chunk, q_i_chunk); - // Decay q and k - ggml_tensor * q_exp_chunk = ggml_mul(ctx0, q_chunk, ggml_exp(ctx0, gk_diff_chunk_j)); - ggml_tensor * k_exp_chunk = ggml_mul(ctx0, k_chunk, ggml_exp(ctx0, gk_diff_chunk_i)); - - ggml_tensor * Aqk = ggml_mul_mat(ctx0, k_exp_chunk, q_exp_chunk); + ggml_tensor * Aqk = ggml_mul_mat(ctx0, decay_q_i_chunk, k_j_chunk); + Aqk = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_4d(ctx0, Aqk, chunk_size, chunk_size, 1, HB))); cb(Aqk, "Aqk", il); + Aqk = ggml_mul(ctx0, Aqk, diag_mask); Aqk = ggml_scale(ctx0, Aqk, scale); // scale q cb(Aqk, "Aqk_masked", il); From d26fe501786ed8e06d2b63f2d59f5daceaecfe87 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Sat, 10 Jan 2026 08:45:57 +0800 Subject: [PATCH 30/58] Moved Aqk computation out of the loop --- src/models/kimi-linear.cpp | 49 ++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 28 deletions(-) diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index 93a4983c66..c55116bc69 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -576,10 +576,17 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( cb(gk_cumsum, "gk_cumsum", il); /* + Compute Akk and Aqk loop together + Akk loop: for i in range(BT): k_i = k[..., i, :] # k_i [B,H,NT,S] g_i = g[..., i:i+1, :] # g_i [B,H,NT,1,S] A[..., i] = torch.einsum('... c d, ... d -> ... c', k * (g - g_i).exp(), k_i) + Aqk loop: + for j in range(BT): + k_j = k[:, :, i, j] + g_j = g[:, :, i, j:j+1, :] + A[..., j] = torch.einsum('... c d, ... d -> ... 
c', q_i * (g_i - g_j).exp(), k_j) */ const int64_t CHB = n_chunks * H_k * n_seqs; ggml_tensor * gkcs_i = ggml_reshape_4d(ctx0, gk_cumsum, chunk_size, 1, S_k, CHB); // [chunk_size, 1, S_k, CHB] @@ -600,19 +607,27 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( ggml_tensor * k_i = ggml_cont(ctx0, ggml_reshape_4d(ctx0, k, S_k, chunk_size, 1, CHB)); ggml_tensor * k_j = ggml_cont(ctx0, ggml_reshape_4d(ctx0, k, S_k, 1, chunk_size, CHB)); + ggml_tensor * q_i = ggml_cont(ctx0, ggml_reshape_4d(ctx0, q, S_k, chunk_size, 1, CHB)); ggml_tensor * decay_k_i = ggml_mul(ctx0, decay_mask, k_i); + ggml_tensor * decay_q_i = ggml_mul(ctx0, decay_mask, q_i); // decay_k_i [S.BT,BT,CHB] @ k_j [S,1,BT,CHB] = Akk [BT,1,BT,CHB] - ggml_tensor * Akk = ggml_mul_mat(ctx0, k_j, decay_k_i); + ggml_tensor * Akk = ggml_mul_mat(ctx0, decay_k_i, k_j); + ggml_tensor * Aqk = ggml_mul_mat(ctx0, decay_q_i, k_j); Akk = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_4d(ctx0, Akk, chunk_size, chunk_size, n_chunks, HB))); + Aqk = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_4d(ctx0, Aqk, chunk_size, chunk_size, n_chunks, HB))); cb(Akk, "Akk", il); + cb(Aqk, "Aqk", il); Akk = ggml_mul(ctx0, Akk, beta); Akk = ggml_neg(ctx0, ggml_mul(ctx0, Akk, causal_mask)); - cb(Akk, "attn_pre_solve", il); + Aqk = ggml_mul(ctx0, Aqk, diag_mask); + Aqk = ggml_scale(ctx0, Aqk, scale); // scale q + cb(Aqk, "Aqk_masked", il); + // for i in range(1, chunk_size): // row = attn[..., i, :i].clone() // sub = attn[..., :i, :i].clone() @@ -648,16 +663,14 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( cb(new_state, "new_state", il); - // switch for chunkify_mask - decay_mask = ggml_cont(ctx0, ggml_reshape_4d(ctx0, decay_mask, S_k, chunk_size * chunk_size, n_chunks, HB)); for (int64_t chunk = 0; chunk < n_chunks; chunk++) { // extract one chunk worth of data auto chunkify = [=](ggml_tensor * t) { return ggml_cont(ctx0, ggml_view_4d(ctx0, t, t->ne[0], chunk_size, 1, t->ne[3], t->nb[1], t->nb[2], t->nb[3], t->nb[2] * chunk)); }; - auto chunkify_mask = [=](ggml_tensor * t) { - return ggml_cont(ctx0, ggml_view_4d(ctx0, t, t->ne[0], chunk_size*chunk_size, 1, t->ne[3], + auto chunkify_A = [=](ggml_tensor * t) { + return ggml_cont(ctx0, ggml_view_4d(ctx0, t, chunk_size, chunk_size, 1, t->ne[3], t->nb[1], t->nb[2], t->nb[3], t->nb[2] * chunk)); }; @@ -671,27 +684,7 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( ggml_tensor * gk_cs_chunk = chunkify(gk_cumsum); ggml_tensor * k_cumdecay_chunk = chunkify(k_cumdecay); ggml_tensor * gkexp_chunk = ggml_exp(ctx0, gk_cs_chunk); - ggml_tensor * decay_mask_chunk = chunkify_mask(decay_mask); - decay_mask_chunk = ggml_cont(ctx0, ggml_reshape_4d(ctx0, decay_mask_chunk, S_k, chunk_size, chunk_size, HB)); -/* - https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/kda/naive.py - - for j in range(BT): - k_j = k[:, :, i, j] - g_j = g[:, :, i, j:j+1, :] - A[..., j] = torch.einsum('... c d, ... d -> ... 
c', q_i * (g_i - g_j).exp(), k_j) -*/ - ggml_tensor * k_j_chunk = ggml_cont(ctx0, ggml_reshape_4d(ctx0, k_chunk, S_k, 1, chunk_size, HB)); - ggml_tensor * q_i_chunk = ggml_cont(ctx0, ggml_reshape_4d(ctx0, q_chunk, S_k, chunk_size, 1, HB)); - ggml_tensor * decay_q_i_chunk = ggml_mul(ctx0, decay_mask_chunk, q_i_chunk); - - ggml_tensor * Aqk = ggml_mul_mat(ctx0, decay_q_i_chunk, k_j_chunk); - Aqk = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_4d(ctx0, Aqk, chunk_size, chunk_size, 1, HB))); - cb(Aqk, "Aqk", il); - - Aqk = ggml_mul(ctx0, Aqk, diag_mask); - Aqk = ggml_scale(ctx0, Aqk, scale); // scale q - cb(Aqk, "Aqk_masked", il); + ggml_tensor * Aqk_chunk = chunkify_A(Aqk); ggml_tensor * state_t = ggml_cont_4d(ctx0, ggml_permute(ctx0, new_state, 1, 0, 2, 3), S_v, S_v, 1, H_v * n_seqs); @@ -712,7 +705,7 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( // v_new_t [S,BT,1,H*B] Aqk [BT,BT,1,H*B] // core_attn_out[:, :, i] = attn_inter + attn @ v_new or A' @ (U_[t] - W_[t]*S_[t]) - ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, Aqk); + ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, Aqk_chunk); // o[:, :, i] = (q_i * g_i.exp()) @ S + A @ v_i ggml_tensor * core_attn_out_chunk = ggml_add(ctx0, attn_inter, v_attn); From dce064c0a3882b20d1b2a78c01cc181afa461e67 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Sat, 10 Jan 2026 22:08:38 +0800 Subject: [PATCH 31/58] fixed typo and split wkv_b into wk_b and wv_b --- convert_hf_to_gguf.py | 25 +++++++++++++++++++++++-- gguf-py/gguf/constants.py | 2 ++ gguf-py/gguf/tensor_mapping.py | 4 ++-- 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 59ee156dd9..321930d7e6 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5275,7 +5275,8 @@ class KimiLinearModel(TextModel): # Kimi specific bias if name.endswith("e_score_correction_bias"): - name = name.replace("e_score_correction_bias", "e_score_correction.bias") + new_name = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_EXP_PROBS_B, bid) + return [(new_name, data_torch)] # process the experts separately if name.find("block_sparse_moe.experts") != -1: @@ -5305,7 +5306,27 @@ class KimiLinearModel(TextModel): tensors.append((new_name, data_torch)) return tensors return [] - + + # note: MLA with the absorption optimization, needs these two split and k_b_proj transposed + if name.endswith("kv_b_proj.weight"): + name_kb = name.replace("kv_b_proj", "k_b_proj") + name_vb = name.replace("kv_b_proj", "v_b_proj") + + n_head_kv = self.hparams["num_key_value_heads"] + v_head_dim = self.hparams["v_head_dim"] + qk_nope_head_dim = self.hparams["qk_nope_head_dim"] + + assert data_torch.shape[0] == n_head_kv * (v_head_dim + qk_nope_head_dim) + + kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1]) + k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1) + k_b = k_b.transpose(1, 2) + + return [ + (self.map_tensor_name(name_kb), k_b), + (self.map_tensor_name(name_vb), v_b) + ] + mapped_name = self.map_tensor_name(name) logger.info(f"Returning {mapped_name}: shape after = {tuple(data_torch.shape)}") return [(mapped_name, data_torch)] diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 372489ca44..8d2b54d7d5 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -3317,6 +3317,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.ATTN_Q_B, MODEL_TENSOR.ATTN_KV_A_MQA, MODEL_TENSOR.ATTN_KV_B, + MODEL_TENSOR.ATTN_K_B, + 
MODEL_TENSOR.ATTN_V_B, MODEL_TENSOR.ATTN_Q_A_NORM, MODEL_TENSOR.ATTN_KV_A_NORM, MODEL_TENSOR.FFN_NORM, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index b0d4fb1cb1..486f6a5b1d 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -403,7 +403,7 @@ class TensorNameMap: "model.layers.{bid}.feed_forward.expert_bias", # lfm2moe "model.layers.{bid}.block_sparse_moe.e_score_correction", # minimax-m2 "backbone.layers.{bid}.mixer.gate.e_score_correction" # nemotron-h-moe - "model.layers.{bid}.block_sparse_moe.gate.e_score_correction", # kimi + "model.layers.{bid}.block_sparse_moe.gate.e_score_correction_bias", # kimi ), # Feed-forward up @@ -812,7 +812,7 @@ class TensorNameMap: ), MODEL_TENSOR.SSM_DT_B: ( "model.layers.{bid}.self_attn.dt_bias", - + ), MODEL_TENSOR.TIME_MIX_W0: ( "model.layers.{bid}.attention.w0", # rwkv7 ), From b9360c7fe194e8190e1ee8b9da258699d7666e17 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Sun, 11 Jan 2026 15:58:46 +0800 Subject: [PATCH 32/58] MLA KV cache support --- convert_hf_to_gguf.py | 14 +++-- src/llama-arch.cpp | 2 + src/llama-model.cpp | 8 ++- src/models/kimi-linear.cpp | 112 ++++++++++++++++++++++++++----------- 4 files changed, 95 insertions(+), 41 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 321930d7e6..3f402a9acb 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5118,6 +5118,9 @@ class KimiLinearModel(TextModel): raise NotImplementedError(f"Deepseek pre-tokenizer {tokpre!r} is not supported yet!") def set_gguf_parameters(self): + # note: To enable MLA KV cache, attention needs to be converted into MQA (ie: GQA with 1 group) + self.hparams["num_key_value_heads"] = 1 + super().set_gguf_parameters() self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) @@ -5141,7 +5144,7 @@ class KimiLinearModel(TextModel): _full_attn_layers = linear_attn_config["full_attn_layers"] for il in range(self.hparams["num_hidden_layers"]): if il+1 in _full_attn_layers: - _num_kv_heads.append(linear_attn_config["num_heads"]) + _num_kv_heads.append(self.hparams["num_key_value_heads"]) else: _num_kv_heads.append(0) assert(len(_num_kv_heads) == self.hparams["num_hidden_layers"]) @@ -5156,8 +5159,6 @@ class KimiLinearModel(TextModel): if kda_head_dim is not None: self.gguf_writer.add_kda_head_dim(kda_head_dim) - # MLA params - use add_* methods that handle arch substitution - # MLA params - use add_* methods that handle arch substitution # Support both HuggingFace naming (q_lora_rank, kv_lora_rank) and internal naming (n_lora_q, n_lora_kv) q_lora_rank = self.hparams.get("q_lora_rank", self.hparams.get("n_lora_q")) @@ -5172,9 +5173,11 @@ class KimiLinearModel(TextModel): # Support HuggingFace naming: qk_nope_head_dim, qk_rope_head_dim, v_head_dim qk_nope_head_dim = self.hparams.get("qk_nope_head_dim") qk_rope_head_dim = self.hparams.get("qk_rope_head_dim") - self.gguf_writer.add_key_length(qk_nope_head_dim + qk_rope_head_dim) v_head_dim = self.hparams.get("v_head_dim") - self.gguf_writer.add_value_length(v_head_dim) + # To enable MLA KV cache, MLA needs to be converted into MQA with larger heads, then decompresses to MHA + self.gguf_writer.add_key_length(self.hparams["kv_lora_rank"] + self.hparams["qk_rope_head_dim"]) + self.gguf_writer.add_value_length(self.hparams["kv_lora_rank"]) + # Calculate n_embd_head_k_mla = qk_nope_head_dim + qk_rope_head_dim if "n_embd_head_k_mla" in self.hparams: @@ 
-5315,6 +5318,7 @@ class KimiLinearModel(TextModel): n_head_kv = self.hparams["num_key_value_heads"] v_head_dim = self.hparams["v_head_dim"] qk_nope_head_dim = self.hparams["qk_nope_head_dim"] + logger.info("Split kv_b n_head_kv %d\n" % n_head_kv) assert data_torch.shape[0] == n_head_kv * (v_head_dim + qk_nope_head_dim) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 076509ed8e..6baf3bd4da 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -2312,6 +2312,8 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_ATTN_Q_A_NORM, LLM_TENSOR_ATTN_KV_A_MQA, LLM_TENSOR_ATTN_KV_B, + LLM_TENSOR_ATTN_K_B, + LLM_TENSOR_ATTN_V_B, LLM_TENSOR_ATTN_KV_A_NORM, }; default: diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 59e8d49f08..712c341fd5 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -6771,8 +6771,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // Note: hparams.n_rot may be 72 (from conversion) but actual is 64 const int64_t qk_rope_head_dim = hparams.n_rot; // From config: qk_rope_head_dim layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + qk_rope_head_dim}, 0); - layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_k_mla - qk_rope_head_dim + n_embd_head_v_mla)}, 0); - + // Support Legacy GGUFs that don't split wkv_b (MLA KV cache disabled) + layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_k_mla - qk_rope_head_dim + n_embd_head_v_mla)}, TENSOR_NOT_REQUIRED); + if (!layer.wkv_b) { // MLA KV cache enabled + layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_k_mla - qk_rope_head_dim, kv_lora_rank, n_head}, 0); + layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, 0); + } layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, 0); } diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index c55116bc69..9d83ca8fa5 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -321,9 +321,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // vLLM Kimi: q = q_proj(hidden_states), then view as [n_tokens, n_head, qk_head_dim] // Note: Kimi MLA does NOT use RoPE (rotary_emb=None in vLLM) ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.wq, cur); - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head_k_mla, n_head, n_tokens); - cb(Qcur, "mla_Q", il); - + // Step 2: KV compression // kv_cmpr_pe = kv_a_proj_with_mqa(hidden_states) -> [kv_lora_rank + qk_rope_head_dim, n_tokens] ggml_tensor * kv_cmpr_pe = ggml_mul_mat(ctx0, layer.wkv_a_mqa, cur); @@ -341,37 +339,83 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // Normalize kv_c kv_cmpr = build_norm(kv_cmpr, layer.attn_kv_a_norm, nullptr, LLM_NORM_RMS, il); - - // KV decompression: kv = kv_b_proj(kv_c_normed) - ggml_tensor * kv = ggml_mul_mat(ctx0, layer.wkv_b, kv_cmpr); - const int64_t kv_per_head = n_embd_head_qk_nope + n_embd_head_v_mla; - - // Split kv into k_nope and v - ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens, - ggml_row_size(kv->type, kv_per_head), - ggml_row_size(kv->type, kv_per_head * n_head), 0); - ggml_tensor * Vcur = ggml_view_3d(ctx0, kv, n_embd_head_v_mla, n_head, n_tokens, - ggml_row_size(kv->type, kv_per_head), - ggml_row_size(kv->type, kv_per_head * n_head), - 
ggml_row_size(kv->type, n_embd_head_qk_nope)); - k_nope = ggml_cont(ctx0, k_nope); - Vcur = ggml_cont(ctx0, Vcur); - cb(Vcur, "mla_V", il); - - // Concatenate k_nope + k_pe (broadcast k_pe to all heads) - // K = [k_nope, k_pe] where k_nope is [qk_nope_head_dim, n_head, n_tokens] - // and k_pe is [qk_rope_head_dim, 1, n_tokens] broadcast to all heads - // Need to broadcast k_pe from [qk_rope, 1, n_tokens] to [qk_rope, n_head, n_tokens] - ggml_tensor * k_pe_target = ggml_new_tensor_3d(ctx0, k_pe->type, n_embd_head_qk_rope, n_head, n_tokens); - ggml_tensor * k_pe_repeated = ggml_repeat(ctx0, k_pe, k_pe_target); - ggml_tensor * Kcur = ggml_concat(ctx0, k_nope, k_pe_repeated, 0); - cb(Kcur, "mla_K", il); - - // Direct softmax attention (with KV cache) - // Use build_attn with inp_attn for proper mask handling - cur = build_attn(inp_attn, layer.wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale_mla, il); - cb(cur, "mla_out", il); - + + if (layer.wk_b && layer.wv_b) { // MLA KV cache enabled + // extract q_nope + ggml_tensor * q_nope = + ggml_view_3d(ctx0, Qcur, n_embd_head_qk_nope, n_head, n_tokens, ggml_row_size(Qcur->type, n_embd_head_k_mla), + ggml_row_size(Qcur->type, n_embd_head_k_mla) * n_head, 0); + cb(q_nope, "q_nope", il); + + // and {n_embd_head_qk_rope, n_head, n_tokens} + ggml_tensor * q_pe = ggml_view_3d( + ctx0, Qcur, n_embd_head_qk_rope, n_head, n_tokens, ggml_row_size(Qcur->type, n_embd_head_k_mla), + ggml_row_size(Qcur->type, n_embd_head_k_mla) * n_head, ggml_row_size(Qcur->type, n_embd_head_qk_nope)); + cb(q_pe, "q_pe", il); + + // {n_embd_head_qk_nope, n_tokens, n_head} + q_nope = ggml_permute(ctx0, q_nope, 0, 2, 1, 3); + cb(q_nope, "q_nope_perm", il); + + // {n_embd_head_qk_nope, kv_lora_rank, n_head} x {n_embd_head_qk_nope, n_tokens, n_head} + ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, layer.wk_b, q_nope); + cb(q_nope_absorbed, "q_nope_absorbed", il); + + // {kv_lora_rank, n_head, n_tokens} + q_nope_absorbed = ggml_permute(ctx0, q_nope_absorbed, 0, 2, 1, 3); + cb(q_nope_absorbed, "q_nope_absorbed_perm", il); + + // {n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens} + // note: rope must go first for in-place context shifting in build_rope_shift() + Qcur = ggml_concat(ctx0, q_pe, q_nope_absorbed, 0); + cb(Qcur, "Qcur", il); + + kv_cmpr = ggml_reshape_3d(ctx0, kv_cmpr, kv_lora_rank, 1, n_tokens); + cb(kv_cmpr, "kv_cmpr_reshape", il); + + // {n_embd_head_qk_rope + kv_lora_rank, 1, n_tokens} + ggml_tensor * Kcur = ggml_concat(ctx0, k_pe, kv_cmpr, 0); + cb(Kcur, "Kcur", il); + + // {kv_lora_rank, 1, n_tokens} + ggml_tensor * Vcur = kv_cmpr; + cb(Vcur, "Vcur", il); + + cur = build_attn(inp_attn, layer.wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, layer.wv_b, kq_scale_mla, il); + cb(cur, "mla_out", il); + } else { // MLA KV cache disabled. Fall back to MHA KV cache. 
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head_k_mla, n_head, n_tokens); + cb(Qcur, "mla_Q", il); + // KV decompression: kv = kv_b_proj(kv_c_normed) + ggml_tensor * kv = ggml_mul_mat(ctx0, layer.wkv_b, kv_cmpr); + const int64_t kv_per_head = n_embd_head_qk_nope + n_embd_head_v_mla; + + // Split kv into k_nope and v + ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens, + ggml_row_size(kv->type, kv_per_head), + ggml_row_size(kv->type, kv_per_head * n_head), 0); + ggml_tensor * Vcur = ggml_view_3d(ctx0, kv, n_embd_head_v_mla, n_head, n_tokens, + ggml_row_size(kv->type, kv_per_head), + ggml_row_size(kv->type, kv_per_head * n_head), + ggml_row_size(kv->type, n_embd_head_qk_nope)); + k_nope = ggml_cont(ctx0, k_nope); + Vcur = ggml_cont(ctx0, Vcur); + cb(Vcur, "mla_V", il); + + // Concatenate k_nope + k_pe (broadcast k_pe to all heads) + // K = [k_nope, k_pe] where k_nope is [qk_nope_head_dim, n_head, n_tokens] + // and k_pe is [qk_rope_head_dim, 1, n_tokens] broadcast to all heads + // Need to broadcast k_pe from [qk_rope, 1, n_tokens] to [qk_rope, n_head, n_tokens] + ggml_tensor * k_pe_target = ggml_new_tensor_3d(ctx0, k_pe->type, n_embd_head_qk_rope, n_head, n_tokens); + ggml_tensor * k_pe_repeated = ggml_repeat(ctx0, k_pe, k_pe_target); + ggml_tensor * Kcur = ggml_concat(ctx0, k_nope, k_pe_repeated, 0); + cb(Kcur, "mla_K", il); + + // Direct softmax attention (with MHA KV cache) + // Use build_attn with inp_attn for proper mask handling + cur = build_attn(inp_attn, layer.wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale_mla, il); + cb(cur, "mla_out", il); + } } else { // Unknown layer type - this should not happen GGML_ABORT("Kimi layer is neither KDA nor MLA - missing required tensors"); From 6ae66fc40dcbd9562ef71ebe5cd3a7bc9686e385 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Sun, 11 Jan 2026 21:31:35 +0800 Subject: [PATCH 33/58] fix trailing spaces --- gguf-py/gguf/tensor_mapping.py | 4 +- src/llama-vocab.cpp | 4 +- src/models/kimi-linear.cpp | 82 ++++++++++++++++------------------ 3 files changed, 42 insertions(+), 48 deletions(-) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 88e2caf541..c4957a7b20 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -819,13 +819,13 @@ class TensorNameMap: # Kimi Linear KDA (using SSM_ prefix for consistency) MODEL_TENSOR.SSM_CONV1D_Q: ( "model.layers.{bid}.self_attn.q_conv1d", - ), + ), MODEL_TENSOR.SSM_CONV1D_K: ( "model.layers.{bid}.self_attn.k_conv1d", ), MODEL_TENSOR.SSM_CONV1D_V: ( "model.layers.{bid}.self_attn.v_conv1d", - ), + ), MODEL_TENSOR.SSM_F_A: ( "model.layers.{bid}.self_attn.f_a_proj", ), diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index eaa574f3b8..f7a264dc60 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -1747,7 +1747,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str()); // Kimi-K2 uses custom tokenization without traditional BPE merges const bool is_kimi_k2 = (tokenizer_pre == "kimi-k2"); - + if (merges_keyidx == -1) { if (!is_kimi_k2) { throw std::runtime_error("cannot find tokenizer merges in model file\n"); @@ -1768,7 +1768,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { if (pos != std::string::npos) { first = word.substr(0, pos); second = word.substr(pos + 1); - } + } bpe_ranks.emplace(std::make_pair(first, second), i); } diff --git 
a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index 9d83ca8fa5..e873024c90 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -12,7 +12,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // Note: Kimi MLA does NOT use RoPE (rotary_emb=None in vLLM) // So we don't need inp_pos - + auto * inp = build_inp_mem_hybrid(); auto * inp_rs = inp->get_recr(); auto * inp_attn = inp->get_attn(); @@ -38,12 +38,12 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll const int64_t d_inner = n_head * head_dim; // 32 * 128 = 4096 const int64_t n_seqs = ubatch.n_seqs; const int64_t n_seq_tokens = ubatch.n_seq_tokens; - + // Verify batch consistency for recurrent layers GGML_ASSERT(n_seqs != 0); GGML_ASSERT(ubatch.equal_seqs()); GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); - + // MLA params const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla; const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla; @@ -67,14 +67,13 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // KDA layers have ssm_a_log tensor, MLA layers have wkv_a_mqa tensor bool is_kda = (layer.ssm_a_log != nullptr); bool is_mla = (layer.wkv_a_mqa != nullptr); - + if (is_kda) { // === KDA Layer (Kimi Delta Attention) with Recurrent State === // Reference: vLLM kda.py - const auto * mctx_cur = inp_rs->mctx; const auto kv_head = mctx_cur->get_head(); - + // Get conv states from r_l tensor (Q, K, V each have separate state) ggml_tensor * conv_states_all = mctx_cur->get_r_l(il); cb(conv_states_all, "conv_states_all", il); @@ -85,7 +84,6 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // Each conv state is [(d_conv-1) * d_inner] per sequence, need to reshape to [d_conv-1, d_inner, n_seqs] // Memory layout: for each seq, Q state is first conv_state_size elements, then K, then V // conv_state_all has stride: nb[0] = element_size, nb[1] = n_embd_r_total * element_size - // View Q conv state: offset 0, size conv_state_size per seq // conv_state_all is [n_embd_r_total, n_seqs] with memory layout: // state[i + seq * n_embd_r_total] where i = conv_step + channel * (d_conv-1) + {0, conv_state_size, 2*conv_state_size} for Q/K/V @@ -104,7 +102,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll (d_conv - 1) * ggml_element_size(conv_state_all), n_embd_r_total * ggml_element_size(conv_state_all), 2 * conv_state_size * ggml_element_size(conv_state_all)); // offset for V - + // Step 1: Q, K, V projections -> [d_inner, n_tokens] ggml_tensor * q_proj = ggml_mul_mat(ctx0, layer.wq, cur); ggml_tensor * k_proj = ggml_mul_mat(ctx0, layer.wk, cur); @@ -112,14 +110,14 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll cb(q_proj, "kda_q_proj", il); cb(k_proj, "kda_k_proj", il); cb(v_proj, "kda_v_proj", il); - + // Step 2: Causal Conv1d for Q // Reshape input: {d_inner, n_tokens} -> {d_inner, n_seq_tokens, n_seqs} ggml_tensor * q_3d = ggml_reshape_3d(ctx0, q_proj, d_inner, n_seq_tokens, n_seqs); - + // Concat Q conv state and current input: {d_conv-1 + n_seq_tokens, d_inner, n_seqs} ggml_tensor * conv_q = ggml_concat(ctx0, conv_state_q, ggml_transpose(ctx0, q_3d), 0); - + // Save last (d_conv-1) columns back to Q conv state ggml_tensor * last_conv_q = ggml_view_3d(ctx0, conv_q, d_conv - 1, d_inner, n_seqs, conv_q->nb[1], conv_q->nb[2], n_seq_tokens * conv_q->nb[0]); @@ -127,7 +125,6 @@ 
llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll ggml_cpy(ctx0, last_conv_q, ggml_view_1d(ctx0, conv_states_all, conv_state_size * n_seqs, kv_head * n_embd_r_total * ggml_element_size(conv_states_all)))); - // Reshape conv weight: GGUF [d_conv, 1, d_inner, 1] -> ggml_ssm_conv expects [d_conv, d_inner] // GGUF stores as [d_conv, 1, d_inner, 1] with memory layout w[conv_step + channel * d_conv] // vLLM stores as [d_inner, d_conv] with memory layout w[channel * d_conv + conv_step] @@ -143,13 +140,13 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll } conv_weight = ggml_reshape_2d(ctx0, q_conv_f32, d_conv, d_inner); } - + // Apply conv1d ggml_tensor * Qcur; if (conv_weight) { // Make conv_q contiguous for ggml_ssm_conv conv_q = ggml_cont(ctx0, conv_q); - + // ggml_ssm_conv output: {d_inner, n_seq_tokens, n_seqs} Qcur = ggml_ssm_conv(ctx0, conv_q, conv_weight); cb(Qcur, "Q conv1d", il); @@ -163,13 +160,13 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll } else { GGML_ABORT("KDA layer missing Q conv weight"); } - + // K conv1d (with separate K conv state) ggml_tensor * Kcur; if (layer.ssm_k_conv) { ggml_tensor * k_3d = ggml_reshape_3d(ctx0, k_proj, d_inner, n_seq_tokens, n_seqs); ggml_tensor * conv_k = ggml_cont(ctx0, ggml_concat(ctx0, conv_state_k, ggml_transpose(ctx0, k_3d), 0)); - + // Save K conv state ggml_tensor * last_conv_k = ggml_view_3d(ctx0, conv_k, d_conv - 1, d_inner, n_seqs, conv_k->nb[1], conv_k->nb[2], n_seq_tokens * conv_k->nb[0]); @@ -177,7 +174,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll ggml_cpy(ctx0, last_conv_k, ggml_view_1d(ctx0, conv_states_all, conv_state_size * n_seqs, (kv_head * n_embd_r_total + conv_state_size) * ggml_element_size(conv_states_all)))); - + ggml_tensor * k_conv_f32 = layer.ssm_k_conv; if (k_conv_f32->type != GGML_TYPE_F32) { k_conv_f32 = ggml_cast(ctx0, k_conv_f32, GGML_TYPE_F32); @@ -194,13 +191,13 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll } else { GGML_ABORT("KDA layer missing K conv weight"); } - + // V conv1d (with separate V conv state) ggml_tensor * Vcur; if (layer.ssm_v_conv) { ggml_tensor * v_3d = ggml_reshape_3d(ctx0, v_proj, d_inner, n_seq_tokens, n_seqs); ggml_tensor * conv_v = ggml_cont(ctx0, ggml_concat(ctx0, conv_state_v, ggml_transpose(ctx0, v_3d), 0)); - + // Save V conv state ggml_tensor * last_conv_v = ggml_view_3d(ctx0, conv_v, d_conv - 1, d_inner, n_seqs, conv_v->nb[1], conv_v->nb[2], n_seq_tokens * conv_v->nb[0]); @@ -208,7 +205,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll ggml_cpy(ctx0, last_conv_v, ggml_view_1d(ctx0, conv_states_all, conv_state_size * n_seqs, (kv_head * n_embd_r_total + 2 * conv_state_size) * ggml_element_size(conv_states_all)))); - + ggml_tensor * v_conv_f32 = layer.ssm_v_conv; if (v_conv_f32->type != GGML_TYPE_F32) { v_conv_f32 = ggml_cast(ctx0, v_conv_f32, GGML_TYPE_F32); @@ -225,7 +222,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll } else { GGML_ABORT("KDA layer missing V conv weight"); } - + // Step 3: Compute g1 (forget gate) // g1 = -exp(A_log) * softplus(f_b(f_a(x)) + dt_bias) ggml_tensor * f_a = ggml_mul_mat(ctx0, layer.ssm_f_a, cur); @@ -234,7 +231,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll g1 = ggml_add(ctx0, g1, layer.ssm_dt_b); g1 = ggml_softplus(ctx0, g1); g1 = ggml_reshape_3d(ctx0, g1, head_dim, 
n_head, n_tokens); - + // A_log shape is [1, n_head] or [1, n_head, 1, 1], need to broadcast to [head_dim, n_head, n_tokens] // First compute -exp(A_log), then reshape for broadcasting ggml_tensor * A_neg_exp = ggml_neg(ctx0, ggml_exp(ctx0, layer.ssm_a_log)); @@ -242,16 +239,16 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll A_neg_exp = ggml_reshape_3d(ctx0, A_neg_exp, 1, n_head, 1); g1 = ggml_mul(ctx0, g1, A_neg_exp); cb(g1, "kda_g1", il); - + // Step 4: Compute beta (mixing coefficient) ggml_tensor * beta = ggml_mul_mat(ctx0, layer.ssm_beta, cur); beta = ggml_cont_4d(ctx0, beta, n_head, 1, n_seq_tokens, n_seqs); cb(beta, "kda_beta", il); - + // Step 5: Reshape for KDA recurrence // {n_embd, n_tokens} -> {n_embd, n_seq_tokens, n_seqs} cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs); - + Qcur = ggml_cont(ctx0, ggml_reshape_4d(ctx0, Qcur, head_dim, n_head, n_seq_tokens, n_seqs)); Kcur = ggml_cont(ctx0, ggml_reshape_4d(ctx0, Kcur, head_dim, n_head, n_seq_tokens, n_seqs)); Vcur = ggml_cont(ctx0, ggml_reshape_4d(ctx0, Vcur, head_dim, n_head, n_seq_tokens, n_seqs)); @@ -274,7 +271,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll const int64_t output_flat_size = head_dim * n_head * n_seq_tokens * n_seqs; ggml_tensor * attn_out_1d = ggml_view_1d(ctx0, attn_out, output_flat_size, 0); cb(attn_out_1d, "attn_out_1d", il); - + ggml_tensor * attn_out_final = ggml_reshape_3d(ctx0, attn_out_1d, head_dim, n_head, n_seq_tokens * n_seqs); cb(attn_out_final, "attn_out_reshaped", il); // Extract the state part (second part of the concatenated tensor) @@ -299,7 +296,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll ggml_tensor * g2 = ggml_mul_mat(ctx0, layer.ssm_g_b, g_a); cb(g2, "g2 g_b(g_a(cur_2d))", il); g2 = ggml_reshape_3d(ctx0, g2, head_dim, n_head, n_seq_tokens * n_seqs); - + // Step 8: Apply o_norm with sigmoid gating // Note: Kimi model uses sigmoid gating, not SiLU (despite FusedRMSNormGated default being swish) // Formula: output = RMSNorm(x) * sigmoid(g) @@ -307,7 +304,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll cb(normed, "kda_normed", il); ggml_tensor * gate = ggml_sigmoid(ctx0, g2); ggml_tensor * gated = ggml_mul(ctx0, normed, gate); - + // Step 9: Output projection gated = ggml_cont_2d(ctx0, gated, d_inner, n_tokens); cur = ggml_mul_mat(ctx0, layer.wo, gated); @@ -316,7 +313,6 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll } else if (is_mla) { // === MLA Layer (Multi-head Latent Attention) without KV Cache === // Reference: vLLM mla.py - // Step 1: Q projection and reshape // vLLM Kimi: q = q_proj(hidden_states), then view as [n_tokens, n_head, qk_head_dim] // Note: Kimi MLA does NOT use RoPE (rotary_emb=None in vLLM) @@ -325,7 +321,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // Step 2: KV compression // kv_cmpr_pe = kv_a_proj_with_mqa(hidden_states) -> [kv_lora_rank + qk_rope_head_dim, n_tokens] ggml_tensor * kv_cmpr_pe = ggml_mul_mat(ctx0, layer.wkv_a_mqa, cur); - + // Split: kv_cmpr = kv_lora[:kv_lora_rank], k_pe = kv_lora[kv_lora_rank:] ggml_tensor * kv_cmpr = ggml_view_2d(ctx0, kv_cmpr_pe, kv_lora_rank, n_tokens, ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope), 0); @@ -333,10 +329,8 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + 
n_embd_head_qk_rope), ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope), ggml_row_size(kv_cmpr_pe->type, kv_lora_rank)); - // Note: Kimi MLA does NOT apply RoPE (rotary_emb=None in vLLM) // k_pe is used directly without RoPE - // Normalize kv_c kv_cmpr = build_norm(kv_cmpr, layer.attn_kv_a_norm, nullptr, LLM_NORM_RMS, il); @@ -346,7 +340,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll ggml_view_3d(ctx0, Qcur, n_embd_head_qk_nope, n_head, n_tokens, ggml_row_size(Qcur->type, n_embd_head_k_mla), ggml_row_size(Qcur->type, n_embd_head_k_mla) * n_head, 0); cb(q_nope, "q_nope", il); - + // and {n_embd_head_qk_rope, n_head, n_tokens} ggml_tensor * q_pe = ggml_view_3d( ctx0, Qcur, n_embd_head_qk_rope, n_head, n_tokens, ggml_row_size(Qcur->type, n_embd_head_k_mla), @@ -389,7 +383,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // KV decompression: kv = kv_b_proj(kv_c_normed) ggml_tensor * kv = ggml_mul_mat(ctx0, layer.wkv_b, kv_cmpr); const int64_t kv_per_head = n_embd_head_qk_nope + n_embd_head_v_mla; - + // Split kv into k_nope and v ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens, ggml_row_size(kv->type, kv_per_head), @@ -401,7 +395,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll k_nope = ggml_cont(ctx0, k_nope); Vcur = ggml_cont(ctx0, Vcur); cb(Vcur, "mla_V", il); - + // Concatenate k_nope + k_pe (broadcast k_pe to all heads) // K = [k_nope, k_pe] where k_nope is [qk_nope_head_dim, n_head, n_tokens] // and k_pe is [qk_rope_head_dim, 1, n_tokens] broadcast to all heads @@ -410,7 +404,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll ggml_tensor * k_pe_repeated = ggml_repeat(ctx0, k_pe, k_pe_target); ggml_tensor * Kcur = ggml_concat(ctx0, k_nope, k_pe_repeated, 0); cb(Kcur, "mla_K", il); - + // Direct softmax attention (with MHA KV cache) // Use build_attn with inp_attn for proper mask handling cur = build_attn(inp_attn, layer.wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale_mla, il); @@ -420,13 +414,13 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // Unknown layer type - this should not happen GGML_ABORT("Kimi layer is neither KDA nor MLA - missing required tensors"); } - + // On last layer, select only the output tokens if (il == n_layer - 1 && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } - + // Residual ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); @@ -459,7 +453,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll (llama_expert_gating_func_type) hparams.expert_gating_func, il); cb(moe_out, "ffn_moe_out", il); - + // Shared expert { ggml_tensor * ffn_shexp = build_ffn(cur, @@ -468,7 +462,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll layer.ffn_down_shexp, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(ffn_shexp, "ffn_shexp", il); - + cur = ggml_add(ctx0, moe_out, ffn_shexp); cb(cur, "ffn_out", il); } @@ -663,7 +657,7 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( Aqk = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_4d(ctx0, Aqk, chunk_size, chunk_size, n_chunks, HB))); cb(Akk, "Akk", il); cb(Aqk, "Aqk", il); - + Akk = ggml_mul(ctx0, Akk, beta); Akk = ggml_neg(ctx0, ggml_mul(ctx0, Akk, causal_mask)); cb(Akk, "attn_pre_solve", il); @@ -798,15 +792,15 @@ 
ggml_tensor * llm_build_kimi_linear::build_kda_autoregressive( ggml_tensor * v, ggml_tensor * gk, ggml_tensor * beta, - ggml_tensor * state, + ggml_tensor * state, int il) { GGML_ASSERT(ggml_is_contiguous(q)); GGML_ASSERT(ggml_is_contiguous(k)); - GGML_ASSERT(ggml_is_contiguous(v)); + GGML_ASSERT(ggml_is_contiguous(v)); GGML_ASSERT(ggml_is_contiguous(gk)); GGML_ASSERT(ggml_is_contiguous(beta)); GGML_ASSERT(ggml_is_contiguous(state)); - + const int64_t S_k = q->ne[0]; const int64_t H_k = q->ne[1]; const int64_t n_tokens = q->ne[2]; From 93afbedc96ae17a5a68eef5c1f7202f0e9c2c949 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Sun, 11 Jan 2026 21:44:54 +0800 Subject: [PATCH 34/58] moved const llama_model & model; around to follow qwen3next format and see if it can pass the -Wunused-private-field error --- src/models/kimi-linear.cpp | 3 ++- src/models/models.h | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index e873024c90..62f83e3ea5 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -3,7 +3,8 @@ #define CHUNK_SIZE 64 -llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params), model(model) { +llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const llm_graph_params & params) : + llm_graph_context_mamba(params), model(model) { ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/models.h b/src/models/models.h index 3ed00aae32..549329e15a 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -287,7 +287,6 @@ struct llm_build_jamba : public llm_graph_context_mamba { struct llm_build_kimi_linear : public llm_graph_context_mamba { llm_build_kimi_linear(const llama_model & model, const llm_graph_params & params); private: - const llama_model & model; ggml_tensor * build_kda_autoregressive( ggml_tensor * q, ggml_tensor * k, @@ -308,6 +307,8 @@ private: ggml_tensor * identity, ggml_tensor * diag_mask, int il); + + const llama_model & model; }; struct llm_build_lfm2 : public llm_graph_context { From 59182f5e06e4a8c394c7b62805679e66ee2de3fc Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Sun, 11 Jan 2026 22:06:48 +0800 Subject: [PATCH 35/58] fix trailing whitespace --- src/models/kimi-linear.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index 62f83e3ea5..b0330e23b3 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -410,7 +410,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // Use build_attn with inp_attn for proper mask handling cur = build_attn(inp_attn, layer.wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale_mla, il); cb(cur, "mla_out", il); - } + } } else { // Unknown layer type - this should not happen GGML_ABORT("Kimi layer is neither KDA nor MLA - missing required tensors"); @@ -628,7 +628,7 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( A[..., j] = torch.einsum('... c d, ... d -> ... 
c', q_i * (g_i - g_j).exp(), k_j) */ const int64_t CHB = n_chunks * H_k * n_seqs; - ggml_tensor * gkcs_i = ggml_reshape_4d(ctx0, gk_cumsum, chunk_size, 1, S_k, CHB); // [chunk_size, 1, S_k, CHB] + ggml_tensor * gkcs_i = ggml_reshape_4d(ctx0, gk_cumsum, chunk_size, 1, S_k, CHB); // [chunk_size, 1, S_k, CHB] ggml_tensor * gkcs_j = ggml_reshape_4d(ctx0, gkcs_i, 1, chunk_size, S_k, CHB); // [1, chunk_size, S_k, CHB] ggml_tensor * gkcs_j_bc = ggml_repeat_4d(ctx0, gkcs_j, chunk_size, chunk_size, S_k, CHB); // [1, chunk_size, S_k, CHB] -> [chunk_size, chunk_size, S_k, CHB] @@ -787,7 +787,7 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( return ggml_concat(ctx0, flat_output, flat_state, 0); } -ggml_tensor * llm_build_kimi_linear::build_kda_autoregressive( +ggml_tensor * llm_build_kimi_linear::build_kda_autoregressive( ggml_tensor * q, ggml_tensor * k, ggml_tensor * v, From 58d1ee52276bd51f9fe3ec0edcce1d5403910f9f Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Sun, 11 Jan 2026 22:19:29 +0800 Subject: [PATCH 36/58] removed trailing whitespaces in empty line + make sure indentation is multiple of 4 --- convert_hf_to_gguf.py | 61 +++++++++++++++++++++---------------------- 1 file changed, 30 insertions(+), 31 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 9272cc28a6..e99deeeb44 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5138,7 +5138,7 @@ class KimiLinearModel(TextModel): # Default to 4096 if not found logger.warning("No context length found in config, defaulting to 4096") self.gguf_writer.add_context_length(4096) - + # KDA & MLA params # Get ssm_d_conv from linear_attn_config.short_conv_kernel_size or ssm_d_conv linear_attn_config = self.hparams.get("linear_attn_config", {}) @@ -5156,23 +5156,23 @@ class KimiLinearModel(TextModel): ssm_d_conv = self.hparams.get("ssm_d_conv") or linear_attn_config.get("short_conv_kernel_size") if ssm_d_conv is not None: - self.gguf_writer.add_ssm_conv_kernel(ssm_d_conv) + self.gguf_writer.add_ssm_conv_kernel(ssm_d_conv) kda_head_dim = self.hparams.get("kda_head_dim") or linear_attn_config.get("head_dim") if kda_head_dim is not None: - self.gguf_writer.add_kda_head_dim(kda_head_dim) - + self.gguf_writer.add_kda_head_dim(kda_head_dim) + # MLA params - use add_* methods that handle arch substitution # Support both HuggingFace naming (q_lora_rank, kv_lora_rank) and internal naming (n_lora_q, n_lora_kv) q_lora_rank = self.hparams.get("q_lora_rank", self.hparams.get("n_lora_q")) kv_lora_rank = self.hparams.get("kv_lora_rank", self.hparams.get("n_lora_kv")) - + if q_lora_rank is not None: - self.gguf_writer.add_q_lora_rank(q_lora_rank) + self.gguf_writer.add_q_lora_rank(q_lora_rank) if kv_lora_rank is not None: - self.gguf_writer.add_kv_lora_rank(kv_lora_rank) - + self.gguf_writer.add_kv_lora_rank(kv_lora_rank) + # MLA head dimensions # Support HuggingFace naming: qk_nope_head_dim, qk_rope_head_dim, v_head_dim qk_nope_head_dim = self.hparams.get("qk_nope_head_dim") @@ -5182,28 +5182,27 @@ class KimiLinearModel(TextModel): self.gguf_writer.add_key_length(self.hparams["kv_lora_rank"] + self.hparams["qk_rope_head_dim"]) self.gguf_writer.add_value_length(self.hparams["kv_lora_rank"]) - # Calculate n_embd_head_k_mla = qk_nope_head_dim + qk_rope_head_dim if "n_embd_head_k_mla" in self.hparams: - self.gguf_writer.add_key_length_mla(self.hparams["n_embd_head_k_mla"]) + self.gguf_writer.add_key_length_mla(self.hparams["n_embd_head_k_mla"]) elif qk_nope_head_dim is not None and qk_rope_head_dim is not None: - 
n_embd_head_k_mla = qk_nope_head_dim + qk_rope_head_dim - self.gguf_writer.add_key_length_mla(n_embd_head_k_mla) - + n_embd_head_k_mla = qk_nope_head_dim + qk_rope_head_dim + self.gguf_writer.add_key_length_mla(n_embd_head_k_mla) + # n_embd_head_v_mla = v_head_dim if "n_embd_head_v_mla" in self.hparams: - self.gguf_writer.add_value_length_mla(self.hparams["n_embd_head_v_mla"]) + self.gguf_writer.add_value_length_mla(self.hparams["n_embd_head_v_mla"]) elif v_head_dim is not None: - self.gguf_writer.add_value_length_mla(v_head_dim) - + self.gguf_writer.add_value_length_mla(v_head_dim) + # Rotation - use qk_rope_head_dim for Kimi rope_dim = self.hparams.get("qk_rope_head_dim") or self.hparams.get("n_rot") if rope_dim is not None: - self.gguf_writer.add_rope_dimension_count(rope_dim) + self.gguf_writer.add_rope_dimension_count(rope_dim) else: - # Default to head_dim - head_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - self.gguf_writer.add_rope_dimension_count(head_dim) + # Default to head_dim + head_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + self.gguf_writer.add_rope_dimension_count(head_dim) # Copied from Qwen2Moe as this model inherits parts of it # YaRN is not enabled by default @@ -5227,17 +5226,17 @@ class KimiLinearModel(TextModel): moe_intermediate_size = self.hparams.get("moe_intermediate_size") if moe_intermediate_size is not None: self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) - + # num_shared_experts (1 for Kimi) num_shared_experts = self.hparams.get("num_shared_experts") if num_shared_experts is not None: self.gguf_writer.add_expert_shared_count(num_shared_experts) - + # first_k_dense_replace (1 for Kimi - first layer uses dense MLP) first_k_dense_replace = self.hparams.get("first_k_dense_replace") if first_k_dense_replace is not None: self.gguf_writer.add_leading_dense_block_count(first_k_dense_replace) - + # Routed scaling factor (expert_weights_scale = 2.446 for Kimi) routed_scaling_factor = self.hparams.get("routed_scaling_factor") if routed_scaling_factor is not None: @@ -5246,13 +5245,13 @@ class KimiLinearModel(TextModel): def prepare_tensors(self): super().prepare_tensors() if self._experts is not None: - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: logger.info(f"Processing {name}: shape before = {tuple(data_torch.shape)}") - + # Handle KDA conv1d weights # HuggingFace/vLLM stores as [d_inner, d_conv] (2D), memory layout: conv_step changes fastest # llama.cpp expects ggml ne = [d_conv, 1, d_inner, 1], memory layout: ne[0]=d_conv changes fastest @@ -5271,7 +5270,7 @@ class KimiLinearModel(TextModel): d_inner, _, d_conv = data_torch.shape data_torch = data_torch.reshape(1, d_inner, 1, d_conv) logger.info(f"Reshaped conv1d weight {name}: [d_inner={d_inner}, 1, d_conv={d_conv}] -> numpy {tuple(data_torch.shape)} -> ggml ne=[{d_conv}, 1, {d_inner}, 1]") - + # Handle A_log: HF stores as [1, 1, num_heads, 1] # llama.cpp expects ggml ne = [1, num_heads, 1, 1] # GGUF reverses numpy shape: numpy (1, 1, num_heads, 1) -> ggml ne = [1, num_heads, 1, 1] @@ -5279,11 +5278,11 @@ class KimiLinearModel(TextModel): if name.endswith(".A_log"): if data_torch.ndim == 4: logger.info(f"A_log 
{name}: numpy {tuple(data_torch.shape)} -> ggml ne={list(reversed(data_torch.shape))}") - + # Kimi specific bias if name.endswith("e_score_correction_bias"): - new_name = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_EXP_PROBS_B, bid) - return [(new_name, data_torch)] + new_name = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_EXP_PROBS_B, bid) + return [(new_name, data_torch)] # process the experts separately if name.find("block_sparse_moe.experts") != -1: From 4f6ef2c0858e545ffc81375a1d857c09ad19fa65 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Sun, 11 Jan 2026 22:33:58 +0800 Subject: [PATCH 37/58] try to make lint happy --- convert_hf_to_gguf.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index e99deeeb44..a6d2810359 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5159,7 +5159,6 @@ class KimiLinearModel(TextModel): self.gguf_writer.add_ssm_conv_kernel(ssm_d_conv) kda_head_dim = self.hparams.get("kda_head_dim") or linear_attn_config.get("head_dim") - if kda_head_dim is not None: self.gguf_writer.add_kda_head_dim(kda_head_dim) @@ -5328,11 +5327,7 @@ class KimiLinearModel(TextModel): kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1]) k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1) k_b = k_b.transpose(1, 2) - - return [ - (self.map_tensor_name(name_kb), k_b), - (self.map_tensor_name(name_vb), v_b) - ] + return [(self.map_tensor_name(name_kb), k_b), (self.map_tensor_name(name_vb), v_b)] mapped_name = self.map_tensor_name(name) logger.info(f"Returning {mapped_name}: shape after = {tuple(data_torch.shape)}") From 719d374bf666e43d70caf75f0a9e88dfa8a270d3 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Sun, 11 Jan 2026 22:58:44 +0800 Subject: [PATCH 38/58] remove blank lines to make lint happy --- convert_hf_to_gguf.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index a6d2810359..cc808aa0d0 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5088,7 +5088,6 @@ class KimiLinearModel(TextModel): merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) if len(merged) == 2: merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) - # Build token list vocab_size = self.hparams["vocab_size"] special_tokens = tokenizer.special_tokens @@ -5316,14 +5315,11 @@ class KimiLinearModel(TextModel): if name.endswith("kv_b_proj.weight"): name_kb = name.replace("kv_b_proj", "k_b_proj") name_vb = name.replace("kv_b_proj", "v_b_proj") - n_head_kv = self.hparams["num_key_value_heads"] v_head_dim = self.hparams["v_head_dim"] qk_nope_head_dim = self.hparams["qk_nope_head_dim"] logger.info("Split kv_b n_head_kv %d\n" % n_head_kv) - assert data_torch.shape[0] == n_head_kv * (v_head_dim + qk_nope_head_dim) - kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1]) k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1) k_b = k_b.transpose(1, 2) From ac85cb137563289ca38718c8a78d59891dc24b5f Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Mon, 12 Jan 2026 08:14:51 +0800 Subject: [PATCH 39/58] removed at least blank line containing white space --- convert_hf_to_gguf.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index cc808aa0d0..312256e756 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5062,7 +5062,7 @@ class CodeShellModel(TextModel): class 
KimiLinearModel(TextModel): """Kimi-Linear model with hybrid MLA+KDA architecture""" model_arch = gguf.MODEL_ARCH.KIMI_LINEAR - + _experts: list[dict[str, Tensor]] | None = None def set_vocab(self): @@ -5127,7 +5127,7 @@ class KimiLinearModel(TextModel): super().set_gguf_parameters() self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) - + # Use find_hparam for context length # Kimi uses model_max_length n_ctx = self.find_hparam(["max_position_embeddings", "model_max_length", "n_ctx", "n_positions"], optional=True) @@ -5156,7 +5156,6 @@ class KimiLinearModel(TextModel): ssm_d_conv = self.hparams.get("ssm_d_conv") or linear_attn_config.get("short_conv_kernel_size") if ssm_d_conv is not None: self.gguf_writer.add_ssm_conv_kernel(ssm_d_conv) - kda_head_dim = self.hparams.get("kda_head_dim") or linear_attn_config.get("head_dim") if kda_head_dim is not None: self.gguf_writer.add_kda_head_dim(kda_head_dim) @@ -5296,8 +5295,8 @@ class KimiLinearModel(TextModel): # merge the experts into a single 3d tensor tensors = [] # w1: gate, w2: down, w3: up - for wid, tname in [("w1", gguf.MODEL_TENSOR.FFN_GATE_EXP), - ("w2", gguf.MODEL_TENSOR.FFN_DOWN_EXP), + for wid, tname in [("w1", gguf.MODEL_TENSOR.FFN_GATE_EXP), + ("w2", gguf.MODEL_TENSOR.FFN_DOWN_EXP), ("w3", gguf.MODEL_TENSOR.FFN_UP_EXP)]: datas: list[Tensor] = [] for xid in range(n_experts): From 4faf26c376b7694f083523edd4f50923881102cf Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Mon, 12 Jan 2026 08:26:47 +0800 Subject: [PATCH 40/58] fixed flake8 complaints locally --- convert_hf_to_gguf.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 312256e756..1be740dba6 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5146,11 +5146,11 @@ class KimiLinearModel(TextModel): _num_kv_heads = list() _full_attn_layers = linear_attn_config["full_attn_layers"] for il in range(self.hparams["num_hidden_layers"]): - if il+1 in _full_attn_layers: + if il + 1 in _full_attn_layers: _num_kv_heads.append(self.hparams["num_key_value_heads"]) else: _num_kv_heads.append(0) - assert(len(_num_kv_heads) == self.hparams["num_hidden_layers"]) + assert len(_num_kv_heads) == self.hparams["num_hidden_layers"] self.gguf_writer.add_head_count_kv(_num_kv_heads) ssm_d_conv = self.hparams.get("ssm_d_conv") or linear_attn_config.get("short_conv_kernel_size") @@ -5328,6 +5328,7 @@ class KimiLinearModel(TextModel): logger.info(f"Returning {mapped_name}: shape after = {tuple(data_torch.shape)}") return [(mapped_name, data_torch)] + @ModelBase.register("InternLM2ForCausalLM") class InternLM2Model(TextModel): model_arch = gguf.MODEL_ARCH.INTERNLM2 From 22bc582a82f2419bf446c2984ee1b12d4681ce4b Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Mon, 12 Jan 2026 20:32:19 +0800 Subject: [PATCH 41/58] return ggml_tensor * pair in kda_autoregressive and kda_chunking as in ngxson's Qwen3Next improvement --- src/models/kimi-linear.cpp | 59 +++++++++++++++----------------------- src/models/models.h | 4 +-- 2 files changed, 25 insertions(+), 38 deletions(-) diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index b0330e23b3..4831b7bbc7 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -263,34 +263,21 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll ggml_tensor * state = build_rs(inp_rs, ssm_states_all, hparams.n_embd_s(), n_seqs); state = 
ggml_reshape_4d(ctx0, state, head_dim, head_dim, n_head, n_seqs); // Choose between build_kda_chunking and build_kda_recurrent based on n_tokens - ggml_tensor * attn_out = n_seq_tokens == 1 ? + std::pair attn_out = n_seq_tokens == 1 ? build_kda_autoregressive(Qcur, Kcur, Vcur, g1, beta, state, il) : build_kda_chunking(Qcur, Kcur, Vcur, g1, beta, state, chunked_causal_mask, chunked_identity, chunked_diag_mask, il); - cb(attn_out, "attn_out", il); - // The tensors were concatenated 1d, so we need to extract them 1d as well - const int64_t output_flat_size = head_dim * n_head * n_seq_tokens * n_seqs; - ggml_tensor * attn_out_1d = ggml_view_1d(ctx0, attn_out, output_flat_size, 0); - cb(attn_out_1d, "attn_out_1d", il); + ggml_tensor * output = attn_out.first; + ggml_tensor * new_state = attn_out.second; + cb(output, "attn_output", il); + cb(new_state, "new_state", il); - ggml_tensor * attn_out_final = ggml_reshape_3d(ctx0, attn_out_1d, head_dim, n_head, n_seq_tokens * n_seqs); - cb(attn_out_final, "attn_out_reshaped", il); - // Extract the state part (second part of the concatenated tensor) - // State starts after n_tokens elements along dimension 1 - const int64_t state_flat_size = head_dim * head_dim * n_head * n_seqs; - - ggml_tensor * state_1d = - ggml_view_1d(ctx0, attn_out, state_flat_size, output_flat_size * ggml_element_size(attn_out)); - cb(state_1d, "state_1d", il); - - // Update the recurrent states - ggml_build_forward_expand(gf, - ggml_cpy(ctx0, state_1d, + // Update the recurrent states + ggml_build_forward_expand(gf, + ggml_cpy(ctx0, new_state, ggml_view_1d(ctx0, ssm_states_all, hparams.n_embd_s() * n_seqs, kv_head * hparams.n_embd_s() * ggml_element_size(ssm_states_all)))); - GGML_ASSERT(ggml_nelements(attn_out_1d) + ggml_nelements(state_1d) == ggml_nelements(attn_out)); - // Step 7: Output gating g2 = g_b(g_a(x)) ggml_tensor * cur_2d = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs); ggml_tensor * g_a = ggml_mul_mat(ctx0, layer.ssm_g_a, cur_2d); @@ -301,6 +288,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // Step 8: Apply o_norm with sigmoid gating // Note: Kimi model uses sigmoid gating, not SiLU (despite FusedRMSNormGated default being swish) // Formula: output = RMSNorm(x) * sigmoid(g) + ggml_tensor * attn_out_final = ggml_reshape_3d(ctx0, output, head_dim, n_head, n_seq_tokens * n_seqs); ggml_tensor * normed = build_norm(attn_out_final, layer.ssm_o_norm, layer.ssm_o_norm_b, LLM_NORM_RMS, il); cb(normed, "kda_normed", il); ggml_tensor * gate = ggml_sigmoid(ctx0, g2); @@ -496,7 +484,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll This is a ggml implementation of the naive_chunk_kda function of https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/kda/naive.py */ -ggml_tensor * llm_build_kimi_linear::build_kda_chunking( +std::pair llm_build_kimi_linear::build_kda_chunking( ggml_tensor * q, ggml_tensor * k, ggml_tensor * v, @@ -774,20 +762,23 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( core_attn_out = ggml_cont_4d(ctx0, core_attn_out, S_v, chunk_size * n_chunks, H_v, n_seqs); - ggml_tensor * output_tokens = ggml_view_4d(ctx0, core_attn_out, S_v, n_tokens, H_v, n_seqs, core_attn_out->nb[1], core_attn_out->nb[2], core_attn_out->nb[3], 0); - cb(output_tokens, "output_tokens", il); + // truncate padded tokens + ggml_tensor * output_tokens = ggml_view_4d(ctx0, core_attn_out, + S_v, n_tokens, H_v, n_seqs, + ggml_row_size(core_attn_out->type, S_v), + 
ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks), + ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks * H_v), 0); + output_tokens = ggml_cont(ctx0, output_tokens); + // permute back to (S_v, H_v, n_tokens, n_seqs) + output_tokens = ggml_permute(ctx0, output_tokens, 0, 2, 1, 3); + output_tokens = ggml_cont(ctx0, output_tokens); - // flatten output - ggml_tensor * flat_output = - ggml_cont_1d(ctx0, ggml_permute(ctx0, output_tokens, 0, 2, 1, 3), S_v * H_v * n_tokens * n_seqs); - - ggml_tensor * flat_state = ggml_cont_1d(ctx0, new_state, S_v * S_v * H_v * n_seqs); cb(new_state, "output_state", il); - return ggml_concat(ctx0, flat_output, flat_state, 0); + return {output_tokens, new_state}; } -ggml_tensor * llm_build_kimi_linear::build_kda_autoregressive( +std::pair llm_build_kimi_linear::build_kda_autoregressive( ggml_tensor * q, ggml_tensor * k, ggml_tensor * v, @@ -876,10 +867,6 @@ ggml_tensor * llm_build_kimi_linear::build_kda_autoregressive( cb(core_attn_out, "output_tokens", il); cb(state, "new_state", il); - // flatten output, no need to permute since n_tokens is 1 so [S_v, 1, H_v, n_seqs] and [S_v, H_v, 1, n_seqs] are equivalent memory-layout wise - ggml_tensor * flat_output = ggml_reshape_1d(ctx0, core_attn_out, S_v * H_v * n_tokens * n_seqs); - ggml_tensor * flat_state = ggml_reshape_1d(ctx0, state, S_v * S_v * H_v * n_seqs); - - return ggml_concat(ctx0, flat_output, flat_state, 0); + return {core_attn_out, state}; } diff --git a/src/models/models.h b/src/models/models.h index 549329e15a..8e8f502e78 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -287,7 +287,7 @@ struct llm_build_jamba : public llm_graph_context_mamba { struct llm_build_kimi_linear : public llm_graph_context_mamba { llm_build_kimi_linear(const llama_model & model, const llm_graph_params & params); private: - ggml_tensor * build_kda_autoregressive( + std::pair build_kda_autoregressive( ggml_tensor * q, ggml_tensor * k, ggml_tensor * v, @@ -296,7 +296,7 @@ private: ggml_tensor * state, int il); - ggml_tensor * build_kda_chunking( + std::pair build_kda_chunking( ggml_tensor * q, ggml_tensor * k, ggml_tensor * v, From 6ba78d1220c0dd8bf8b37574cd0eb00650761188 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Tue, 13 Jan 2026 16:31:29 +0800 Subject: [PATCH 42/58] removed Kimi-Linear specific change that causes failure at server-windows --- src/llama-graph.cpp | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index b0a6ea323f..67f6712744 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1963,15 +1963,11 @@ ggml_tensor * llm_graph_context::build_rs( ggml_tensor * output_states = get_state_rows(ctx0, states, state_copy_main); ggml_build_forward_expand(gf, output_states); - // copy extra states which won't be changed further (between n_seqs and n_rs) - // Skip if there are no extra states to copy (n_rs == n_seqs) - if (arch != LLM_ARCH_KIMI_LINEAR || n_rs > (u_int32_t) n_seqs) { // arch check for backward compat - ggml_tensor * states_extra = ggml_get_rows(ctx0, states, state_copy_extra); - ggml_build_forward_expand(gf, - ggml_cpy(ctx0, - states_extra, - ggml_view_1d(ctx0, s, state_size*(n_rs - n_seqs), (rs_head + n_seqs)*state_size*ggml_element_size(s)))); - } + ggml_tensor * states_extra = ggml_get_rows(ctx0, states, state_copy_extra); + ggml_build_forward_expand(gf, + ggml_cpy(ctx0, + states_extra, + ggml_view_1d(ctx0, s, state_size*(n_rs - n_seqs), (rs_head + 
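A shape-level sketch of the pair-return change above (illustration only: kda_kernel_old/kda_kernel_new are hypothetical names and torch stands in for the ggml tensors). Before, build_kda_chunking/build_kda_autoregressive flattened the attention output and the updated state into one 1-D tensor and the caller had to slice the two parts back out with views; now the two tensors come back as a std::pair and the caller copies the new state straight into the recurrent cache.

    import torch

    def kda_kernel_old(out, state):
        # out: [S_v, n_tokens, H_v, n_seqs], state: [S_v, S_v, H_v, n_seqs]
        return torch.cat([out.flatten(), state.flatten()])  # caller must know both sizes to split

    def kda_kernel_new(out, state):
        return out, state  # caller copies `state` into the recurrent state buffer directly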
n_seqs)*state_size*ggml_element_size(s)))); return output_states; } From fe9d248ae6f845756769504330803bf8a685307f Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Tue, 13 Jan 2026 16:58:59 +0800 Subject: [PATCH 43/58] removed private: from kimi_linear to make build checks happy --- src/models/models.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/models/models.h b/src/models/models.h index beb2d71af3..aec6d3bf48 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -286,7 +286,7 @@ struct llm_build_jamba : public llm_graph_context_mamba { struct llm_build_kimi_linear : public llm_graph_context_mamba { llm_build_kimi_linear(const llama_model & model, const llm_graph_params & params); -private: + std::pair build_kda_autoregressive( ggml_tensor * q, ggml_tensor * k, From 18ae7f4684ccc0ab72eaf35845b84b009da5486d Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Wed, 14 Jan 2026 03:22:53 +0800 Subject: [PATCH 44/58] removed unnecessary ggml_cont before ggml_reshape --- src/models/kimi-linear.cpp | 33 ++++++++++----------------------- 1 file changed, 10 insertions(+), 23 deletions(-) diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index 4831b7bbc7..50cebb9631 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -134,12 +134,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll ggml_tensor * conv_weight = nullptr; if (layer.ssm_q_conv) { // Reshape conv weight from [d_conv, 1, d_inner, 1] to [d_conv, d_inner] for ggml_ssm_conv - // Cast to F32 if quantized (ggml_ssm_conv requires float weights) - ggml_tensor * q_conv_f32 = layer.ssm_q_conv; - if (q_conv_f32->type != GGML_TYPE_F32) { - q_conv_f32 = ggml_cast(ctx0, q_conv_f32, GGML_TYPE_F32); - } - conv_weight = ggml_reshape_2d(ctx0, q_conv_f32, d_conv, d_inner); + conv_weight = ggml_reshape_2d(ctx0, layer.ssm_q_conv, d_conv, d_inner); } // Apply conv1d @@ -166,7 +161,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll ggml_tensor * Kcur; if (layer.ssm_k_conv) { ggml_tensor * k_3d = ggml_reshape_3d(ctx0, k_proj, d_inner, n_seq_tokens, n_seqs); - ggml_tensor * conv_k = ggml_cont(ctx0, ggml_concat(ctx0, conv_state_k, ggml_transpose(ctx0, k_3d), 0)); + ggml_tensor * conv_k = ggml_concat(ctx0, conv_state_k, ggml_transpose(ctx0, k_3d), 0); // Save K conv state ggml_tensor * last_conv_k = ggml_view_3d(ctx0, conv_k, d_conv - 1, d_inner, n_seqs, @@ -176,11 +171,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll ggml_view_1d(ctx0, conv_states_all, conv_state_size * n_seqs, (kv_head * n_embd_r_total + conv_state_size) * ggml_element_size(conv_states_all)))); - ggml_tensor * k_conv_f32 = layer.ssm_k_conv; - if (k_conv_f32->type != GGML_TYPE_F32) { - k_conv_f32 = ggml_cast(ctx0, k_conv_f32, GGML_TYPE_F32); - } - ggml_tensor * k_conv_weight = ggml_reshape_2d(ctx0, k_conv_f32, d_conv, d_inner); + ggml_tensor * k_conv_weight = ggml_reshape_2d(ctx0, layer.ssm_k_conv, d_conv, d_inner); Kcur = ggml_ssm_conv(ctx0, conv_k, k_conv_weight); cb(Kcur, "K conv1d", il); Kcur = ggml_reshape_2d(ctx0, Kcur, d_inner, n_tokens); @@ -197,7 +188,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll ggml_tensor * Vcur; if (layer.ssm_v_conv) { ggml_tensor * v_3d = ggml_reshape_3d(ctx0, v_proj, d_inner, n_seq_tokens, n_seqs); - ggml_tensor * conv_v = ggml_cont(ctx0, ggml_concat(ctx0, conv_state_v, ggml_transpose(ctx0, v_3d), 0)); + ggml_tensor * conv_v = ggml_concat(ctx0, 
conv_state_v, ggml_transpose(ctx0, v_3d), 0); // Save V conv state ggml_tensor * last_conv_v = ggml_view_3d(ctx0, conv_v, d_conv - 1, d_inner, n_seqs, @@ -207,11 +198,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll ggml_view_1d(ctx0, conv_states_all, conv_state_size * n_seqs, (kv_head * n_embd_r_total + 2 * conv_state_size) * ggml_element_size(conv_states_all)))); - ggml_tensor * v_conv_f32 = layer.ssm_v_conv; - if (v_conv_f32->type != GGML_TYPE_F32) { - v_conv_f32 = ggml_cast(ctx0, v_conv_f32, GGML_TYPE_F32); - } - ggml_tensor * v_conv_weight = ggml_reshape_2d(ctx0, v_conv_f32, d_conv, d_inner); + ggml_tensor * v_conv_weight = ggml_reshape_2d(ctx0, layer.ssm_v_conv, d_conv, d_inner); Vcur = ggml_ssm_conv(ctx0, conv_v, v_conv_weight); cb(Vcur, "V conv1d", il); Vcur = ggml_reshape_2d(ctx0, Vcur, d_inner, n_tokens); @@ -243,17 +230,17 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // Step 4: Compute beta (mixing coefficient) ggml_tensor * beta = ggml_mul_mat(ctx0, layer.ssm_beta, cur); - beta = ggml_cont_4d(ctx0, beta, n_head, 1, n_seq_tokens, n_seqs); + beta = ggml_reshape_4d(ctx0, beta, n_head, 1, n_seq_tokens, n_seqs); cb(beta, "kda_beta", il); // Step 5: Reshape for KDA recurrence // {n_embd, n_tokens} -> {n_embd, n_seq_tokens, n_seqs} cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs); - Qcur = ggml_cont(ctx0, ggml_reshape_4d(ctx0, Qcur, head_dim, n_head, n_seq_tokens, n_seqs)); - Kcur = ggml_cont(ctx0, ggml_reshape_4d(ctx0, Kcur, head_dim, n_head, n_seq_tokens, n_seqs)); - Vcur = ggml_cont(ctx0, ggml_reshape_4d(ctx0, Vcur, head_dim, n_head, n_seq_tokens, n_seqs)); - g1 = ggml_cont(ctx0, ggml_reshape_4d(ctx0, g1, head_dim, n_head, n_seq_tokens, n_seqs)); + Qcur = ggml_reshape_4d(ctx0, Qcur, head_dim, n_head, n_seq_tokens, n_seqs); + Kcur = ggml_reshape_4d(ctx0, Kcur, head_dim, n_head, n_seq_tokens, n_seqs); + Vcur = ggml_reshape_4d(ctx0, Vcur, head_dim, n_head, n_seq_tokens, n_seqs); + g1 = ggml_reshape_4d(ctx0, g1, head_dim, n_head, n_seq_tokens, n_seqs); cb(Qcur, "kda_Q", il); cb(Kcur, "kda_K", il); cb(Vcur, "kda_V", il); From 28829152588d35ba8e187f983ecdb1d4cfd232f8 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Wed, 14 Jan 2026 17:26:00 +0800 Subject: [PATCH 45/58] created static function causal_conv1d to abtract similar code for q/k/v --- src/models/kimi-linear.cpp | 201 ++++++++++++------------------------- 1 file changed, 64 insertions(+), 137 deletions(-) diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index 50cebb9631..25eccd2f7d 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -3,6 +3,67 @@ #define CHUNK_SIZE 64 +// Causal Conv1d function for Q,K,V +// When qkv is 0, it is Q, 1 is K, 2 is V +static ggml_tensor * causal_conv1d(ggml_cgraph * gf, ggml_context * ctx0, ggml_tensor * conv_states_all, ggml_tensor * conv_state_all, int64_t qkv, ggml_tensor * x, ggml_tensor * proj_w, ggml_tensor * conv_w, ggml_tensor * conv_b, int64_t d_conv, int64_t head_dim, int64_t n_head, int64_t n_seq_tokens, int64_t n_seqs, int64_t n_tokens, int64_t kv_head) { + const int64_t d_inner = head_dim * n_head; + const int64_t conv_state_size = (d_conv - 1) * d_inner; + const int64_t n_embd_r_total = 3 * conv_state_size; // Q + K + V + + // conv_state_all is [n_embd_r_total, n_seqs], split into Q, K, V + // Each conv state is [(d_conv-1) * d_inner] per sequence, need to reshape to [d_conv-1, d_inner, n_seqs] + // Memory layout: for each seq, Q state is first 
conv_state_size elements, then K, then V + // conv_state_all has stride: nb[0] = element_size, nb[1] = n_embd_r_total * element_size + // View Q conv state: offset 0, size conv_state_size per seq + // conv_state_all is [n_embd_r_total, n_seqs] with memory layout: + // state[i + seq * n_embd_r_total] where i = conv_step + channel * (d_conv-1) + {0, conv_state_size, 2*conv_state_size} for Q/K/V + // We want [d_conv-1, d_inner, n_seqs] view: + // nb1 = (d_conv-1) * element_size (stride between channels) + // nb2 = n_embd_r_total * element_size (stride between seqs) + ggml_tensor * conv_state_x = ggml_view_3d(ctx0, conv_state_all, d_conv - 1, d_inner, n_seqs, + (d_conv - 1) * ggml_element_size(conv_state_all), // nb1: stride between channels + n_embd_r_total * ggml_element_size(conv_state_all), // nb2: stride between seqs + qkv * conv_state_size * ggml_element_size(conv_state_all)); + +// Causal Conv1d function for Q,K,V +// When qkv is 0, it is Q, 1 is K, 2 is V + // Step 1: Q, K, V projections -> [d_inner, n_tokens] + ggml_tensor * x_proj = ggml_mul_mat(ctx0, proj_w, x); + + // Reshape input: {d_inner, n_tokens} -> {d_inner, n_seq_tokens, n_seqs} + ggml_tensor * x_3d = ggml_reshape_3d(ctx0, x_proj, d_inner, n_seq_tokens, n_seqs); + + // Concat Q conv state and current input: {d_conv-1 + n_seq_tokens, d_inner, n_seqs} + ggml_tensor * conv_x = ggml_cont(ctx0, ggml_concat(ctx0, conv_state_x, ggml_transpose(ctx0, x_3d), 0)); + + // Save last (d_conv-1) columns back to Q conv state + ggml_tensor * last_conv_x = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner, n_seqs, + conv_x->nb[1], conv_x->nb[2], n_seq_tokens * conv_x->nb[0]); + ggml_build_forward_expand(gf, + ggml_cpy(ctx0, last_conv_x, + ggml_view_1d(ctx0, conv_states_all, conv_state_size * n_seqs, + (kv_head * n_embd_r_total + qkv * conv_state_size) * ggml_element_size(conv_states_all)))); + // Reshape conv weight: GGUF [d_conv, 1, d_inner, 1] -> ggml_ssm_conv expects [d_conv, d_inner] + // GGUF stores as [d_conv, 1, d_inner, 1] with memory layout w[conv_step + channel * d_conv] + // vLLM stores as [d_inner, d_conv] with memory layout w[channel * d_conv + conv_step] + // ggml_ssm_conv computes: c[conv_step + channel * d_conv] + // GGUF layout: [d_conv, 1, d_inner] or [d_conv, 1, d_inner, 1] -> reshape to [d_conv, d_inner] + // Reshape conv weight from [d_conv, 1, d_inner, 1] to [d_conv, d_inner] for ggml_ssm_conv + ggml_tensor * conv_weight = ggml_reshape_2d(ctx0, conv_w, d_conv, d_inner); + + // Apply conv1d + // ggml_ssm_conv output: {d_inner, n_seq_tokens, n_seqs} + ggml_tensor * Xcur = ggml_ssm_conv(ctx0, conv_x, conv_weight); + // Reshape to 2D for bias add: {d_inner, n_tokens} + Xcur = ggml_reshape_2d(ctx0, Xcur, d_inner, n_tokens); + if (conv_b) { + Xcur = ggml_add(ctx0, Xcur, conv_b); + } + Xcur = ggml_silu(ctx0, Xcur); + + return ggml_reshape_4d(ctx0, Xcur, head_dim, n_head, n_seq_tokens, n_seqs); +} + llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params), model(model) { ggml_tensor * cur; @@ -78,138 +139,10 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // Get conv states from r_l tensor (Q, K, V each have separate state) ggml_tensor * conv_states_all = mctx_cur->get_r_l(il); cb(conv_states_all, "conv_states_all", il); - const int64_t conv_state_size = (d_conv - 1) * d_inner; - const int64_t n_embd_r_total = 3 * conv_state_size; // Q + K + V ggml_tensor * conv_state_all = build_rs(inp_rs, conv_states_all, 
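For reference, a naive PyTorch sketch of what the causal_conv1d helper above computes for each of Q/K/V (assumed shapes; causal_conv1d_ref and its arguments are illustrative names, not llama.cpp API): the last d_conv-1 projected inputs per channel are kept as rolling state, prepended to the new tokens, convolved depthwise with a width-d_conv filter, passed through SiLU, and the tail of the padded sequence becomes the state for the next ubatch.

    import torch
    import torch.nn.functional as F

    def causal_conv1d_ref(x, conv_state, weight):
        # x: [d_inner, n_tokens] projected Q, K or V; conv_state: [d_inner, d_conv-1]; weight: [d_inner, d_conv]
        d_inner, d_conv = weight.shape
        xs = torch.cat([conv_state, x], dim=-1)              # prepend the rolling state
        new_state = xs[:, -(d_conv - 1):]                    # saved back for the next ubatch
        y = F.conv1d(xs.unsqueeze(0), weight.unsqueeze(1), groups=d_inner).squeeze(0)
        return F.silu(y), new_state                          # y: [d_inner, n_tokens]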
hparams.n_embd_r(), n_seqs); - // conv_state_all is [n_embd_r_total, n_seqs], split into Q, K, V - // Each conv state is [(d_conv-1) * d_inner] per sequence, need to reshape to [d_conv-1, d_inner, n_seqs] - // Memory layout: for each seq, Q state is first conv_state_size elements, then K, then V - // conv_state_all has stride: nb[0] = element_size, nb[1] = n_embd_r_total * element_size - // View Q conv state: offset 0, size conv_state_size per seq - // conv_state_all is [n_embd_r_total, n_seqs] with memory layout: - // state[i + seq * n_embd_r_total] where i = conv_step + channel * (d_conv-1) + {0, conv_state_size, 2*conv_state_size} for Q/K/V - // We want [d_conv-1, d_inner, n_seqs] view: - // nb1 = (d_conv-1) * element_size (stride between channels) - // nb2 = n_embd_r_total * element_size (stride between seqs) - ggml_tensor * conv_state_q = ggml_view_3d(ctx0, conv_state_all, d_conv - 1, d_inner, n_seqs, - (d_conv - 1) * ggml_element_size(conv_state_all), // nb1: stride between channels - n_embd_r_total * ggml_element_size(conv_state_all), // nb2: stride between seqs - 0); // offset for Q - ggml_tensor * conv_state_k = ggml_view_3d(ctx0, conv_state_all, d_conv - 1, d_inner, n_seqs, - (d_conv - 1) * ggml_element_size(conv_state_all), - n_embd_r_total * ggml_element_size(conv_state_all), - conv_state_size * ggml_element_size(conv_state_all)); // offset for K - ggml_tensor * conv_state_v = ggml_view_3d(ctx0, conv_state_all, d_conv - 1, d_inner, n_seqs, - (d_conv - 1) * ggml_element_size(conv_state_all), - n_embd_r_total * ggml_element_size(conv_state_all), - 2 * conv_state_size * ggml_element_size(conv_state_all)); // offset for V - - // Step 1: Q, K, V projections -> [d_inner, n_tokens] - ggml_tensor * q_proj = ggml_mul_mat(ctx0, layer.wq, cur); - ggml_tensor * k_proj = ggml_mul_mat(ctx0, layer.wk, cur); - ggml_tensor * v_proj = ggml_mul_mat(ctx0, layer.wv, cur); - cb(q_proj, "kda_q_proj", il); - cb(k_proj, "kda_k_proj", il); - cb(v_proj, "kda_v_proj", il); - - // Step 2: Causal Conv1d for Q - // Reshape input: {d_inner, n_tokens} -> {d_inner, n_seq_tokens, n_seqs} - ggml_tensor * q_3d = ggml_reshape_3d(ctx0, q_proj, d_inner, n_seq_tokens, n_seqs); - - // Concat Q conv state and current input: {d_conv-1 + n_seq_tokens, d_inner, n_seqs} - ggml_tensor * conv_q = ggml_concat(ctx0, conv_state_q, ggml_transpose(ctx0, q_3d), 0); - - // Save last (d_conv-1) columns back to Q conv state - ggml_tensor * last_conv_q = ggml_view_3d(ctx0, conv_q, d_conv - 1, d_inner, n_seqs, - conv_q->nb[1], conv_q->nb[2], n_seq_tokens * conv_q->nb[0]); - ggml_build_forward_expand(gf, - ggml_cpy(ctx0, last_conv_q, - ggml_view_1d(ctx0, conv_states_all, conv_state_size * n_seqs, - kv_head * n_embd_r_total * ggml_element_size(conv_states_all)))); - // Reshape conv weight: GGUF [d_conv, 1, d_inner, 1] -> ggml_ssm_conv expects [d_conv, d_inner] - // GGUF stores as [d_conv, 1, d_inner, 1] with memory layout w[conv_step + channel * d_conv] - // vLLM stores as [d_inner, d_conv] with memory layout w[channel * d_conv + conv_step] - // ggml_ssm_conv computes: c[conv_step + channel * d_conv] - // GGUF layout: [d_conv, 1, d_inner] or [d_conv, 1, d_inner, 1] -> reshape to [d_conv, d_inner] - ggml_tensor * conv_weight = nullptr; - if (layer.ssm_q_conv) { - // Reshape conv weight from [d_conv, 1, d_inner, 1] to [d_conv, d_inner] for ggml_ssm_conv - conv_weight = ggml_reshape_2d(ctx0, layer.ssm_q_conv, d_conv, d_inner); - } - - // Apply conv1d - ggml_tensor * Qcur; - if (conv_weight) { - // Make conv_q contiguous for ggml_ssm_conv - 
conv_q = ggml_cont(ctx0, conv_q); - - // ggml_ssm_conv output: {d_inner, n_seq_tokens, n_seqs} - Qcur = ggml_ssm_conv(ctx0, conv_q, conv_weight); - cb(Qcur, "Q conv1d", il); - // Reshape to 2D for bias add: {d_inner, n_tokens} - Qcur = ggml_reshape_2d(ctx0, Qcur, d_inner, n_tokens); - if (layer.ssm_q_conv_b) { - Qcur = ggml_add(ctx0, Qcur, layer.ssm_q_conv_b); - } - Qcur = ggml_silu(ctx0, Qcur); - cb(Qcur, "Q conv1d b", il); - } else { - GGML_ABORT("KDA layer missing Q conv weight"); - } - - // K conv1d (with separate K conv state) - ggml_tensor * Kcur; - if (layer.ssm_k_conv) { - ggml_tensor * k_3d = ggml_reshape_3d(ctx0, k_proj, d_inner, n_seq_tokens, n_seqs); - ggml_tensor * conv_k = ggml_concat(ctx0, conv_state_k, ggml_transpose(ctx0, k_3d), 0); - - // Save K conv state - ggml_tensor * last_conv_k = ggml_view_3d(ctx0, conv_k, d_conv - 1, d_inner, n_seqs, - conv_k->nb[1], conv_k->nb[2], n_seq_tokens * conv_k->nb[0]); - ggml_build_forward_expand(gf, - ggml_cpy(ctx0, last_conv_k, - ggml_view_1d(ctx0, conv_states_all, conv_state_size * n_seqs, - (kv_head * n_embd_r_total + conv_state_size) * ggml_element_size(conv_states_all)))); - - ggml_tensor * k_conv_weight = ggml_reshape_2d(ctx0, layer.ssm_k_conv, d_conv, d_inner); - Kcur = ggml_ssm_conv(ctx0, conv_k, k_conv_weight); - cb(Kcur, "K conv1d", il); - Kcur = ggml_reshape_2d(ctx0, Kcur, d_inner, n_tokens); - if (layer.ssm_k_conv_b) { - Kcur = ggml_add(ctx0, Kcur, layer.ssm_k_conv_b); - } - Kcur = ggml_silu(ctx0, Kcur); - cb(Kcur, "K conv1d b", il); - } else { - GGML_ABORT("KDA layer missing K conv weight"); - } - - // V conv1d (with separate V conv state) - ggml_tensor * Vcur; - if (layer.ssm_v_conv) { - ggml_tensor * v_3d = ggml_reshape_3d(ctx0, v_proj, d_inner, n_seq_tokens, n_seqs); - ggml_tensor * conv_v = ggml_concat(ctx0, conv_state_v, ggml_transpose(ctx0, v_3d), 0); - - // Save V conv state - ggml_tensor * last_conv_v = ggml_view_3d(ctx0, conv_v, d_conv - 1, d_inner, n_seqs, - conv_v->nb[1], conv_v->nb[2], n_seq_tokens * conv_v->nb[0]); - ggml_build_forward_expand(gf, - ggml_cpy(ctx0, last_conv_v, - ggml_view_1d(ctx0, conv_states_all, conv_state_size * n_seqs, - (kv_head * n_embd_r_total + 2 * conv_state_size) * ggml_element_size(conv_states_all)))); - - ggml_tensor * v_conv_weight = ggml_reshape_2d(ctx0, layer.ssm_v_conv, d_conv, d_inner); - Vcur = ggml_ssm_conv(ctx0, conv_v, v_conv_weight); - cb(Vcur, "V conv1d", il); - Vcur = ggml_reshape_2d(ctx0, Vcur, d_inner, n_tokens); - if (layer.ssm_v_conv_b) { - Vcur = ggml_add(ctx0, Vcur, layer.ssm_v_conv_b); - } - Vcur = ggml_silu(ctx0, Vcur); - cb(Vcur, "V conv1d b", il); - } else { - GGML_ABORT("KDA layer missing V conv weight"); - } + ggml_tensor * Qcur = causal_conv1d(gf, ctx0, conv_states_all, conv_state_all, 0, cur, layer.wq, layer.ssm_q_conv, layer.ssm_q_conv_b, d_conv, head_dim, n_head, n_seq_tokens, n_seqs, n_tokens, kv_head); + ggml_tensor * Kcur = causal_conv1d(gf, ctx0, conv_states_all, conv_state_all, 1, cur, layer.wk, layer.ssm_k_conv, layer.ssm_k_conv_b, d_conv, head_dim, n_head, n_seq_tokens, n_seqs, n_tokens, kv_head); + ggml_tensor * Vcur = causal_conv1d(gf, ctx0, conv_states_all, conv_state_all, 2, cur, layer.wv, layer.ssm_v_conv, layer.ssm_v_conv_b, d_conv, head_dim, n_head, n_seq_tokens, n_seqs, n_tokens, kv_head); // Step 3: Compute g1 (forget gate) // g1 = -exp(A_log) * softplus(f_b(f_a(x)) + dt_bias) @@ -237,13 +170,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // {n_embd, n_tokens} -> {n_embd, n_seq_tokens, n_seqs} cur = 
ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs); - Qcur = ggml_reshape_4d(ctx0, Qcur, head_dim, n_head, n_seq_tokens, n_seqs); - Kcur = ggml_reshape_4d(ctx0, Kcur, head_dim, n_head, n_seq_tokens, n_seqs); - Vcur = ggml_reshape_4d(ctx0, Vcur, head_dim, n_head, n_seq_tokens, n_seqs); g1 = ggml_reshape_4d(ctx0, g1, head_dim, n_head, n_seq_tokens, n_seqs); - cb(Qcur, "kda_Q", il); - cb(Kcur, "kda_K", il); - cb(Vcur, "kda_V", il); // Step 6: Get SSM state and compute KDA recurrence using ggml_kda_scan ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il); From 0aea18e718d38ff9ec27e144e4fc7bee8b875a41 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Fri, 16 Jan 2026 12:02:27 +0800 Subject: [PATCH 46/58] merged dt_bias to SSM_DT. Do -exp(log_A) in convert_hf_to_gguf.py. --- convert_hf_to_gguf.py | 59 +++++++++++++--------------------- gguf-py/gguf/constants.py | 13 +++----- gguf-py/gguf/tensor_mapping.py | 10 ++---- src/llama-arch.cpp | 8 ++--- src/llama-arch.h | 2 -- src/llama-model.cpp | 18 ++++------- src/llama-model.h | 1 - src/models/kimi-linear.cpp | 10 +++--- 8 files changed, 43 insertions(+), 78 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index bd017dfec4..0e1b6aae99 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5159,17 +5159,14 @@ class KimiLinearModel(TextModel): super().set_gguf_parameters() self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) - self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) - # Use find_hparam for context length - # Kimi uses model_max_length - n_ctx = self.find_hparam(["max_position_embeddings", "model_max_length", "n_ctx", "n_positions"], optional=True) - if n_ctx is not None: - self.gguf_writer.add_context_length(n_ctx) - else: - # Default to 4096 if not found - logger.warning("No context length found in config, defaulting to 4096") - self.gguf_writer.add_context_length(4096) + if (score_func := self.find_hparam(["moe_router_activation_func"], optional=True)) is not None: + if score_func == "sigmoid": + self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) + elif score_func == "softmax": + self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX) + else: + raise ValueError(f"Unsupported expert score gating function value: {score_func}") # KDA & MLA params # Get ssm_d_conv from linear_attn_config.short_conv_kernel_size or ssm_d_conv @@ -5226,7 +5223,7 @@ class KimiLinearModel(TextModel): self.gguf_writer.add_value_length_mla(v_head_dim) # Rotation - use qk_rope_head_dim for Kimi - rope_dim = self.hparams.get("qk_rope_head_dim") or self.hparams.get("n_rot") + rope_dim = self.find_hparam(["qk_rope_head_dim", "n_rot"]) if rope_dim is not None: self.gguf_writer.add_rope_dimension_count(rope_dim) else: @@ -5234,41 +5231,30 @@ class KimiLinearModel(TextModel): head_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(head_dim) - # Copied from Qwen2Moe as this model inherits parts of it - # YaRN is not enabled by default - # To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts - rope_scaling = self.hparams.get("rope_scaling") or {} - if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) - 
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) - - # MoE params - n_experts = self.hparams.get("num_local_experts", self.hparams.get("num_experts")) + n_experts = self.find_hparam(["num_experts"]) if n_experts is not None: self.gguf_writer.add_expert_count(n_experts) - # Support both num_experts_per_tok and num_experts_per_token - n_experts_used = self.hparams.get("num_experts_per_tok", self.hparams.get("num_experts_per_token")) + n_experts_used = self.find_hparam(["num_experts_per_token"]) if n_experts_used is not None: self.gguf_writer.add_expert_used_count(n_experts_used) # moe_intermediate_size (1024 for Kimi) - moe_intermediate_size = self.hparams.get("moe_intermediate_size") + moe_intermediate_size = self.find_hparam(["moe_intermediate_size"]) if moe_intermediate_size is not None: self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) # num_shared_experts (1 for Kimi) - num_shared_experts = self.hparams.get("num_shared_experts") + num_shared_experts = self.find_hparam(["num_shared_experts"]) if num_shared_experts is not None: self.gguf_writer.add_expert_shared_count(num_shared_experts) # first_k_dense_replace (1 for Kimi - first layer uses dense MLP) - first_k_dense_replace = self.hparams.get("first_k_dense_replace") + first_k_dense_replace = self.find_hparam(["first_k_dense_replace"]) if first_k_dense_replace is not None: self.gguf_writer.add_leading_dense_block_count(first_k_dense_replace) # Routed scaling factor (expert_weights_scale = 2.446 for Kimi) - routed_scaling_factor = self.hparams.get("routed_scaling_factor") + routed_scaling_factor = self.find_hparam(["routed_scaling_factor"]) if routed_scaling_factor is not None: self.gguf_writer.add_expert_weights_scale(routed_scaling_factor) @@ -5301,19 +5287,20 @@ class KimiLinearModel(TextModel): data_torch = data_torch.reshape(1, d_inner, 1, d_conv) logger.info(f"Reshaped conv1d weight {name}: [d_inner={d_inner}, 1, d_conv={d_conv}] -> numpy {tuple(data_torch.shape)} -> ggml ne=[{d_conv}, 1, {d_inner}, 1]") - # Handle A_log: HF stores as [1, 1, num_heads, 1] - # llama.cpp expects ggml ne = [1, num_heads, 1, 1] - # GGUF reverses numpy shape: numpy (1, 1, num_heads, 1) -> ggml ne = [1, num_heads, 1, 1] - # So no transformation needed! The shapes already match after GGUF reversal. 
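The A_log/dt_bias handling below reduces to the following at runtime (minimal sketch; kda_forget_gate, w_fa, w_fb and the shapes are assumptions based on the comments in kimi-linear.cpp): the converter stores A = -exp(A_log) once, dt_bias is loaded as the bias of SSM_DT, and the graph only evaluates g1 = A * softplus(f_b(f_a(x)) + dt_bias) with a per-head broadcast, with no exp/neg ops left in the graph.

    import torch
    import torch.nn.functional as F

    def kda_forget_gate(x, w_fa, w_fb, dt_bias, A, n_head, head_dim):
        # x: [n_embd]; w_fa: [r, n_embd]; w_fb: [n_head*head_dim, r]
        # dt_bias: [n_head*head_dim]; A: [n_head], already holding -exp(A_log) from conversion
        g = F.softplus(w_fb @ (w_fa @ x) + dt_bias).view(n_head, head_dim)
        return g * A.view(n_head, 1)                         # per-head decay, broadcast over head_dim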
- if name.endswith(".A_log"): - if data_torch.ndim == 4: - logger.info(f"A_log {name}: numpy {tuple(data_torch.shape)} -> ggml ne={list(reversed(data_torch.shape))}") - # Kimi specific bias if name.endswith("e_score_correction_bias"): new_name = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_EXP_PROBS_B, bid) return [(new_name, data_torch)] + # Handle A_log: iHF stores as [1, 1, num_heads, 1] + # llama.cpp expects ggml ne = [1, num_heads, 1, 1] + # GGUF reverses numpy shape: numpy (1, 1, num_heads, 1) -> ggml ne = [1, num_heads, 1, 1] + if name.endswith(".A_log"): + data_torch = -torch.exp(data_torch) + if name.endswith(".dt_bias"): + name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias" + logger.info("Changed dt_bias to dt_proj.bias") + # process the experts separately if name.find("block_sparse_moe.experts") != -1: n_experts = self.hparams.get("num_local_experts", self.hparams.get("num_experts")) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 43ea4eec0c..73e7bae6e1 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -462,7 +462,7 @@ class MODEL_ARCH(IntEnum): MIMO2 = auto() LLAMA_EMBED = auto() MAINCODER = auto() - KIMI_LINEAR = auto() # Kimi-Linear (hybrid MLA+KDA) + KIMI_LINEAR = auto() class VISION_PROJECTOR_TYPE(IntEnum): @@ -559,10 +559,9 @@ class MODEL_TENSOR(IntEnum): SSM_F_A = auto() # Kimi Linear SSM_F_B = auto() # Kimi Linear SSM_BETA = auto() # Kimi Linear - SSM_A_LOG = auto() # Kimi Linear + SSM_DT_B = auto() # Kimi Linear SSM_G_A = auto() # Kimi Linear SSM_G_B = auto() # Kimi Linear - SSM_DT_B = auto() # Kimi Linear TIME_MIX_W0 = auto() TIME_MIX_W1 = auto() TIME_MIX_W2 = auto() @@ -894,7 +893,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.MIMO2: "mimo2", MODEL_ARCH.LLAMA_EMBED: "llama-embed", MODEL_ARCH.MAINCODER: "maincoder", - MODEL_ARCH.KIMI_LINEAR: "kimi-linear", + MODEL_ARCH.KIMI_LINEAR: "kimi-linear", } VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = { @@ -988,10 +987,8 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = { MODEL_TENSOR.SSM_F_A: "blk.{bid}.ssm_f_a", # Kimi Linear MODEL_TENSOR.SSM_F_B: "blk.{bid}.ssm_f_b", # Kimi Linear MODEL_TENSOR.SSM_BETA: "blk.{bid}.ssm_beta", # Kimi Linear - MODEL_TENSOR.SSM_A_LOG: "blk.{bid}.ssm_a", # Kimi Linear MODEL_TENSOR.SSM_G_A: "blk.{bid}.ssm_g_a", # Kimi Linear MODEL_TENSOR.SSM_G_B: "blk.{bid}.ssm_g_b", # Kimi Linear - MODEL_TENSOR.SSM_DT_B: "blk.{bid}.ssm_dt", # Kimi Linear MODEL_TENSOR.TIME_MIX_W0: "blk.{bid}.time_mix_w0", MODEL_TENSOR.TIME_MIX_W1: "blk.{bid}.time_mix_w1", MODEL_TENSOR.TIME_MIX_W2: "blk.{bid}.time_mix_w2", @@ -3433,11 +3430,11 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.SSM_F_A, MODEL_TENSOR.SSM_F_B, MODEL_TENSOR.SSM_BETA, - MODEL_TENSOR.SSM_A_LOG, + MODEL_TENSOR.SSM_A, MODEL_TENSOR.SSM_G_A, MODEL_TENSOR.SSM_G_B, + MODEL_TENSOR.SSM_DT, MODEL_TENSOR.SSM_NORM, - MODEL_TENSOR.SSM_DT_B, MODEL_TENSOR.FFN_EXP_PROBS_B, MODEL_TENSOR.FFN_GATE_SHEXP, MODEL_TENSOR.FFN_DOWN_SHEXP, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 99da6891f8..d96119ebe9 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -438,7 +438,6 @@ class TensorNameMap: "model.layers.{bid}.block_sparse_moe.e_score_correction", # minimax-m2 "backbone.layers.{bid}.mixer.gate.e_score_correction", # nemotron-h-moe "model.layers.{bid}.mlp.e_score_correction", # exaone-moe - "model.layers.{bid}.block_sparse_moe.gate.e_score_correction_bias", # kimi ), # Feed-forward up @@ -556,7 +555,6 @@ 
class TensorNameMap: MODEL_TENSOR.FFN_GATE_CHEXP: ( "model.layers.{bid}.mlp.chunk_experts.gate_proj", # grovemoe - "model.layers.{bid}.block_sparse_moe.shared_experts.gate_proj", # kimi ), # Feed-forward down @@ -764,6 +762,7 @@ class TensorNameMap: "model.layers.layers.{bid}.mixer.dt_proj", # plamo2 "model.layers.{bid}.linear_attn.dt_proj", # qwen3next "backbone.layers.{bid}.mixer.dt", # nemotron-h-moe + "model.layers.{bid}.self_attn.dt_proj", # kimi ), MODEL_TENSOR.SSM_DT_NORM: ( @@ -777,6 +776,7 @@ class TensorNameMap: "model.layers.{bid}.mamba.A_log", # jamba falcon-h1 granite-hybrid "model.layers.layers.{bid}.mixer.A_log", # plamo2 "model.layers.{bid}.linear_attn.A_log", # qwen3next + "model.layers.{bid}.self_attn.A_log", # kimi ), MODEL_TENSOR.SSM_B_NORM: ( @@ -836,18 +836,12 @@ class TensorNameMap: MODEL_TENSOR.SSM_BETA: ( "model.layers.{bid}.self_attn.b_proj", ), - MODEL_TENSOR.SSM_A_LOG: ( - "model.layers.{bid}.self_attn.A_log", - ), MODEL_TENSOR.SSM_G_A: ( "model.layers.{bid}.self_attn.g_a_proj", ), MODEL_TENSOR.SSM_G_B: ( "model.layers.{bid}.self_attn.g_b_proj", ), - MODEL_TENSOR.SSM_DT_B: ( - "model.layers.{bid}.self_attn.dt_bias", - ), MODEL_TENSOR.TIME_MIX_W0: ( "model.layers.{bid}.attention.w0", # rwkv7 ), diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 654276542d..a8bf1c9b80 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -380,8 +380,6 @@ static const std::map LLM_TENSOR_NAMES = { { LLM_TENSOR_SSM_F_A, "blk.%d.ssm_f_a" }, { LLM_TENSOR_SSM_F_B, "blk.%d.ssm_f_b" }, { LLM_TENSOR_SSM_BETA, "blk.%d.ssm_beta" }, - { LLM_TENSOR_SSM_A_LOG, "blk.%d.ssm_a" }, - { LLM_TENSOR_SSM_DT_B, "blk.%d.ssm_dt" }, { LLM_TENSOR_SSM_G_A, "blk.%d.ssm_g_a" }, { LLM_TENSOR_SSM_G_B, "blk.%d.ssm_g_b" }, { LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" }, @@ -2336,10 +2334,10 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_SSM_F_A, LLM_TENSOR_SSM_F_B, LLM_TENSOR_SSM_BETA, - LLM_TENSOR_SSM_A_LOG, - LLM_TENSOR_SSM_DT_B, + LLM_TENSOR_SSM_A, LLM_TENSOR_SSM_G_A, LLM_TENSOR_SSM_G_B, + LLM_TENSOR_SSM_DT, LLM_TENSOR_SSM_NORM, // MLA LLM_TENSOR_ATTN_Q_A, @@ -2461,8 +2459,6 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_SSM_F_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_SSM_F_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_SSM_BETA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_SSM_A_LOG, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, - {LLM_TENSOR_SSM_DT_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, {LLM_TENSOR_SSM_G_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_SSM_G_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_TIME_MIX_LERP_X, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, diff --git a/src/llama-arch.h b/src/llama-arch.h index e5816acee1..f092f72834 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -408,8 +408,6 @@ enum llm_tensor { LLM_TENSOR_SSM_F_A, // kimi: forget gate projection A LLM_TENSOR_SSM_F_B, // kimi: forget gate projection B LLM_TENSOR_SSM_BETA, // kimi: beta mixing coefficient - LLM_TENSOR_SSM_A_LOG, // kimi: A_log (pre-converted in GGUF) - LLM_TENSOR_SSM_DT_B, // kimi: dt bias LLM_TENSOR_SSM_G_A, // kimi: output gate projection A LLM_TENSOR_SSM_G_B, // kimi: output gate projection B LLM_TENSOR_TIME_MIX_W0, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 9b796b3675..53f9f389e4 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2468,7 +2468,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, 
hparams.n_expert_shared, false); ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false); ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false); - ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false); + ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func); switch (hparams.n_layer) { case 27: type = LLM_TYPE_48B_A3B; break; // Kimi-Linear-48B-A3B @@ -6839,14 +6839,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // b_proj (beta mixing coefficient) layer.ssm_beta = create_tensor(tn(LLM_TENSOR_SSM_BETA, "weight", i), {n_embd, n_head}, 0); - // A_log - Shape in GGUF: [1, num_heads, 1, 1] (4D) or [1, num_heads] (2D after quantization) - layer.ssm_a_log = create_tensor(tn(LLM_TENSOR_SSM_A_LOG, i), {1, n_head, 1, 1}, TENSOR_NOT_REQUIRED); - if (!layer.ssm_a_log) { - layer.ssm_a_log = create_tensor(tn(LLM_TENSOR_SSM_A_LOG, i), {1, n_head}, 0); + // A_log - Shape in GGUF: [1, num_heads, 1, 1] (4D) or [1, num_heads] (2D after quantization) Note: -exp(A_log) is applied in convert_hf_to_gguf.py + layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head, 1, 1}, TENSOR_NOT_REQUIRED); + if (!layer.ssm_a) { + layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head}, 0); } // dt_bias - shape [n_embd_head_k_kda * n_head] = [4096] - layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT_B, i), {n_embd_head_k_kda * n_head}, 0); + layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_embd_head_k_kda * n_head}, 0); // g_a_proj, g_b_proj (output gate) layer.ssm_g_a = create_tensor(tn(LLM_TENSOR_SSM_G_A, "weight", i), {n_embd, n_embd_head_k_kda}, 0); @@ -6918,11 +6918,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp_actual}, TENSOR_NOT_REQUIRED); // exp_probs_b (e_score_correction_bias in vLLM) - // Try "bias" first (standard), then "weight" (for compatibility) - layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED); - if (!layer.ffn_exp_probs_b) { - layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "weight", i), {n_expert}, TENSOR_NOT_REQUIRED); - } + layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "weight", i), {n_expert}, 0); } } } break; diff --git a/src/llama-model.h b/src/llama-model.h index 40078dbdbd..a4900b093e 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -422,7 +422,6 @@ struct llama_layer { struct ggml_tensor * ssm_f_a = nullptr; struct ggml_tensor * ssm_f_b = nullptr; struct ggml_tensor * ssm_beta = nullptr; - struct ggml_tensor * ssm_a_log = nullptr; struct ggml_tensor * ssm_g_a = nullptr; struct ggml_tensor * ssm_g_b = nullptr; struct ggml_tensor * ssm_o_norm = nullptr; diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index 6db782641d..6013cd0b77 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -127,7 +127,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // Check layer type by checking which tensors exist // KDA layers have ssm_a_log tensor, MLA layers have wkv_a_mqa tensor - bool is_kda = (layer.ssm_a_log != nullptr); + bool is_kda = (layer.ssm_a != nullptr); bool is_mla = (layer.wkv_a_mqa != nullptr); if (is_kda) { @@ -152,12 +152,10 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll g1 = ggml_softplus(ctx0, g1); g1 = ggml_reshape_3d(ctx0, g1, 
head_dim, n_head, n_tokens); - // A_log shape is [1, n_head] or [1, n_head, 1, 1], need to broadcast to [head_dim, n_head, n_tokens] - // First compute -exp(A_log), then reshape for broadcasting - ggml_tensor * A_neg_exp = ggml_neg(ctx0, ggml_exp(ctx0, layer.ssm_a_log)); + // A_log shape is [1, n_head] or [1, n_head, 1, 1], need to broadcast to [head_dim, n_head, n_tokens]. No need to -exp(a_log) because it was done in convert_hf_to_gguf.py // Reshape to [1, n_head, 1] for broadcasting with g1 [head_dim, n_head, n_tokens] - A_neg_exp = ggml_reshape_3d(ctx0, A_neg_exp, 1, n_head, 1); - g1 = ggml_mul(ctx0, g1, A_neg_exp); + ggml_tensor * A = ggml_reshape_3d(ctx0, layer.ssm_a, 1, n_head, 1); + g1 = ggml_mul(ctx0, g1, A); cb(g1, "kda_g1", il); // Compute beta (mixing coefficient) From f3d118d061e0630b3126df5b1a855d29813177df Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Sat, 17 Jan 2026 07:43:30 +0800 Subject: [PATCH 47/58] reverted to original --- ggml/src/ggml-cpu/ops.cpp | 111 ++++++++++++++++++++++---------------- 1 file changed, 64 insertions(+), 47 deletions(-) diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 42db45ee14..387e2fe42c 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -7,10 +7,9 @@ #include "unary-ops.h" #include "vec.h" -#include #include +#include #include -#include // ggml_compute_forward_dup @@ -7110,12 +7109,13 @@ void ggml_compute_forward_conv_2d_dw( } } -// ggml_compute_forward_pool_1d_sk_p0 - -static void ggml_compute_forward_pool_1d_sk_p0( +// ggml_compute_forward_pool_1d_ksp +static void ggml_compute_forward_pool_1d_ksp( const ggml_compute_params * params, const ggml_op_pool op, const int k, + const int s, + const int p, ggml_tensor * dst) { const ggml_tensor * src = dst->src[0]; @@ -7126,39 +7126,56 @@ static void ggml_compute_forward_pool_1d_sk_p0( return; } - const char * cdata = (const char *)src->data; - const char * const data_end = cdata + ggml_nbytes(src); - float * drow = (float *)dst->data; + const int64_t IW = src->ne[0]; + const int64_t OW = dst->ne[0]; - const int64_t rs = dst->ne[0]; + const int64_t nr = ggml_nrows(src); - while (cdata < data_end) { - const void * srow = (const void *)cdata; - int j = 0; - for (int64_t i = 0; i < rs; ++i) { + for (int64_t ir = 0; ir < nr; ++ir) { + const char * srow_bytes = (const char *) src->data + ir * src->nb[1]; + float * drow = (float *) (( char *) dst->data + ir * dst->nb[1]); + + for (int64_t ow = 0; ow < OW; ++ow) { + float res = 0; switch (op) { - case GGML_OP_POOL_AVG: drow[i] = 0; break; - case GGML_OP_POOL_MAX: drow[i] = -FLT_MAX; break; + case GGML_OP_POOL_AVG: res = 0.0f; break; + case GGML_OP_POOL_MAX: res = -FLT_MAX; break; case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); } + + int count = 0; + const int base = (int) ow * s - p; + for (int ki = 0; ki < k; ++ki) { - const float srow_j = (src->type == GGML_TYPE_F32) ? 
((const float*)srow)[j] : GGML_CPU_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]); - switch (op) { - case GGML_OP_POOL_AVG: drow[i] += srow_j; break; - case GGML_OP_POOL_MAX: if (srow_j > drow[i]) drow[i] = srow_j; break; - case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); + const int j = base + ki; + if (j < 0 || j >= (int) IW) { + continue; } - ++j; + + float v; + if (src->type == GGML_TYPE_F32) { + v = ((const float *) srow_bytes)[j]; + } else { + v = GGML_CPU_FP16_TO_FP32(((const ggml_fp16_t *) srow_bytes)[j]); + } + + switch (op) { + case GGML_OP_POOL_AVG: res += v; break; + case GGML_OP_POOL_MAX: res = std::max(v, res); break; + case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); + } + + ++count; } + switch (op) { - case GGML_OP_POOL_AVG: drow[i] /= k; break; - case GGML_OP_POOL_MAX: break; + case GGML_OP_POOL_AVG: res = (count > 0) ? (res / count) : 0.0f; break; + case GGML_OP_POOL_MAX: break; case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); } - } - cdata += src->nb[1]; - drow += rs; + drow[ow] = res; + } } } @@ -7173,10 +7190,8 @@ void ggml_compute_forward_pool_1d( const int k0 = opts[1]; const int s0 = opts[2]; const int p0 = opts[3]; - GGML_ASSERT(p0 == 0); // padding not supported - GGML_ASSERT(k0 == s0); // only s = k supported - ggml_compute_forward_pool_1d_sk_p0(params, op, k0, dst); + ggml_compute_forward_pool_1d_ksp(params, op, k0, s0, p0, dst); } // ggml_compute_forward_pool_2d @@ -7194,6 +7209,7 @@ void ggml_compute_forward_pool_2d( } const int32_t * opts = (const int32_t *)dst->op_params; + ggml_op_pool op = static_cast(opts[0]); const int k0 = opts[1]; const int k1 = opts[2]; @@ -7217,11 +7233,13 @@ void ggml_compute_forward_pool_2d( while (cdata < data_end) { for (int oy = 0; oy < py; ++oy) { float * const drow = dplane + oy * px; + float * const out = drow; + for (int ox = 0; ox < px; ++ox) { - float * const out = drow + ox; + float res = 0; switch (op) { - case GGML_OP_POOL_AVG: *out = 0; break; - case GGML_OP_POOL_MAX: *out = -FLT_MAX; break; + case GGML_OP_POOL_AVG: res = 0; break; + case GGML_OP_POOL_MAX: res = -FLT_MAX; break; case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); } @@ -7229,24 +7247,32 @@ void ggml_compute_forward_pool_2d( const int iy = offset1 + oy * s1; for (int ky = 0; ky < k1; ++ky) { - if (iy + ky < 0 || iy + ky >= src->ne[1]) continue; + if (iy + ky < 0 || iy + ky >= src->ne[1]) { + continue; + } + const void * srow = (const void *)(cdata + src->nb[1] * (iy + ky)); for (int kx = 0; kx < k0; ++kx) { int j = ix + kx; - if (j < 0 || j >= src->ne[0]) continue; + if (j < 0 || j >= src->ne[0]) { + continue; + } + const float srow_j = (src->type == GGML_TYPE_F32) ? 
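A naive reference for the generalized 1-D pooling restored above (sketch only; pool_1d_ref is an illustrative name, and the output length here uses the standard pooling formula while the C code takes it from dst->ne[0]): each output position ow reduces the in-bounds elements of the window starting at ow*s - p, and the average divides by the count of in-bounds elements, matching the count-guarded loop.

    def pool_1d_ref(row, k, s, p, op="avg"):
        out = []
        for ow in range((len(row) + 2 * p - k) // s + 1):
            window = [row[j] for j in range(ow * s - p, ow * s - p + k) if 0 <= j < len(row)]
            if op == "max":
                out.append(max(window) if window else float("-inf"))
            else:
                out.append(sum(window) / len(window) if window else 0.0)
        return out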
((const float*)srow)[j] : GGML_CPU_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]); switch (op) { - case GGML_OP_POOL_AVG: *out += srow_j; break; - case GGML_OP_POOL_MAX: if (srow_j > *out) *out = srow_j; break; + case GGML_OP_POOL_AVG: res += srow_j; break; + case GGML_OP_POOL_MAX: res = std::max(srow_j, res); break; case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); } } } switch (op) { - case GGML_OP_POOL_AVG: *out /= ka; break; - case GGML_OP_POOL_MAX: break; + case GGML_OP_POOL_AVG: res /= ka; break; + case GGML_OP_POOL_MAX: break; case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); } + + out[ox] = res; } } @@ -8713,8 +8739,6 @@ static void ggml_compute_forward_ssm_conv_f32( const int ir1 = MIN(ir0 + dr, nr); const int ir = ir1 - ir0; - bool do_conv_debug = false; // (ith == 0 && conv_debug_count++ < 3); - for (int i3 = 0; i3 < n_s; ++i3) { for (int i2 = 0; i2 < n_t; ++i2) { // {d_conv - 1 + n_t, d_inner, n_seqs} @@ -8735,13 +8759,6 @@ static void ggml_compute_forward_ssm_conv_f32( sumf += s[i0 + i1*ncs] * c[i0 + i1*nc]; } x[i1] = sumf; - - // Debug output - if (do_conv_debug && i1 == 0 && i2 == 0 && i3 == 0) { - fprintf(stderr, "DEBUG SSM_CONV: nc=%d, nr=%d, n_t=%d, n_s=%d\n", nc, nr, n_t, n_s); - fprintf(stderr, "DEBUG SSM_CONV: s[0..3]=%f,%f,%f,%f, c[0..3]=%f,%f,%f,%f, x[0]=%f\n", - s[0], s[1], s[2], s[3], c[0], c[1], c[2], c[3], x[0]); - } } } } From 560190af9742d2dfb6158e1df2524ace0fd3e282 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Wed, 21 Jan 2026 22:12:21 +0800 Subject: [PATCH 48/58] fixed find_hparam calls. Fixed e_score_correction_bias to use bias instead of weight. Removed all ssm_conv bias terms. --- convert_hf_to_gguf.py | 37 +++++++++++----------------------- gguf-py/gguf/tensor_mapping.py | 1 + src/llama-model.cpp | 10 ++++----- src/llama-model.h | 3 --- src/models/kimi-linear.cpp | 11 ++++------ 5 files changed, 21 insertions(+), 41 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 2e80889215..ed650e1246 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5186,21 +5186,16 @@ class KimiLinearModel(TextModel): assert len(_num_kv_heads) == self.hparams["num_hidden_layers"] self.gguf_writer.add_head_count_kv(_num_kv_heads) - ssm_d_conv = self.hparams.get("ssm_d_conv") or linear_attn_config.get("short_conv_kernel_size") - if ssm_d_conv is not None: + if (ssm_d_conv := linear_attn_config.get("short_conv_kernel_size")) is not None: self.gguf_writer.add_ssm_conv_kernel(ssm_d_conv) - kda_head_dim = self.hparams.get("kda_head_dim") or linear_attn_config.get("head_dim") - if kda_head_dim is not None: + if (kda_head_dim := linear_attn_config.get("head_dim")) is not None: self.gguf_writer.add_kda_head_dim(kda_head_dim) # MLA params - use add_* methods that handle arch substitution # Support both HuggingFace naming (q_lora_rank, kv_lora_rank) and internal naming (n_lora_q, n_lora_kv) - q_lora_rank = self.hparams.get("q_lora_rank", self.hparams.get("n_lora_q")) - kv_lora_rank = self.hparams.get("kv_lora_rank", self.hparams.get("n_lora_kv")) - - if q_lora_rank is not None: + if (q_lora_rank := self.find_hparam(["q_lora_rank", "n_lora_q"], optional=False)) is not None: self.gguf_writer.add_q_lora_rank(q_lora_rank) - if kv_lora_rank is not None: + if (kv_lora_rank := self.find_hparam(["kv_lora_rank", "n_lora_kv"], optional=False)) is not None: self.gguf_writer.add_kv_lora_rank(kv_lora_rank) # MLA head dimensions @@ -5226,39 +5221,32 @@ class KimiLinearModel(TextModel): self.gguf_writer.add_value_length_mla(v_head_dim) # Rotation - use 
qk_rope_head_dim for Kimi - rope_dim = self.find_hparam(["qk_rope_head_dim", "n_rot"]) - if rope_dim is not None: + if (rope_dim := self.find_hparam(["qk_rope_head_dim", "n_rot"], optional=True)) is not None: self.gguf_writer.add_rope_dimension_count(rope_dim) else: # Default to head_dim head_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(head_dim) - n_experts = self.find_hparam(["num_experts"]) - if n_experts is not None: + if (n_experts := self.find_hparam(["num_experts"], optional=False)) is not None: self.gguf_writer.add_expert_count(n_experts) - n_experts_used = self.find_hparam(["num_experts_per_token"]) - if n_experts_used is not None: + if (n_experts_used := self.find_hparam(["num_experts_per_token"], optional=False)) is not None: self.gguf_writer.add_expert_used_count(n_experts_used) # moe_intermediate_size (1024 for Kimi) - moe_intermediate_size = self.find_hparam(["moe_intermediate_size"]) - if moe_intermediate_size is not None: + if (moe_intermediate_size := self.find_hparam(["moe_intermediate_size"], optional=False)) is not None: self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) # num_shared_experts (1 for Kimi) - num_shared_experts = self.find_hparam(["num_shared_experts"]) - if num_shared_experts is not None: + if (num_shared_experts := self.find_hparam(["num_shared_experts"], optional=False)) is not None: self.gguf_writer.add_expert_shared_count(num_shared_experts) # first_k_dense_replace (1 for Kimi - first layer uses dense MLP) - first_k_dense_replace = self.find_hparam(["first_k_dense_replace"]) - if first_k_dense_replace is not None: + if (first_k_dense_replace := self.find_hparam(["first_k_dense_replace"])) is not None: self.gguf_writer.add_leading_dense_block_count(first_k_dense_replace) # Routed scaling factor (expert_weights_scale = 2.446 for Kimi) - routed_scaling_factor = self.find_hparam(["routed_scaling_factor"]) - if routed_scaling_factor is not None: + if (routed_scaling_factor := self.find_hparam(["routed_scaling_factor"], optional=False)) is not None: self.gguf_writer.add_expert_weights_scale(routed_scaling_factor) def prepare_tensors(self): @@ -5292,8 +5280,7 @@ class KimiLinearModel(TextModel): # Kimi specific bias if name.endswith("e_score_correction_bias"): - new_name = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_EXP_PROBS_B, bid) - return [(new_name, data_torch)] + name = name.replace("e_score_correction_bias", "e_score_correction.bias") # Handle A_log: iHF stores as [1, 1, num_heads, 1] # llama.cpp expects ggml ne = [1, num_heads, 1, 1] diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index d96119ebe9..e16c06c2a3 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -438,6 +438,7 @@ class TensorNameMap: "model.layers.{bid}.block_sparse_moe.e_score_correction", # minimax-m2 "backbone.layers.{bid}.mixer.gate.e_score_correction", # nemotron-h-moe "model.layers.{bid}.mlp.e_score_correction", # exaone-moe + "model.layers.{bid}.block_sparse_moe.gate.e_score_correction", # kimi ), # Feed-forward up diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 7195346fd8..4ea23dca53 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -6825,11 +6825,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ssm_v_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_V, "weight", i), {ssm_d_conv, 1, n_embd_head_v_kda * n_head}, 0); } - // Conv bias may not exist in all models - make optional - 
layer.ssm_q_conv_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_Q, "bias", i), {n_embd_head_k_kda * n_head}, TENSOR_NOT_REQUIRED); - layer.ssm_k_conv_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_K, "bias", i), {n_embd_head_k_kda * n_head}, TENSOR_NOT_REQUIRED); - layer.ssm_v_conv_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_V, "bias", i), {n_embd_head_v_kda * n_head}, TENSOR_NOT_REQUIRED); - // q, k, v projections // Python: q_proj, k_proj, v_proj layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k_kda * n_head}, 0); @@ -6923,7 +6918,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp_actual}, TENSOR_NOT_REQUIRED); // exp_probs_b (e_score_correction_bias in vLLM) - layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "weight", i), {n_expert}, 0); + layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED); + if (!layer.ffn_exp_probs_b) { + layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "weight", i), {n_expert}, TENSOR_NOT_REQUIRED); + } } } } break; diff --git a/src/llama-model.h b/src/llama-model.h index 208766bacf..359701589c 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -415,11 +415,8 @@ struct llama_layer { // Kimi Linear KDA (using ssm_ prefix for consistency) // Note: ssm_dt_b already exists above (mamba bias), reused for Kimi dt_bias struct ggml_tensor * ssm_q_conv = nullptr; - struct ggml_tensor * ssm_q_conv_b = nullptr; struct ggml_tensor * ssm_k_conv = nullptr; - struct ggml_tensor * ssm_k_conv_b = nullptr; struct ggml_tensor * ssm_v_conv = nullptr; - struct ggml_tensor * ssm_v_conv_b = nullptr; struct ggml_tensor * ssm_f_a = nullptr; struct ggml_tensor * ssm_f_b = nullptr; struct ggml_tensor * ssm_beta = nullptr; diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index 6013cd0b77..721bef9e7f 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -5,7 +5,7 @@ // Causal Conv1d function for Q,K,V // When qkv is 0, it is Q, 1 is K, 2 is V -static ggml_tensor * causal_conv1d(ggml_cgraph * gf, ggml_context * ctx0, ggml_tensor * conv_states_all, ggml_tensor * conv_state_all, int64_t qkv, ggml_tensor * x, ggml_tensor * proj_w, ggml_tensor * conv_w, ggml_tensor * conv_b, int64_t d_conv, int64_t head_dim, int64_t n_head, int64_t n_seq_tokens, int64_t n_seqs, int64_t n_tokens, int64_t kv_head) { +static ggml_tensor * causal_conv1d(ggml_cgraph * gf, ggml_context * ctx0, ggml_tensor * conv_states_all, ggml_tensor * conv_state_all, int64_t qkv, ggml_tensor * x, ggml_tensor * proj_w, ggml_tensor * conv_w, int64_t d_conv, int64_t head_dim, int64_t n_head, int64_t n_seq_tokens, int64_t n_seqs, int64_t n_tokens, int64_t kv_head) { const int64_t d_inner = head_dim * n_head; const int64_t conv_state_size = (d_conv - 1) * d_inner; const int64_t n_embd_r_total = 3 * conv_state_size; // Q + K + V @@ -56,9 +56,6 @@ static ggml_tensor * causal_conv1d(ggml_cgraph * gf, ggml_context * ctx0, ggml_t ggml_tensor * Xcur = ggml_ssm_conv(ctx0, conv_x, conv_weight); // Reshape to 2D for bias add: {d_inner, n_tokens} Xcur = ggml_reshape_2d(ctx0, Xcur, d_inner, n_tokens); - if (conv_b) { - Xcur = ggml_add(ctx0, Xcur, conv_b); - } Xcur = ggml_silu(ctx0, Xcur); return ggml_reshape_4d(ctx0, Xcur, head_dim, n_head, n_seq_tokens, n_seqs); @@ -140,9 +137,9 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll 
ggml_tensor * conv_states_all = mctx_cur->get_r_l(il); cb(conv_states_all, "conv_states_all", il); ggml_tensor * conv_state_all = build_rs(inp_rs, conv_states_all, hparams.n_embd_r(), n_seqs); - ggml_tensor * Qcur = causal_conv1d(gf, ctx0, conv_states_all, conv_state_all, 0, cur, layer.wq, layer.ssm_q_conv, layer.ssm_q_conv_b, d_conv, head_dim, n_head, n_seq_tokens, n_seqs, n_tokens, kv_head); - ggml_tensor * Kcur = causal_conv1d(gf, ctx0, conv_states_all, conv_state_all, 1, cur, layer.wk, layer.ssm_k_conv, layer.ssm_k_conv_b, d_conv, head_dim, n_head, n_seq_tokens, n_seqs, n_tokens, kv_head); - ggml_tensor * Vcur = causal_conv1d(gf, ctx0, conv_states_all, conv_state_all, 2, cur, layer.wv, layer.ssm_v_conv, layer.ssm_v_conv_b, d_conv, head_dim, n_head, n_seq_tokens, n_seqs, n_tokens, kv_head); + ggml_tensor * Qcur = causal_conv1d(gf, ctx0, conv_states_all, conv_state_all, 0, cur, layer.wq, layer.ssm_q_conv, d_conv, head_dim, n_head, n_seq_tokens, n_seqs, n_tokens, kv_head); + ggml_tensor * Kcur = causal_conv1d(gf, ctx0, conv_states_all, conv_state_all, 1, cur, layer.wk, layer.ssm_k_conv, d_conv, head_dim, n_head, n_seq_tokens, n_seqs, n_tokens, kv_head); + ggml_tensor * Vcur = causal_conv1d(gf, ctx0, conv_states_all, conv_state_all, 2, cur, layer.wv, layer.ssm_v_conv, d_conv, head_dim, n_head, n_seq_tokens, n_seqs, n_tokens, kv_head); // g1 = -exp(A_log) * softplus(f_b(f_a(x)) + dt_bias) ggml_tensor * f_a = ggml_mul_mat(ctx0, layer.ssm_f_a, cur); From ae8d710c39886e12422fd564ecd0a278057cbdcd Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Thu, 22 Jan 2026 07:06:17 +0800 Subject: [PATCH 49/58] remove DT_B from constants.py. remove one comment line in llama-model.cpp --- gguf-py/gguf/constants.py | 1 - src/llama-model.cpp | 1 - 2 files changed, 2 deletions(-) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 73e7bae6e1..a51b3d87d1 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -559,7 +559,6 @@ class MODEL_TENSOR(IntEnum): SSM_F_A = auto() # Kimi Linear SSM_F_B = auto() # Kimi Linear SSM_BETA = auto() # Kimi Linear - SSM_DT_B = auto() # Kimi Linear SSM_G_A = auto() # Kimi Linear SSM_G_B = auto() # Kimi Linear TIME_MIX_W0 = auto() diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 4ea23dca53..edf7108fd7 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -6917,7 +6917,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp_actual, n_embd}, TENSOR_NOT_REQUIRED); layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp_actual}, TENSOR_NOT_REQUIRED); - // exp_probs_b (e_score_correction_bias in vLLM) layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED); if (!layer.ffn_exp_probs_b) { layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "weight", i), {n_expert}, TENSOR_NOT_REQUIRED); From f1525b36959e24776c57031f00fce0212cc3eff8 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Tue, 27 Jan 2026 11:25:13 +0800 Subject: [PATCH 50/58] new class llm_graph_input_mem_hybrid_k to get around the new MLA change. switch the concat order of ggml_concat calls in kimi-linear.cpp to accommodate MLA changes. 
Removed support for exp_probs_b.weight --- src/llama-graph.cpp | 52 ++++++++++++++++++++++++++++++++++++++ src/llama-graph.h | 29 +++++++++++++++++++++ src/llama-model.cpp | 30 ++++++++++------------ src/models/kimi-linear.cpp | 12 ++++----- 4 files changed, 100 insertions(+), 23 deletions(-) diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 06d0d4c558..1aebc012a1 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -533,6 +533,47 @@ bool llm_graph_input_mem_hybrid::can_reuse(const llm_graph_params & params) { return res; } +void llm_graph_input_mem_hybrid_k::set_input(const llama_ubatch * ubatch) { + mctx->get_attn()->set_input_k_idxs(inp_attn->self_k_idxs, ubatch); + + mctx->get_attn()->set_input_kq_mask(inp_attn->self_kq_mask, ubatch, cparams.causal_attn); + + const int64_t n_rs = mctx->get_recr()->get_n_rs(); + + if (inp_rs->s_copy) { + GGML_ASSERT(ggml_backend_buffer_is_host(inp_rs->s_copy->buffer)); + int32_t * data = (int32_t *) inp_rs->s_copy->data; + + // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n + for (uint32_t i = 0; i < n_rs; ++i) { + data[i] = mctx->get_recr()->s_copy(i); + } + } +} + +bool llm_graph_input_mem_hybrid_k::can_reuse(const llm_graph_params & params) { + const auto * mctx = static_cast(params.mctx); + + this->mctx = mctx; + + bool res = true; + + res &= inp_attn->self_k_idxs->ne[0] == params.ubatch.n_tokens; + + res &= inp_attn->self_kq_mask->ne[0] == mctx->get_attn()->get_n_kv(); + res &= inp_attn->self_kq_mask->ne[1] == params.ubatch.n_tokens; + + res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs(); + + res &= inp_rs->s_copy_main->ne[0] == params.ubatch.n_seqs; + res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs; + + res &= inp_rs->head == mctx->get_recr()->get_head(); + res &= inp_rs->rs_z == mctx->get_recr()->get_rs_z(); + + return res; +} + void llm_graph_input_mem_hybrid_iswa::set_input(const llama_ubatch * ubatch) { const auto * attn_ctx = mctx->get_attn(); @@ -2272,6 +2313,17 @@ llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const { return (llm_graph_input_mem_hybrid *) res->add_input(std::move(inp)); } +llm_graph_input_mem_hybrid_k * llm_graph_context::build_inp_mem_hybrid_k() const { + const auto * mctx_cur = static_cast(mctx); + + auto inp_rs = build_rs_inp_impl (ctx0, ubatch, mctx_cur->get_recr()); + auto inp_attn = build_attn_inp_k_impl(ctx0, ubatch, hparams, cparams, mctx_cur->get_attn()); + + auto inp = std::make_unique(cparams, std::move(inp_attn), std::move(inp_rs), mctx_cur); + + return (llm_graph_input_mem_hybrid_k *) res->add_input(std::move(inp)); +} + llm_graph_input_mem_hybrid_iswa * llm_graph_context::build_inp_mem_hybrid_iswa() const { const auto * mctx_cur = static_cast(mctx); diff --git a/src/llama-graph.h b/src/llama-graph.h index 4090d8116c..1d69ff1a6f 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -433,6 +433,34 @@ public: const llama_memory_hybrid_context * mctx; }; +class llm_graph_input_mem_hybrid_k : public llm_graph_input_i { +public: + llm_graph_input_mem_hybrid_k( + const llama_cparams & cparams, + std::unique_ptr inp_attn, + std::unique_ptr inp_rs, + const llama_memory_hybrid_context * mctx) : + inp_attn(std::move(inp_attn)), + inp_rs(std::move(inp_rs)), + cparams(cparams), + mctx(mctx) { } + virtual ~llm_graph_input_mem_hybrid_k() = default; + + void set_input(const llama_ubatch * ubatch) override; + + bool can_reuse(const llm_graph_params & params) override; + + std::unique_ptr 
inp_attn; + std::unique_ptr inp_rs; + + llm_graph_input_attn_k * get_attn() const { return inp_attn.get(); } + llm_graph_input_rs * get_recr() const { return inp_rs.get(); } + + const llama_cparams cparams; + + const llama_memory_hybrid_context * mctx; +}; + class llm_graph_input_mem_hybrid_iswa : public llm_graph_input_i { public: llm_graph_input_mem_hybrid_iswa( @@ -960,6 +988,7 @@ struct llm_graph_context { // llm_graph_input_mem_hybrid * build_inp_mem_hybrid() const; + llm_graph_input_mem_hybrid_k * build_inp_mem_hybrid_k() const; llm_graph_input_mem_hybrid_iswa * build_inp_mem_hybrid_iswa() const; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 67b0314de9..84ac4d3a9e 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2454,12 +2454,12 @@ void llama_model::load_hparams(llama_model_loader & ml) { case LLM_ARCH_KIMI_LINEAR: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla, false); - ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false); - ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv, false); - ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false); - ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv, false); - ml.get_key(LLM_KV_KDA_HEAD_DIM, hparams.kda_head_dim, false); + ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla_impl); + ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla_impl); + ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv); + ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot); + ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv); + ml.get_key(LLM_KV_KDA_HEAD_DIM, hparams.kda_head_dim); // MLA qk_rope_head_dim (for reference) // qk_rope_head_dim = 64, qk_nope_head_dim = 128, qk_head_dim = 192 @@ -2471,11 +2471,10 @@ void llama_model::load_hparams(llama_model_loader & ml) { } // MoE parameters - Kimi uses moe_intermediate_size = 1024 - ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); - ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false); - ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared, false); - ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false); - ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false); + ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); + ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared); + ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead); + ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale); ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func); switch (hparams.n_layer) { @@ -6863,8 +6862,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // MLA Layer - use MLA-specific head dimensions const int64_t q_lora_rank = hparams.n_lora_q; const int64_t kv_lora_rank = hparams.n_lora_kv; - const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla; - const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla; + const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla(); + const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla(); layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, TENSOR_NOT_REQUIRED); layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0); @@ -6917,10 +6916,7 @@ bool llama_model::load_tensors(llama_model_loader & 
ml) { layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp_actual, n_embd}, TENSOR_NOT_REQUIRED); layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp_actual}, TENSOR_NOT_REQUIRED); - layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED); - if (!layer.ffn_exp_probs_b) { - layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "weight", i), {n_expert}, TENSOR_NOT_REQUIRED); - } + layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0); } } } break; diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index 721bef9e7f..3ea404dd0b 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -72,7 +72,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // Note: Kimi MLA does NOT use RoPE (rotary_emb=None in vLLM) // So we don't need inp_pos - auto * inp = build_inp_mem_hybrid(); + auto * inp = build_inp_mem_hybrid_k(); auto * inp_rs = inp->get_recr(); auto * inp_attn = inp->get_attn(); @@ -104,8 +104,8 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); // MLA params - const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla; - const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla; + const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla(); + const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla(); const int64_t kv_lora_rank = hparams.n_lora_kv; // qk_rope_head_dim = 64 (from Kimi config) which is hparams.n_rot // Confirmed from tensor shape: wkv_a_mqa [2304, 576] = [n_embd, kv_lora_rank + qk_rope_head_dim] @@ -258,14 +258,14 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // {n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens} // note: rope must go first for in-place context shifting in build_rope_shift() - Qcur = ggml_concat(ctx0, q_pe, q_nope_absorbed, 0); + Qcur = ggml_concat(ctx0, q_nope_absorbed, q_pe, 0); cb(Qcur, "Qcur", il); kv_cmpr = ggml_reshape_3d(ctx0, kv_cmpr, kv_lora_rank, 1, n_tokens); cb(kv_cmpr, "kv_cmpr_reshape", il); // {n_embd_head_qk_rope + kv_lora_rank, 1, n_tokens} - ggml_tensor * Kcur = ggml_concat(ctx0, k_pe, kv_cmpr, 0); + ggml_tensor * Kcur = ggml_concat(ctx0, kv_cmpr, k_pe, 0); cb(Kcur, "Kcur", il); // {kv_lora_rank, 1, n_tokens} @@ -299,7 +299,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // Need to broadcast k_pe from [qk_rope, 1, n_tokens] to [qk_rope, n_head, n_tokens] ggml_tensor * k_pe_target = ggml_new_tensor_3d(ctx0, k_pe->type, n_embd_head_qk_rope, n_head, n_tokens); ggml_tensor * k_pe_repeated = ggml_repeat(ctx0, k_pe, k_pe_target); - ggml_tensor * Kcur = ggml_concat(ctx0, k_nope, k_pe_repeated, 0); + ggml_tensor * Kcur = ggml_concat(ctx0, k_pe_repeated, k_nope, 0); cb(Kcur, "mla_K", il); // Direct softmax attention (with MHA KV cache) From 0de4680bdfdb3603b78b23d69b789d0a14547155 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Tue, 27 Jan 2026 13:19:06 +0800 Subject: [PATCH 51/58] remove ssm_o_norm_b --- src/llama-model.cpp | 1 - src/llama-model.h | 1 - 2 files changed, 2 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 84ac4d3a9e..50900feb2c 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -6853,7 +6853,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // o_norm 
(reusing SSM_NORM) layer.ssm_o_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {n_embd_head_k_kda}, 0); // FusedRMSNormGated - layer.ssm_o_norm_b = create_tensor(tn(LLM_TENSOR_SSM_NORM, "bias", i), {n_embd_head_k_kda}, TENSOR_NOT_REQUIRED); // o_proj layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_v_kda * n_head, n_embd}, 0); diff --git a/src/llama-model.h b/src/llama-model.h index 359701589c..5b408bcea2 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -423,7 +423,6 @@ struct llama_layer { struct ggml_tensor * ssm_g_a = nullptr; struct ggml_tensor * ssm_g_b = nullptr; struct ggml_tensor * ssm_o_norm = nullptr; - struct ggml_tensor * ssm_o_norm_b = nullptr; struct llama_layer_posnet posnet; From 0444a4faa0660636a3246982b951104bfdb1df1e Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Tue, 27 Jan 2026 13:19:55 +0800 Subject: [PATCH 52/58] remove ssm_o_norm_b --- src/models/kimi-linear.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index 3ea404dd0b..40007a6fa3 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -197,7 +197,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // Note: Kimi model uses sigmoid gating, not SiLU (despite FusedRMSNormGated default being swish) // Formula: output = RMSNorm(x) * sigmoid(g) ggml_tensor * attn_out_final = ggml_reshape_3d(ctx0, output, head_dim, n_head, n_seq_tokens * n_seqs); - ggml_tensor * normed = build_norm(attn_out_final, layer.ssm_o_norm, layer.ssm_o_norm_b, LLM_NORM_RMS, il); + ggml_tensor * normed = build_norm(attn_out_final, layer.ssm_o_norm, nullptr, LLM_NORM_RMS, il); cb(normed, "kda_normed", il); ggml_tensor * gate = ggml_sigmoid(ctx0, g2); ggml_tensor * gated = ggml_mul(ctx0, normed, gate); From a6b2c450c8ea9bdf9995a3e5443512dd5d3c96c4 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Thu, 29 Jan 2026 08:35:35 +0800 Subject: [PATCH 53/58] changed hparams.kda_head_dim to hparams.n_embd_head_kda. added TODO comment for class llama_graph_mem_hybrid_k --- src/llama-graph.cpp | 3 +++ src/llama-hparams.cpp | 8 ++++---- src/llama-hparams.h | 2 +- src/llama-model.cpp | 6 +++--- src/models/kimi-linear.cpp | 2 +- 5 files changed, 12 insertions(+), 9 deletions(-) diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 1aebc012a1..ac143bf031 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -533,6 +533,9 @@ bool llm_graph_input_mem_hybrid::can_reuse(const llm_graph_params & params) { return res; } +// TODO: Hybrid input classes are a bit redundant. +// Instead of creating a hybrid input, the graph can simply create 2 separate inputs. +// Refactoring is required in the future. void llm_graph_input_mem_hybrid_k::set_input(const llama_ubatch * ubatch) { mctx->get_attn()->set_input_k_idxs(inp_attn->self_k_idxs, ubatch); diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index 873c65cea8..756dda1a7a 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -139,10 +139,10 @@ uint32_t llama_hparams::n_embd_r() const { return n_embd * (n_shortconv_l_cache - 1); } - if (kda_head_dim != 0) { + if (n_embd_head_kda != 0) { // for Kimi KDA layers // Conv state for Q, K, V: 3 * (d_conv - 1) * n_head * head_dim - const uint32_t d_inner = n_head() * kda_head_dim; // 32 * 128 = 4096 + const uint32_t d_inner = n_head() * n_embd_head_kda; // 32 * 128 = 4096 return 3 * (ssm_d_conv > 0 ? 
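// Worked size check (editorial sketch; assumes the config implied by the comments here:
// n_head = 32, n_embd_head_kda = 128, and a short-conv kernel of 4):
//   d_inner            = 32 * 128           = 4096
//   n_embd_r (per seq) = 3 * (4 - 1) * 4096 = 36864   conv-state elements (Q, K and V)
//   n_embd_s (per seq) = 128 * 128 * 32     = 524288  recurrent delta-rule state elements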
ssm_d_conv - 1 : 3) * d_inner; } @@ -158,11 +158,11 @@ uint32_t llama_hparams::n_embd_s() const { return n_embd * wkv_head_size; } - if (kda_head_dim != 0) { + if (n_embd_head_kda != 0) { // for Kimi KDA layers // Full recurrent state: head_dim * head_dim * n_head // h tensor shape for delta attention: [head_dim, head_dim, n_head] - return kda_head_dim * kda_head_dim * n_head(); // 128 * 128 * 32 = 524288 + return n_embd_head_kda * n_embd_head_kda * n_head(); // 128 * 128 * 32 = 524288 } // corresponds to Mamba's ssm_states size diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 943161747c..a736ccc3d0 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -138,7 +138,7 @@ struct llama_hparams { uint32_t ssm_n_group = 0; // for Kimi Linear KDA - uint32_t kda_head_dim = 0; + uint32_t n_embd_head_kda = 0; // for hybrid state space models std::array recurrent_layer_arr; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 50900feb2c..40f3ff6e49 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2459,7 +2459,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv); ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot); ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv); - ml.get_key(LLM_KV_KDA_HEAD_DIM, hparams.kda_head_dim); + ml.get_key(LLM_KV_KDA_HEAD_DIM, hparams.n_embd_head_kda); // MLA qk_rope_head_dim (for reference) // qk_rope_head_dim = 64, qk_nope_head_dim = 128, qk_head_dim = 192 @@ -6801,8 +6801,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // Assuming KDA layer if KDA tensors are present // KDA uses head_dim = 128 (from linear_attn_config.head_dim) - const int64_t n_embd_head_k_kda = hparams.kda_head_dim; - const int64_t n_embd_head_v_kda = hparams.kda_head_dim; + const int64_t n_embd_head_k_kda = hparams.n_embd_head_kda; + const int64_t n_embd_head_v_kda = hparams.n_embd_head_kda; const int64_t ssm_d_conv = hparams.ssm_d_conv; // Try loading KDA specific tensors (using SSM_ prefix) diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index 40007a6fa3..5f497722d0 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -92,7 +92,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // Kimi dimension constants const int64_t n_head = hparams.n_head(); - const int64_t head_dim = hparams.kda_head_dim; + const int64_t head_dim = hparams.n_embd_head_kda; const int64_t d_conv = hparams.ssm_d_conv; const int64_t d_inner = n_head * head_dim; // 32 * 128 = 4096 const int64_t n_seqs = ubatch.n_seqs; From 6216273edefbdd393ad8f986d4d86613e9886c7f Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Thu, 29 Jan 2026 08:46:33 +0800 Subject: [PATCH 54/58] removed all ggml_cont b4 ggml_reshape_4d --- src/models/kimi-linear.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index 5f497722d0..a7e5482008 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -540,9 +540,9 @@ std::pair llm_build_kimi_linear::build_kda_chunkin // decay_mask [S_k,BT_j,BT_i,CHB] *Note* second and third chunk_sizes are switched decay_mask = ggml_cont_4d(ctx0, ggml_permute(ctx0, decay_mask, 2, 1, 0, 3), S_k, chunk_size, chunk_size, CHB); - ggml_tensor * k_i = ggml_cont(ctx0, ggml_reshape_4d(ctx0, k, S_k, chunk_size, 1, CHB)); - ggml_tensor * k_j = ggml_cont(ctx0, ggml_reshape_4d(ctx0, k, S_k, 1, chunk_size, CHB)); - ggml_tensor * q_i = 
ggml_cont(ctx0, ggml_reshape_4d(ctx0, q, S_k, chunk_size, 1, CHB)); + ggml_tensor * k_i = ggml_reshape_4d(ctx0, k, S_k, chunk_size, 1, CHB); + ggml_tensor * k_j = ggml_reshape_4d(ctx0, k, S_k, 1, chunk_size, CHB); + ggml_tensor * q_i = ggml_reshape_4d(ctx0, q, S_k, chunk_size, 1, CHB); ggml_tensor * decay_k_i = ggml_mul(ctx0, decay_mask, k_i); ggml_tensor * decay_q_i = ggml_mul(ctx0, decay_mask, q_i); From 005c34067ae7437e9a8e536281afb42206c9795f Mon Sep 17 00:00:00 2001 From: "Piotr Wilkin (ilintar)" Date: Fri, 30 Jan 2026 13:38:45 +0100 Subject: [PATCH 55/58] Whitespace --- src/llama-graph.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index ac143bf031..bd16ffd113 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -533,7 +533,7 @@ bool llm_graph_input_mem_hybrid::can_reuse(const llm_graph_params & params) { return res; } -// TODO: Hybrid input classes are a bit redundant. +// TODO: Hybrid input classes are a bit redundant. // Instead of creating a hybrid input, the graph can simply create 2 separate inputs. // Refactoring is required in the future. void llm_graph_input_mem_hybrid_k::set_input(const llama_ubatch * ubatch) { From aaf05bddc7522cdb08eae655117cb78ebf63d35a Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Sat, 31 Jan 2026 08:46:19 +0800 Subject: [PATCH 56/58] replaced all hparams.get with find_hparams --- convert_hf_to_gguf.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index a0681d4889..a1b4401198 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5096,7 +5096,7 @@ class KimiLinearModel(TextModel): # KDA & MLA params # Get ssm_d_conv from linear_attn_config.short_conv_kernel_size or ssm_d_conv - linear_attn_config = self.hparams.get("linear_attn_config", {}) + linear_attn_config = self.find_hparam(["linear_attn_config"], optional=False) # n_head == 0 for KDA layers, n_head > 0 for MLA layers # full_attention_layers list will be used to distingush layer type _num_kv_heads = list() @@ -5123,23 +5123,24 @@ class KimiLinearModel(TextModel): # MLA head dimensions # Support HuggingFace naming: qk_nope_head_dim, qk_rope_head_dim, v_head_dim - qk_nope_head_dim = self.hparams.get("qk_nope_head_dim") - qk_rope_head_dim = self.hparams.get("qk_rope_head_dim") - v_head_dim = self.hparams.get("v_head_dim") + qk_nope_head_dim = self.find_hparam(["qk_nope_head_dim"], optional=False) + qk_rope_head_dim = self.find_hparam(["qk_rope_head_dim"], optional=False) + v_head_dim = self.find_hparam(["v_head_dim"], optional=False) + kv_lora_rank = self.find_hparam(["kv_lora_rank"], optional=False) # To enable MLA KV cache, MLA needs to be converted into MQA with larger heads, then decompresses to MHA - self.gguf_writer.add_key_length(self.hparams["kv_lora_rank"] + self.hparams["qk_rope_head_dim"]) - self.gguf_writer.add_value_length(self.hparams["kv_lora_rank"]) + self.gguf_writer.add_key_length(kv_lora_rank + qk_rope_head_dim) + self.gguf_writer.add_value_length(kv_lora_rank) # Calculate n_embd_head_k_mla = qk_nope_head_dim + qk_rope_head_dim - if "n_embd_head_k_mla" in self.hparams: - self.gguf_writer.add_key_length_mla(self.hparams["n_embd_head_k_mla"]) + if (n_embd_head_k_mla := self.find_hparam(["n_embd_head_k_mla"], optional=True)) is not None: + self.gguf_writer.add_key_length_mla(n_embd_head_k_mla) elif qk_nope_head_dim is not None and qk_rope_head_dim is not None: n_embd_head_k_mla = qk_nope_head_dim + 
qk_rope_head_dim self.gguf_writer.add_key_length_mla(n_embd_head_k_mla) # n_embd_head_v_mla = v_head_dim - if "n_embd_head_v_mla" in self.hparams: - self.gguf_writer.add_value_length_mla(self.hparams["n_embd_head_v_mla"]) + if (n_embd_head_v_mla := self.find_hparam(["n_embd_head_v_mla"], optional=True)) is not None: + self.gguf_writer.add_value_length_mla(n_embd_head_v_mla) elif v_head_dim is not None: self.gguf_writer.add_value_length_mla(v_head_dim) @@ -5216,7 +5217,7 @@ class KimiLinearModel(TextModel): # process the experts separately if name.find("block_sparse_moe.experts") != -1: - n_experts = self.hparams.get("num_local_experts", self.hparams.get("num_experts")) + n_experts = self.find_hparam(["num_experts"], optional=False) assert bid is not None if self._experts is None: From 2c8cd844d0c4d8a1a64403dab4f0017acd23ba06 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Sun, 1 Feb 2026 08:42:01 +0800 Subject: [PATCH 57/58] added new names for n_experts, n_experts_used and score_func in TextModel and removed their code in KimiLinear in convert_hf_to_gguf.py. Removed unnecessary ggml_cont and GGML_ASSERT in kimi-linear.cpp --- convert_hf_to_gguf.py | 34 +++++++++------------------------- src/models/kimi-linear.cpp | 13 +------------ 2 files changed, 10 insertions(+), 37 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index a1b4401198..08e4a12e45 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -907,10 +907,10 @@ class TextModel(ModelBase): if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None: self.gguf_writer.add_layer_norm_eps(f_norm_eps) logger.info(f"gguf: layer norm epsilon = {f_norm_eps}") - if (n_experts := self.hparams.get("num_local_experts")) is not None: + if (n_experts := self.find_hparam(["num_local_experts", "num_experts"], optional=True)) is not None: self.gguf_writer.add_expert_count(n_experts) logger.info(f"gguf: expert count = {n_experts}") - if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None: + if (n_experts_used := self.find_hparam(["num_experts_per_tok", "num_experts_per_token"], optional=True)) is not None: self.gguf_writer.add_expert_used_count(n_experts_used) logger.info(f"gguf: experts used count = {n_experts_used}") if (n_expert_groups := self.hparams.get("n_group")) is not None: @@ -920,7 +920,7 @@ class TextModel(ModelBase): self.gguf_writer.add_expert_group_used_count(n_group_used) logger.info(f"gguf: expert groups used count = {n_group_used}") - if (score_func := self.find_hparam(["score_function", "scoring_func", "score_func"], optional=True)) is not None: + if (score_func := self.find_hparam(["score_function", "scoring_func", "score_func", "moe_router_activation_func"], optional=True)) is not None: if score_func == "sigmoid": self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) elif score_func == "softmax": @@ -5086,14 +5086,6 @@ class KimiLinearModel(TextModel): super().set_gguf_parameters() self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) - if (score_func := self.find_hparam(["moe_router_activation_func"], optional=True)) is not None: - if score_func == "sigmoid": - self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) - elif score_func == "softmax": - self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX) - else: - raise ValueError(f"Unsupported expert score gating function value: {score_func}") - # KDA & MLA params # Get ssm_d_conv from 
linear_attn_config.short_conv_kernel_size or ssm_d_conv linear_attn_config = self.find_hparam(["linear_attn_config"], optional=False) @@ -5152,11 +5144,6 @@ class KimiLinearModel(TextModel): head_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(head_dim) - if (n_experts := self.find_hparam(["num_experts"], optional=False)) is not None: - self.gguf_writer.add_expert_count(n_experts) - if (n_experts_used := self.find_hparam(["num_experts_per_token"], optional=False)) is not None: - self.gguf_writer.add_expert_used_count(n_experts_used) - # moe_intermediate_size (1024 for Kimi) if (moe_intermediate_size := self.find_hparam(["moe_intermediate_size"], optional=False)) is not None: self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) @@ -5227,7 +5214,6 @@ class KimiLinearModel(TextModel): if len(self._experts[bid]) >= n_experts * 3: # merge the experts into a single 3d tensor - tensors = [] # w1: gate, w2: down, w3: up for wid, tname in [("w1", gguf.MODEL_TENSOR.FFN_GATE_EXP), ("w2", gguf.MODEL_TENSOR.FFN_DOWN_EXP), @@ -5237,12 +5223,10 @@ class KimiLinearModel(TextModel): ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight" datas.append(self._experts[bid][ename]) del self._experts[bid][ename] - data_torch = torch.stack(datas, dim=0) new_name = self.format_tensor_name(tname, bid) - tensors.append((new_name, data_torch)) - return tensors - return [] + yield from super().modify_tensors(data_torch, new_name, bid) + return # note: MLA with the absorption optimization, needs these two split and k_b_proj transposed if name.endswith("kv_b_proj.weight"): @@ -5256,11 +5240,11 @@ class KimiLinearModel(TextModel): kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1]) k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1) k_b = k_b.transpose(1, 2) - return [(self.map_tensor_name(name_kb), k_b), (self.map_tensor_name(name_vb), v_b)] + yield from super().modify_tensors(k_b, name_kb, bid) + yield from super().modify_tensors(v_b, name_vb, bid) + return - mapped_name = self.map_tensor_name(name) - logger.info(f"Returning {mapped_name}: shape after = {tuple(data_torch.shape)}") - return [(mapped_name, data_torch)] + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("InternLM2ForCausalLM") diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index a7e5482008..83349cc9ec 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -34,7 +34,7 @@ static ggml_tensor * causal_conv1d(ggml_cgraph * gf, ggml_context * ctx0, ggml_t ggml_tensor * x_3d = ggml_reshape_3d(ctx0, x_proj, d_inner, n_seq_tokens, n_seqs); // Concat Q conv state and current input: {d_conv-1 + n_seq_tokens, d_inner, n_seqs} - ggml_tensor * conv_x = ggml_cont(ctx0, ggml_concat(ctx0, conv_state_x, ggml_transpose(ctx0, x_3d), 0)); + ggml_tensor * conv_x = ggml_concat(ctx0, conv_state_x, ggml_transpose(ctx0, x_3d), 0); // Save last (d_conv-1) columns back to Q conv state ggml_tensor * last_conv_x = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner, n_seqs, @@ -289,8 +289,6 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll ggml_row_size(kv->type, kv_per_head), ggml_row_size(kv->type, kv_per_head * n_head), ggml_row_size(kv->type, n_embd_head_qk_nope)); - k_nope = ggml_cont(ctx0, k_nope); - Vcur = ggml_cont(ctx0, Vcur); cb(Vcur, "mla_V", il); // Concatenate k_nope + k_pe (broadcast k_pe to all heads) @@ -403,11 +401,6 
@@ std::pair llm_build_kimi_linear::build_kda_chunkin ggml_tensor * identity, ggml_tensor * diag_mask, int il) { - GGML_ASSERT(ggml_is_contiguous(q)); - GGML_ASSERT(ggml_is_contiguous(k)); - GGML_ASSERT(ggml_is_contiguous(v)); - GGML_ASSERT(ggml_is_contiguous(gk)); - GGML_ASSERT(ggml_is_contiguous(beta)); GGML_ASSERT(ggml_is_contiguous(state)); const int64_t S_k = q->ne[0]; @@ -694,12 +687,8 @@ std::pair llm_build_kimi_linear::build_kda_autoreg ggml_tensor * beta, ggml_tensor * state, int il) { - GGML_ASSERT(ggml_is_contiguous(q)); - GGML_ASSERT(ggml_is_contiguous(k)); GGML_ASSERT(ggml_is_contiguous(v)); GGML_ASSERT(ggml_is_contiguous(gk)); - GGML_ASSERT(ggml_is_contiguous(beta)); - GGML_ASSERT(ggml_is_contiguous(state)); const int64_t S_k = q->ne[0]; const int64_t H_k = q->ne[1]; From 11282a0f6069e0e986d3faa84de285ccf5d73ce9 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Sun, 1 Feb 2026 20:12:20 +0800 Subject: [PATCH 58/58] use is_mla to switch between different mem_hybrid types --- src/models/kimi-linear.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index 83349cc9ec..1e533fa51b 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -72,9 +72,11 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // Note: Kimi MLA does NOT use RoPE (rotary_emb=None in vLLM) // So we don't need inp_pos - auto * inp = build_inp_mem_hybrid_k(); - auto * inp_rs = inp->get_recr(); - auto * inp_attn = inp->get_attn(); + auto * inp_kv = !hparams.is_mla() ? build_inp_mem_hybrid() : nullptr; + auto * inp_k = hparams.is_mla() ? build_inp_mem_hybrid_k() : nullptr; + auto * inp_rs = hparams.is_mla() ? inp_k->get_recr() : inp_kv->get_recr(); + auto * inp_attn_kv = !hparams.is_mla() ? inp_kv->get_attn() : nullptr; + auto * inp_attn_k = hparams.is_mla() ? inp_k->get_attn() : nullptr; // Output ids for selecting which tokens to output ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -272,7 +274,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll ggml_tensor * Vcur = kv_cmpr; cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, layer.wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, layer.wv_b, kq_scale_mla, il); + cur = build_attn(inp_attn_k, layer.wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, layer.wv_b, kq_scale_mla, il); cb(cur, "mla_out", il); } else { // MLA KV cache disabled. Fall back to MHA KV cache. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head_k_mla, n_head, n_tokens); @@ -302,7 +304,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // Direct softmax attention (with MHA KV cache) // Use build_attn with inp_attn for proper mask handling - cur = build_attn(inp_attn, layer.wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale_mla, il); + cur = build_attn(inp_attn_kv, layer.wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale_mla, il); cb(cur, "mla_out", il); } } else {