From 27baad43d54e7fd875774aded764767cb109baa8 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Tue, 2 Dec 2025 08:35:14 +0800 Subject: [PATCH 01/58] kimi linear model implementation --- src/models/kimi-linear.cpp | 429 +++++++++++++++++++++++++++++++++++++ 1 file changed, 429 insertions(+) create mode 100644 src/models/kimi-linear.cpp diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp new file mode 100644 index 0000000000..660cd06f0e --- /dev/null +++ b/src/models/kimi-linear.cpp @@ -0,0 +1,429 @@ +#include "models.h" + +llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params), model(model) { + ggml_tensor * cur; + ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // Note: Kimi MLA does NOT use RoPE (rotary_emb=None in vLLM) + // So we don't need inp_pos + + // Only use recurrent state input for KDA layers + // MLA layers use direct softmax attention without KV cache + auto * inp_rs = build_rs_inp(); + + // Input for MLA layers (no KV cache) + auto * inp_no_cache = build_attn_inp_no_cache(); + + // Output ids for selecting which tokens to output + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + // Kimi dimension constants + const int64_t n_head = hparams.n_head(); + const int64_t head_dim = hparams.kda_head_dim > 0 ? hparams.kda_head_dim : 128; + const int64_t d_conv = hparams.kda_d_conv > 0 ? hparams.kda_d_conv : 4; + const int64_t d_inner = n_head * head_dim; // 32 * 128 = 4096 + const int64_t n_seqs = ubatch.n_seqs; + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + + // Verify batch consistency for recurrent layers + GGML_ASSERT(n_seqs != 0); + GGML_ASSERT(ubatch.equal_seqs()); + GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); + + // MLA params + const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla > 0 ? hparams.n_embd_head_k_mla : 192; + const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla > 0 ? hparams.n_embd_head_v_mla : 128; + const int64_t kv_lora_rank = hparams.n_lora_kv > 0 ? 
hparams.n_lora_kv : 512; + // qk_rope_head_dim = 64 (from Kimi config), NOT hparams.n_rot (which is 72) + // Confirmed from tensor shape: wkv_a_mqa [2304, 576] = [n_embd, kv_lora_rank + qk_rope_head_dim] + const int64_t n_embd_head_qk_rope = 64; // config.qk_rope_head_dim + const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope; // 192 - 64 = 128 + + // Attention scale for KDA (1/sqrt(head_dim)) + const float kq_scale_kda = 1.0f / sqrtf((float)head_dim); + + // Attention scale for MLA + const float kq_scale_mla = 1.0f / sqrtf((float)n_embd_head_k_mla); + + for (int il = 0; il < n_layer; ++il) { + const auto & layer = model.layers[il]; + ggml_tensor * inpSA = inpL; + + // Attention Norm + cur = build_norm(inpL, layer.attn_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // Check layer type by checking which tensors exist + // KDA layers have ssm_a_log tensor, MLA layers have wkv_a_mqa tensor + bool is_kda = (layer.ssm_a_log != nullptr); + bool is_mla = (layer.wkv_a_mqa != nullptr); + + if (is_kda) { + // === KDA Layer (Kimi Delta Attention) with Recurrent State === + // Reference: vLLM kda.py + + const auto * mctx_cur = inp_rs->mctx; + const auto kv_head = mctx_cur->get_head(); + + // Get conv states from r_l tensor (Q, K, V each have separate state) + ggml_tensor * conv_states_all = mctx_cur->get_r_l(il); + const int64_t conv_state_size = (d_conv - 1) * d_inner; + const int64_t n_embd_r_total = 3 * conv_state_size; // Q + K + V + ggml_tensor * conv_state_all = build_rs(inp_rs, conv_states_all, hparams.n_embd_r(), n_seqs); + // conv_state_all is [n_embd_r_total, n_seqs], split into Q, K, V + // Each conv state is [(d_conv-1) * d_inner] per sequence, need to reshape to [d_conv-1, d_inner, n_seqs] + // Memory layout: for each seq, Q state is first conv_state_size elements, then K, then V + // conv_state_all has stride: nb[0] = element_size, nb[1] = n_embd_r_total * element_size + + // View Q conv state: offset 0, size conv_state_size per seq + // conv_state_all is [n_embd_r_total, n_seqs] with memory layout: + // state[i + seq * n_embd_r_total] where i = conv_step + channel * (d_conv-1) + {0, conv_state_size, 2*conv_state_size} for Q/K/V + // We want [d_conv-1, d_inner, n_seqs] view: + // nb1 = (d_conv-1) * element_size (stride between channels) + // nb2 = n_embd_r_total * element_size (stride between seqs) + ggml_tensor * conv_state_q = ggml_view_3d(ctx0, conv_state_all, d_conv - 1, d_inner, n_seqs, + (d_conv - 1) * ggml_element_size(conv_state_all), // nb1: stride between channels + n_embd_r_total * ggml_element_size(conv_state_all), // nb2: stride between seqs + 0); // offset for Q + ggml_tensor * conv_state_k = ggml_view_3d(ctx0, conv_state_all, d_conv - 1, d_inner, n_seqs, + (d_conv - 1) * ggml_element_size(conv_state_all), + n_embd_r_total * ggml_element_size(conv_state_all), + conv_state_size * ggml_element_size(conv_state_all)); // offset for K + ggml_tensor * conv_state_v = ggml_view_3d(ctx0, conv_state_all, d_conv - 1, d_inner, n_seqs, + (d_conv - 1) * ggml_element_size(conv_state_all), + n_embd_r_total * ggml_element_size(conv_state_all), + 2 * conv_state_size * ggml_element_size(conv_state_all)); // offset for V + + // Step 1: Q, K, V projections -> [d_inner, n_tokens] + ggml_tensor * q_proj = ggml_mul_mat(ctx0, layer.wq, cur); + ggml_tensor * k_proj = ggml_mul_mat(ctx0, layer.wk, cur); + ggml_tensor * v_proj = ggml_mul_mat(ctx0, layer.wv, cur); + cb(q_proj, "kda_q_proj", il); + cb(k_proj, "kda_k_proj", il); + cb(v_proj, "kda_v_proj", il); + + // 
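// A minimal indexing sketch (not llama.cpp API, names are illustrative) of the packed
// per-sequence conv-state layout assumed by the three ggml_view_3d calls above: each
// sequence stores its Q, K and V chunks back to back, every chunk being
// (d_conv - 1) x d_inner floats with the conv step varying fastest.
#include <cstdint>

static inline int64_t kda_conv_state_index(
        int64_t conv_step,   // 0 .. d_conv - 2
        int64_t channel,     // 0 .. d_inner - 1
        int64_t component,   // 0 = Q, 1 = K, 2 = V
        int64_t seq,         // 0 .. n_seqs - 1
        int64_t d_conv,
        int64_t d_inner) {
    const int64_t conv_state_size = (d_conv - 1) * d_inner; // per-component chunk
    const int64_t n_embd_r_total  = 3 * conv_state_size;    // Q + K + V per sequence
    return seq       * n_embd_r_total   // nb2 of the views above
         + component * conv_state_size  // view offsets 0, conv_state_size, 2*conv_state_size
         + channel   * (d_conv - 1)     // nb1 of the views above
         + conv_step;                   // fastest-varying index
}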
Step 2: Causal Conv1d for Q + // Reshape input: {d_inner, n_tokens} -> {d_inner, n_seq_tokens, n_seqs} + ggml_tensor * q_3d = ggml_reshape_3d(ctx0, q_proj, d_inner, n_seq_tokens, n_seqs); + + // Concat Q conv state and current input: {d_conv-1 + n_seq_tokens, d_inner, n_seqs} + ggml_tensor * conv_q = ggml_concat(ctx0, conv_state_q, ggml_transpose(ctx0, q_3d), 0); + + // Save last (d_conv-1) columns back to Q conv state + ggml_tensor * last_conv_q = ggml_view_3d(ctx0, conv_q, d_conv - 1, d_inner, n_seqs, + conv_q->nb[1], conv_q->nb[2], n_seq_tokens * conv_q->nb[0]); + ggml_build_forward_expand(gf, + ggml_cpy(ctx0, last_conv_q, + ggml_view_1d(ctx0, conv_states_all, conv_state_size * n_seqs, + kv_head * n_embd_r_total * ggml_element_size(conv_states_all)))); + + // Reshape conv weight: GGUF [d_conv, 1, d_inner, 1] -> ggml_ssm_conv expects [d_conv, d_inner] + // GGUF stores as [d_conv, 1, d_inner, 1] with memory layout w[conv_step + channel * d_conv] + // vLLM stores as [d_inner, d_conv] with memory layout w[channel * d_conv + conv_step] + // ggml_ssm_conv computes: c[conv_step + channel * d_conv] + // GGUF layout: [d_conv, 1, d_inner] or [d_conv, 1, d_inner, 1] -> reshape to [d_conv, d_inner] + ggml_tensor * conv_weight = nullptr; + if (layer.ssm_q_conv) { + // Reshape conv weight from [d_conv, 1, d_inner, 1] to [d_conv, d_inner] for ggml_ssm_conv + // Cast to F32 if quantized (ggml_ssm_conv requires float weights) + ggml_tensor * q_conv_f32 = layer.ssm_q_conv; + if (q_conv_f32->type != GGML_TYPE_F32) { + q_conv_f32 = ggml_cast(ctx0, q_conv_f32, GGML_TYPE_F32); + } + conv_weight = ggml_reshape_2d(ctx0, q_conv_f32, d_conv, d_inner); + } + + // Apply conv1d + ggml_tensor * Qcur; + if (conv_weight) { + // Make conv_q contiguous for ggml_ssm_conv + conv_q = ggml_cont(ctx0, conv_q); + + // ggml_ssm_conv output: {d_inner, n_seq_tokens, n_seqs} + Qcur = ggml_ssm_conv(ctx0, conv_q, conv_weight); + // Reshape to 2D for bias add: {d_inner, n_tokens} + Qcur = ggml_reshape_2d(ctx0, Qcur, d_inner, n_tokens); + if (layer.ssm_q_conv_b) { + Qcur = ggml_add(ctx0, Qcur, layer.ssm_q_conv_b); + } + Qcur = ggml_silu(ctx0, Qcur); + } else { + GGML_ABORT("KDA layer missing Q conv weight"); + } + + // K conv1d (with separate K conv state) + ggml_tensor * Kcur; + if (layer.ssm_k_conv) { + ggml_tensor * k_3d = ggml_reshape_3d(ctx0, k_proj, d_inner, n_seq_tokens, n_seqs); + ggml_tensor * conv_k = ggml_cont(ctx0, ggml_concat(ctx0, conv_state_k, ggml_transpose(ctx0, k_3d), 0)); + + // Save K conv state + ggml_tensor * last_conv_k = ggml_view_3d(ctx0, conv_k, d_conv - 1, d_inner, n_seqs, + conv_k->nb[1], conv_k->nb[2], n_seq_tokens * conv_k->nb[0]); + ggml_build_forward_expand(gf, + ggml_cpy(ctx0, last_conv_k, + ggml_view_1d(ctx0, conv_states_all, conv_state_size * n_seqs, + (kv_head * n_embd_r_total + conv_state_size) * ggml_element_size(conv_states_all)))); + + ggml_tensor * k_conv_f32 = layer.ssm_k_conv; + if (k_conv_f32->type != GGML_TYPE_F32) { + k_conv_f32 = ggml_cast(ctx0, k_conv_f32, GGML_TYPE_F32); + } + ggml_tensor * k_conv_weight = ggml_reshape_2d(ctx0, k_conv_f32, d_conv, d_inner); + Kcur = ggml_ssm_conv(ctx0, conv_k, k_conv_weight); + Kcur = ggml_reshape_2d(ctx0, Kcur, d_inner, n_tokens); + if (layer.ssm_k_conv_b) { + Kcur = ggml_add(ctx0, Kcur, layer.ssm_k_conv_b); + } + Kcur = ggml_silu(ctx0, Kcur); + } else { + GGML_ABORT("KDA layer missing K conv weight"); + } + + // V conv1d (with separate V conv state) + ggml_tensor * Vcur; + if (layer.ssm_v_conv) { + ggml_tensor * v_3d = ggml_reshape_3d(ctx0, v_proj, 
d_inner, n_seq_tokens, n_seqs); + ggml_tensor * conv_v = ggml_cont(ctx0, ggml_concat(ctx0, conv_state_v, ggml_transpose(ctx0, v_3d), 0)); + + // Save V conv state + ggml_tensor * last_conv_v = ggml_view_3d(ctx0, conv_v, d_conv - 1, d_inner, n_seqs, + conv_v->nb[1], conv_v->nb[2], n_seq_tokens * conv_v->nb[0]); + ggml_build_forward_expand(gf, + ggml_cpy(ctx0, last_conv_v, + ggml_view_1d(ctx0, conv_states_all, conv_state_size * n_seqs, + (kv_head * n_embd_r_total + 2 * conv_state_size) * ggml_element_size(conv_states_all)))); + + ggml_tensor * v_conv_f32 = layer.ssm_v_conv; + if (v_conv_f32->type != GGML_TYPE_F32) { + v_conv_f32 = ggml_cast(ctx0, v_conv_f32, GGML_TYPE_F32); + } + ggml_tensor * v_conv_weight = ggml_reshape_2d(ctx0, v_conv_f32, d_conv, d_inner); + Vcur = ggml_ssm_conv(ctx0, conv_v, v_conv_weight); + Vcur = ggml_reshape_2d(ctx0, Vcur, d_inner, n_tokens); + if (layer.ssm_v_conv_b) { + Vcur = ggml_add(ctx0, Vcur, layer.ssm_v_conv_b); + } + Vcur = ggml_silu(ctx0, Vcur); + } else { + GGML_ABORT("KDA layer missing V conv weight"); + } + + // Step 3: Compute g1 (forget gate) + // g1 = -exp(A_log) * softplus(f_b(f_a(x)) + dt_bias) + ggml_tensor * f_a = ggml_mul_mat(ctx0, layer.ssm_f_a, cur); + ggml_tensor * g1 = ggml_mul_mat(ctx0, layer.ssm_f_b, f_a); + g1 = ggml_add(ctx0, g1, layer.ssm_dt_b); + g1 = ggml_softplus(ctx0, g1); + g1 = ggml_reshape_3d(ctx0, g1, head_dim, n_head, n_tokens); + + // A_log shape is [1, n_head] or [1, n_head, 1, 1], need to broadcast to [head_dim, n_head, n_tokens] + // First compute -exp(A_log), then reshape for broadcasting + ggml_tensor * A_neg_exp = ggml_neg(ctx0, ggml_exp(ctx0, layer.ssm_a_log)); + // Reshape to [1, n_head, 1] for broadcasting with g1 [head_dim, n_head, n_tokens] + A_neg_exp = ggml_reshape_3d(ctx0, A_neg_exp, 1, n_head, 1); + g1 = ggml_mul(ctx0, g1, A_neg_exp); + cb(g1, "kda_g1", il); + + // Step 4: Compute beta (mixing coefficient) + ggml_tensor * beta = ggml_mul_mat(ctx0, layer.ssm_beta, cur); + beta = ggml_sigmoid(ctx0, beta); + cb(beta, "kda_beta", il); + + // Step 5: Reshape for KDA recurrence + // {n_embd, n_tokens} -> {n_embd, n_seq_tokens, n_seqs} + cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs); + + Qcur = ggml_cont(ctx0, ggml_reshape_4d(ctx0, Qcur, head_dim, n_head, n_seq_tokens, n_seqs)); + Kcur = ggml_cont(ctx0, ggml_reshape_4d(ctx0, Kcur, head_dim, n_head, n_seq_tokens, n_seqs)); + Vcur = ggml_cont(ctx0, ggml_reshape_4d(ctx0, Vcur, head_dim, n_head, n_seq_tokens, n_seqs)); + g1 = ggml_cont(ctx0, ggml_reshape_4d(ctx0, g1, head_dim, n_head, n_seq_tokens, n_seqs)); + beta = ggml_cont(ctx0, ggml_reshape_3d(ctx0, beta, n_head, n_seq_tokens, n_seqs)); + + cb(Qcur, "kda_Q", il); + cb(Kcur, "kda_K", il); + cb(Vcur, "kda_V", il); + + // Step 6: Get SSM state and compute KDA recurrence using ggml_kda_scan + ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il); + + // Use build_rs with lambda pattern (like Mamba SSM scan) + auto get_kda_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) { + ggml_tensor * h_state = ggml_reshape_4d(ctx, states, head_dim, head_dim, n_head, mctx_cur->get_size()); + // Call ggml_kda_scan which implements the correct KDA recurrence + return ggml_kda_scan(ctx, h_state, Qcur, Kcur, Vcur, g1, beta, ids); + }; + + ggml_tensor * y_kda = build_rs(inp_rs, ssm_states_all, hparams.n_embd_s(), n_seqs, get_kda_rows); + cb(y_kda, "kda_scan_out", il); + + // Store updated state back + // y_kda contains: [attention_output (head_dim * n_head * n_seq_tokens * n_seqs), new_state 
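// A self-contained, single-head, single-token sketch of the delta-rule update that the
// ggml_kda_scan call above is expected to perform: per-channel decay first, then a
// retrieval-corrected rank-1 update, then the readout. Plain C++ with a row-major
// D x D state; the real op additionally loops over tokens, heads and sequences, and
// q/k are assumed to be already L2-normalized and scaled. Names are illustrative only.
#include <cmath>
#include <vector>

static void kda_step_ref(std::vector<float> & h,            // D*D state, h[i*D + j]
                         const float * q, const float * k,  // D
                         const float * v, const float * g,  // D
                         float beta, float * o, int D) {
    // h = diag(exp(g)) * h
    for (int i = 0; i < D; ++i) {
        const float decay = std::exp(g[i]);
        for (int j = 0; j < D; ++j) h[i*D + j] *= decay;
    }
    // delta = beta * (v - h^T k);  h += k * delta^T (rank-1 correction, column by column)
    for (int j = 0; j < D; ++j) {
        float hk = 0.0f;
        for (int i = 0; i < D; ++i) hk += h[i*D + j] * k[i];
        const float delta = beta * (v[j] - hk);
        for (int i = 0; i < D; ++i) h[i*D + j] += k[i] * delta;
    }
    // o = h^T q
    for (int j = 0; j < D; ++j) {
        float acc = 0.0f;
        for (int i = 0; i < D; ++i) acc += h[i*D + j] * q[i];
        o[j] = acc;
    }
}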
(head_dim * head_dim * n_head * n_seqs)] + const int64_t attn_out_size = head_dim * n_head * n_seq_tokens * n_seqs; + const int64_t state_size = head_dim * head_dim * n_head; + ggml_build_forward_expand(gf, + ggml_cpy(ctx0, + ggml_view_1d(ctx0, y_kda, state_size * n_seqs, attn_out_size * ggml_element_size(y_kda)), + ggml_view_1d(ctx0, ssm_states_all, state_size * n_seqs, kv_head * state_size * ggml_element_size(ssm_states_all)))); + + // Extract attention output + ggml_tensor * attn_out = ggml_view_1d(ctx0, y_kda, attn_out_size, 0); + attn_out = ggml_reshape_3d(ctx0, attn_out, head_dim, n_head, n_seq_tokens * n_seqs); + cb(attn_out, "kda_attn_out", il); + + // Step 7: Output gating g2 = g_b(g_a(x)) + ggml_tensor * cur_2d = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs); + ggml_tensor * g_a = ggml_mul_mat(ctx0, layer.ssm_g_a, cur_2d); + ggml_tensor * g2 = ggml_mul_mat(ctx0, layer.ssm_g_b, g_a); + g2 = ggml_reshape_3d(ctx0, g2, head_dim, n_head, n_seq_tokens * n_seqs); + + // Step 8: Apply o_norm with sigmoid gating + // Note: Kimi model uses sigmoid gating, not SiLU (despite FusedRMSNormGated default being swish) + // Formula: output = RMSNorm(x) * sigmoid(g) + ggml_tensor * normed = build_norm(attn_out, layer.ssm_o_norm, layer.ssm_o_norm_b, LLM_NORM_RMS, il); + ggml_tensor * gate = ggml_sigmoid(ctx0, g2); + ggml_tensor * gated = ggml_mul(ctx0, normed, gate); + + // Step 9: Output projection + gated = ggml_cont_2d(ctx0, gated, d_inner, n_tokens); + cur = ggml_mul_mat(ctx0, layer.wo, gated); + cb(cur, "kda_out", il); + + + GGML_UNUSED(d_conv); + GGML_UNUSED(kq_scale_kda); + + } else if (is_mla) { + // === MLA Layer (Multi-head Latent Attention) without KV Cache === + // Reference: vLLM mla.py + // TODO: Implement proper KV caching for MLA (requires custom cache format) + + // Step 1: Q projection and reshape + // vLLM Kimi: q = q_proj(hidden_states), then view as [n_tokens, n_head, qk_head_dim] + // Note: Kimi MLA does NOT use RoPE (rotary_emb=None in vLLM) + ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.wq, cur); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head_k_mla, n_head, n_tokens); + cb(Qcur, "mla_Q", il); + + // Step 2: KV compression + // kv_lora = kv_a_proj_with_mqa(hidden_states) -> [kv_lora_rank + qk_rope_head_dim, n_tokens] + ggml_tensor * kv_lora = ggml_mul_mat(ctx0, layer.wkv_a_mqa, cur); + + // Split: kv_c = kv_lora[:kv_lora_rank], k_pe = kv_lora[kv_lora_rank:] + ggml_tensor * kv_c = ggml_view_2d(ctx0, kv_lora, kv_lora_rank, n_tokens, + ggml_row_size(kv_lora->type, kv_lora_rank + n_embd_head_qk_rope), 0); + ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_lora, n_embd_head_qk_rope, 1, n_tokens, + ggml_row_size(kv_lora->type, kv_lora_rank + n_embd_head_qk_rope), + ggml_row_size(kv_lora->type, kv_lora_rank + n_embd_head_qk_rope), + ggml_row_size(kv_lora->type, kv_lora_rank)); + + // Note: Kimi MLA does NOT apply RoPE (rotary_emb=None in vLLM) + // k_pe is used directly without RoPE + + // Normalize kv_c + kv_c = build_norm(kv_c, layer.attn_kv_a_norm, nullptr, LLM_NORM_RMS, il); + + // KV decompression: kv = kv_b_proj(kv_c_normed) + ggml_tensor * kv = ggml_mul_mat(ctx0, layer.wkv_b, kv_c); + const int64_t kv_per_head = n_embd_head_qk_nope + n_embd_head_v_mla; + + // Split kv into k_nope and v + ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens, + ggml_row_size(kv->type, kv_per_head), + ggml_row_size(kv->type, kv_per_head * n_head), 0); + ggml_tensor * Vcur = ggml_view_3d(ctx0, kv, n_embd_head_v_mla, n_head, n_tokens, + 
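// Worked numbers for the splits above, using the Kimi defaults quoted earlier in this
// file (kv_lora_rank = 512, qk_rope_head_dim = 64, qk_nope = v_head_dim = 128); a
// compile-time sanity sketch only, not part of the model code.
namespace kimi_mla_dims {
    constexpr int64_t kv_lora_rank     = 512;
    constexpr int64_t qk_rope_head_dim = 64;
    constexpr int64_t qk_nope_head_dim = 128;
    constexpr int64_t v_head_dim       = 128;
    // wkv_a_mqa projects n_embd -> kv_lora_rank + qk_rope_head_dim rows, matching the
    // [2304, 576] tensor shape noted at the top of this file.
    constexpr int64_t kv_a_out    = kv_lora_rank + qk_rope_head_dim;   // 576
    // wkv_b expands each compressed row into n_head blocks of [k_nope | v].
    constexpr int64_t kv_per_head = qk_nope_head_dim + v_head_dim;     // 256
    static_assert(kv_a_out == 576 && kv_per_head == 256, "Kimi MLA split sizes");
}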
ggml_row_size(kv->type, kv_per_head), + ggml_row_size(kv->type, kv_per_head * n_head), + ggml_row_size(kv->type, n_embd_head_qk_nope)); + k_nope = ggml_cont(ctx0, k_nope); + Vcur = ggml_cont(ctx0, Vcur); + + // Concatenate k_nope + k_pe (broadcast k_pe to all heads) + // K = [k_nope, k_pe] where k_nope is [qk_nope_head_dim, n_head, n_tokens] + // and k_pe is [qk_rope_head_dim, 1, n_tokens] broadcast to all heads + k_pe = ggml_cont(ctx0, k_pe); + // Need to broadcast k_pe from [qk_rope, 1, n_tokens] to [qk_rope, n_head, n_tokens] + ggml_tensor * k_pe_target = ggml_new_tensor_3d(ctx0, k_pe->type, n_embd_head_qk_rope, n_head, n_tokens); + ggml_tensor * k_pe_repeated = ggml_repeat(ctx0, k_pe, k_pe_target); + ggml_tensor * Kcur = ggml_concat(ctx0, k_nope, k_pe_repeated, 0); + cb(Kcur, "mla_K", il); + cb(Vcur, "mla_V", il); + + // Direct softmax attention (without KV cache) + // Use build_attn with inp_no_cache for proper mask handling + cur = build_attn(inp_no_cache, layer.wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale_mla, il); + cb(cur, "mla_out", il); + + } else { + // Unknown layer type - this should not happen + GGML_ABORT("Kimi layer is neither KDA nor MLA - missing required tensors"); + } + + // On last layer, select only the output tokens + if (il == n_layer - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + // Residual + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // FFN Norm + cur = build_norm(ffn_inp, layer.ffn_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + // FFN / MoE + if (layer.ffn_gate_inp) { + // MoE layer + // Kimi uses moe_renormalize=True and routed_scaling_factor (stored as expert_weights_scale) = 2.446 + ggml_tensor * moe_out = build_moe_ffn(cur, layer.ffn_gate_inp, layer.ffn_up_exps, layer.ffn_gate_exps, layer.ffn_down_exps, + layer.ffn_exp_probs_b, hparams.n_expert, hparams.n_expert_used, + LLM_FFN_SILU, true, true, hparams.expert_weights_scale, + (llama_expert_gating_func_type) hparams.expert_gating_func, il); + cb(moe_out, "ffn_moe_out", il); + + // Shared expert (if present) + if (layer.ffn_gate_shexp) { + ggml_tensor * ffn_shexp = build_ffn(cur, + layer.ffn_up_shexp, NULL, NULL, + layer.ffn_gate_shexp, NULL, NULL, + layer.ffn_down_shexp, NULL, NULL, + NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(ffn_shexp, "ffn_shexp", il); + + cur = ggml_add(ctx0, moe_out, ffn_shexp); + cb(cur, "ffn_out", il); + } else { + cur = moe_out; + } + } else if (layer.ffn_gate) { + // Dense FFN layer + cur = build_ffn(cur, layer.ffn_up, NULL, NULL, layer.ffn_gate, NULL, NULL, + layer.ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } else { + // No FFN - this should not happen in Kimi + GGML_ABORT("Kimi layer missing FFN tensors"); + } + + // Residual + cur = ggml_add(ctx0, cur, ffn_inp); + inpL = cur; + } + + // Final Norm + cur = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // Output + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); + + GGML_UNUSED(n_embd_head_qk_nope); +} From 84f822c5a58b0967e312c85cea4f3bbbea97184e Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Tue, 2 Dec 2025 08:51:09 +0800 Subject: [PATCH 02/58] kimi linear convert_hf_to_gguf --- convert_hf_to_gguf.py | 293 ++++++++++++++++++++++++++++++++++++++++++ src/models/models.h | 6 + 2 files 
changed, 299 insertions(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index a54cce887b..11dd9f610a 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -563,6 +563,10 @@ class ModelBase: gguf.MODEL_TENSOR.A_ENC_EMBD_POS, gguf.MODEL_TENSOR.ALTUP_CORRECT_COEF, gguf.MODEL_TENSOR.ALTUP_PREDICT_COEF, + # Kimi KDA conv weights should be F32 + gguf.MODEL_TENSOR.SSM_CONV1D_Q, + gguf.MODEL_TENSOR.SSM_CONV1D_K, + gguf.MODEL_TENSOR.SSM_CONV1D_V, ) ) or new_name[-7:] not in (".weight", ".lora_a", ".lora_b") @@ -4976,6 +4980,295 @@ class CodeShellModel(TextModel): self.gguf_writer.add_rope_scaling_factor(1.0) +@ModelBase.register("KimiLinearModel", "KimiLinearForCausalLM") +class KimiLinearModel(TextModel): + """Kimi-Linear model with hybrid MLA+KDA architecture""" + model_arch = gguf.MODEL_ARCH.KIMI_LINEAR + + _experts: list[dict[str, Tensor]] | None = None + + def set_gguf_parameters(self): + self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) + + # Use find_hparam for context length + # Kimi uses model_max_length + n_ctx = self.find_hparam(["max_position_embeddings", "model_max_length", "n_ctx", "n_positions"], optional=True) + if n_ctx is not None: + self.gguf_writer.add_context_length(n_ctx) + else: + # Default to 4096 if not found + logger.warning("No context length found in config, defaulting to 4096") + self.gguf_writer.add_context_length(4096) + + self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"]) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + self.gguf_writer.add_file_type(self.ftype) + + # KDA & MLA params + # Get ssm_d_conv from linear_attn_config.short_conv_kernel_size or ssm_d_conv + linear_attn_config = self.hparams.get("linear_attn_config", {}) + ssm_d_conv = self.hparams.get("ssm_d_conv") or linear_attn_config.get("short_conv_kernel_size") + if ssm_d_conv is not None: + self.gguf_writer.add_ssm_conv_kernel(ssm_d_conv) + + # MLA params - use add_* methods that handle arch substitution + # Support both HuggingFace naming (q_lora_rank, kv_lora_rank) and internal naming (n_lora_q, n_lora_kv) + q_lora_rank = self.hparams.get("q_lora_rank", self.hparams.get("n_lora_q")) + kv_lora_rank = self.hparams.get("kv_lora_rank", self.hparams.get("n_lora_kv")) + + if q_lora_rank is not None: + self.gguf_writer.add_q_lora_rank(q_lora_rank) + if kv_lora_rank is not None: + self.gguf_writer.add_kv_lora_rank(kv_lora_rank) + + # MLA head dimensions + # Support HuggingFace naming: qk_nope_head_dim, qk_rope_head_dim, v_head_dim + qk_nope_head_dim = self.hparams.get("qk_nope_head_dim") + qk_rope_head_dim = self.hparams.get("qk_rope_head_dim", self.hparams.get("n_rot")) + v_head_dim = self.hparams.get("v_head_dim") + + # Calculate n_embd_head_k_mla = qk_nope_head_dim + qk_rope_head_dim + if "n_embd_head_k_mla" in self.hparams: + self.gguf_writer.add_key_length_mla(self.hparams["n_embd_head_k_mla"]) + elif qk_nope_head_dim is not None and qk_rope_head_dim is not None: + n_embd_head_k_mla = qk_nope_head_dim + qk_rope_head_dim + self.gguf_writer.add_key_length_mla(n_embd_head_k_mla) + + # n_embd_head_v_mla = v_head_dim + if "n_embd_head_v_mla" in self.hparams: + self.gguf_writer.add_value_length_mla(self.hparams["n_embd_head_v_mla"]) + elif 
v_head_dim is not None: + self.gguf_writer.add_value_length_mla(v_head_dim) + + # Rotation - use qk_rope_head_dim for Kimi + rope_dim = self.hparams.get("qk_rope_head_dim") or self.hparams.get("n_rot") + if rope_dim is not None: + self.gguf_writer.add_rope_dimension_count(rope_dim) + else: + # Default to head_dim + head_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + self.gguf_writer.add_rope_dimension_count(head_dim) + + self.gguf_writer.add_rope_freq_base(self.hparams.get("rope_theta", 10000.0)) + + # MoE params + n_experts = self.hparams.get("num_local_experts", self.hparams.get("num_experts")) + if n_experts is not None: + self.gguf_writer.add_expert_count(n_experts) + # Support both num_experts_per_tok and num_experts_per_token + n_experts_used = self.hparams.get("num_experts_per_tok", self.hparams.get("num_experts_per_token")) + if n_experts_used is not None: + self.gguf_writer.add_expert_used_count(n_experts_used) + + # moe_intermediate_size (1024 for Kimi) + moe_intermediate_size = self.hparams.get("moe_intermediate_size") + if moe_intermediate_size is not None: + self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) + + # num_shared_experts (1 for Kimi) + num_shared_experts = self.hparams.get("num_shared_experts") + if num_shared_experts is not None: + self.gguf_writer.add_expert_shared_count(num_shared_experts) + + # first_k_dense_replace (1 for Kimi - first layer uses dense MLP) + first_k_dense_replace = self.hparams.get("first_k_dense_replace") + if first_k_dense_replace is not None: + self.gguf_writer.add_leading_dense_block_count(first_k_dense_replace) + + # Expert gating function (sigmoid for Kimi) + moe_router_activation_func = self.hparams.get("moe_router_activation_func", "sigmoid") + if moe_router_activation_func == "sigmoid": + self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) + elif moe_router_activation_func == "softmax": + self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX) + else: + logger.warning(f"Unknown moe_router_activation_func: {moe_router_activation_func}, defaulting to sigmoid") + self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) + + # Routed scaling factor (expert_weights_scale = 2.446 for Kimi) + routed_scaling_factor = self.hparams.get("routed_scaling_factor") + if routed_scaling_factor is not None: + self.gguf_writer.add_expert_weights_scale(routed_scaling_factor) + + def set_vocab(self): + # Kimi uses TikToken tokenizer - load via transformers + from transformers import AutoTokenizer + + dir_model = self.dir_model + vocab_size = self.hparams["vocab_size"] + + logger.info(f"Loading TikToken tokenizer from {dir_model}") + tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) + + tokens: list[str] = [] + toktypes: list[int] = [] + + # Get tokenizer pre string + tokpre = self.get_vocab_base_pre(tokenizer) + + # Build vocab from tokenizer + merges = [] + vocab = {} + + # TikToken stores vocab in mergeable_ranks + if hasattr(tokenizer, 'mergeable_ranks'): + mergeable_ranks = tokenizer.mergeable_ranks + for token, rank in mergeable_ranks.items(): + vocab[self._token_bytes_to_string(token)] = rank + if len(token) == 1: + continue + # Build merges + merged = self._bpe(mergeable_ranks, token, max_rank=rank) + if len(merged) == 2: + merges.append(' '.join(map(self._token_bytes_to_string, merged))) + else: + # Fallback: get vocab directly + vocab = {tok: idx for tok, idx in tokenizer.get_vocab().items()} + + # Get special 
tokens + added_vocab = {} + if hasattr(tokenizer, 'special_tokens'): + added_vocab = tokenizer.special_tokens + elif hasattr(tokenizer, 'added_tokens_encoder'): + added_vocab = tokenizer.added_tokens_encoder + + # Combine vocab + reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()} + + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.UNUSED) + elif i in added_vocab.values() if added_vocab else False: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.CONTROL) + else: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.NORMAL) + + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(dir_model, load_merges=False) + special_vocab.merges = merges + special_vocab.add_to_gguf(self.gguf_writer) + logger.info(f"Loaded {len(tokens)} tokens, {len(merges)} merges") + + @staticmethod + def _token_bytes_to_string(b: bytes) -> str: + """Convert bytes to string representation for tokenizer""" + return ''.join([chr(byte) if byte < 128 else f'<0x{byte:02X}>' for byte in b]) + + @staticmethod + def _bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]: + """BPE tokenization for merges extraction""" + parts = [bytes([b]) for b in token] + while True: + min_idx = None + min_rank = None + for i, pair in enumerate(zip(parts[:-1], parts[1:])): + rank = mergeable_ranks.get(pair[0] + pair[1]) + if rank is not None and (min_rank is None or rank < min_rank): + min_idx = i + min_rank = rank + if min_rank is None or (max_rank is not None and min_rank >= max_rank): + break + parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:] + return parts + + def prepare_tensors(self): + super().prepare_tensors() + if self._experts is not None: + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + logger.info(f"Processing {name}: shape before = {tuple(data_torch.shape)}") + + # Handle KDA conv1d weights + # HuggingFace/vLLM stores as [d_inner, d_conv] (2D), memory layout: conv_step changes fastest + # llama.cpp expects ggml ne = [d_conv, 1, d_inner, 1], memory layout: ne[0]=d_conv changes fastest + # GGUF reverses numpy shape when writing, so numpy (1, d_inner, 1, d_conv) -> ggml ne = [d_conv, 1, d_inner, 1] + # Memory layouts match: both have conv_step (d_conv) changing fastest + if name.endswith((".q_conv1d.weight", ".k_conv1d.weight", ".v_conv1d.weight")): + # HF shape: [d_inner, d_conv] e.g. 
[4096, 4] + # Target numpy shape: (1, d_inner, 1, d_conv) -> ggml ne = [d_conv, 1, d_inner, 1] + if data_torch.ndim == 2: + d_inner, d_conv = data_torch.shape + # Reshape to (1, d_inner, 1, d_conv) - memory layout preserved (d_conv fastest) + data_torch = data_torch.reshape(1, d_inner, 1, d_conv) + logger.info(f"Reshaped conv1d weight {name}: [d_inner={d_inner}, d_conv={d_conv}] -> numpy {tuple(data_torch.shape)} -> ggml ne=[{d_conv}, 1, {d_inner}, 1]") + elif data_torch.ndim == 3: + # Already 3D [d_inner, 1, d_conv] from unsqueeze + d_inner, _, d_conv = data_torch.shape + data_torch = data_torch.reshape(1, d_inner, 1, d_conv) + logger.info(f"Reshaped conv1d weight {name}: [d_inner={d_inner}, 1, d_conv={d_conv}] -> numpy {tuple(data_torch.shape)} -> ggml ne=[{d_conv}, 1, {d_inner}, 1]") + + # Handle A_log: HF stores as [1, 1, num_heads, 1] + # llama.cpp expects ggml ne = [1, num_heads, 1, 1] + # GGUF reverses numpy shape: numpy (1, 1, num_heads, 1) -> ggml ne = [1, num_heads, 1, 1] + # So no transformation needed! The shapes already match after GGUF reversal. + if name.endswith(".A_log"): + if data_torch.ndim == 4: + logger.info(f"A_log {name}: numpy {tuple(data_torch.shape)} -> ggml ne={list(reversed(data_torch.shape))}") + + # Kimi specific bias + if name.endswith("block_sparse_moe.gate.e_score_correction_bias"): + new_name = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_EXP_PROBS_B, bid) + return [(new_name, data_torch)] + + # process the experts separately + if name.find("block_sparse_moe.experts") != -1: + n_experts = self.hparams.get("num_local_experts", self.hparams.get("num_experts")) + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + tensors = [] + # w1: gate, w2: down, w3: up + for wid, tname in [("w1", gguf.MODEL_TENSOR.FFN_GATE_EXP), + ("w2", gguf.MODEL_TENSOR.FFN_DOWN_EXP), + ("w3", gguf.MODEL_TENSOR.FFN_UP_EXP)]: + datas: list[Tensor] = [] + for xid in range(n_experts): + ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + new_name = self.format_tensor_name(tname, bid) + tensors.append((new_name, data_torch)) + return tensors + return [] + + mapped_name = self.map_tensor_name(name) + logger.info(f"Returning {mapped_name}: shape after = {tuple(data_torch.shape)}") + return [(mapped_name, data_torch)] + + def get_vocab_base(self) -> tuple[list[str], list[int], str]: + # This method is not used when set_vocab is overridden + # But adding it for completeness in case it's called elsewhere + logger.warning("get_vocab_base called, but set_vocab is already overridden") + vocab_size = self.hparams.get("vocab_size", 100) + tokens = [f"" for i in range(vocab_size)] + tokens[0] = "" + tokens[1] = "" + tokens[2] = "" + toktypes = [gguf.TokenType.NORMAL] * vocab_size + return tokens, toktypes, "gpt-2" + @ModelBase.register("InternLM2ForCausalLM") class InternLM2Model(TextModel): model_arch = gguf.MODEL_ARCH.INTERNLM2 diff --git a/src/models/models.h b/src/models/models.h index d93601ad06..8b7af8d7bc 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -283,6 +283,12 @@ struct llm_build_jamba : public llm_graph_context_mamba { llm_build_jamba(const llama_model & model, const llm_graph_params & params); }; +struct llm_build_kimi_linear : public 
llm_graph_context_mamba { + llm_build_kimi_linear(const llama_model & model, const llm_graph_params & params); +private: + const llama_model & model; +}; + struct llm_build_lfm2 : public llm_graph_context { const llama_model & model; From 57cca52779d97cf21a5e8fdbb540467353280e0f Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Tue, 2 Dec 2025 10:40:44 +0800 Subject: [PATCH 03/58] kimi linear constants.py tensor_mapping.py --- convert_hf_to_gguf.py | 47 ++------------------------ gguf-py/gguf/constants.py | 61 ++++++++++++++++++++++++++++++++++ gguf-py/gguf/tensor_mapping.py | 37 +++++++++++++++++++++ src/CMakeLists.txt | 1 + 4 files changed, 102 insertions(+), 44 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 11dd9f610a..ba21124d6f 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4988,6 +4988,7 @@ class KimiLinearModel(TextModel): _experts: list[dict[str, Tensor]] | None = None def set_gguf_parameters(self): + super().set_gguf_parameters() self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) # Use find_hparam for context length @@ -5000,14 +5001,6 @@ class KimiLinearModel(TextModel): logger.warning("No context length found in config, defaulting to 4096") self.gguf_writer.add_context_length(4096) - self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"]) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - self.gguf_writer.add_file_type(self.ftype) - # KDA & MLA params # Get ssm_d_conv from linear_attn_config.short_conv_kernel_size or ssm_d_conv linear_attn_config = self.hparams.get("linear_attn_config", {}) @@ -5053,17 +5046,6 @@ class KimiLinearModel(TextModel): head_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(head_dim) - self.gguf_writer.add_rope_freq_base(self.hparams.get("rope_theta", 10000.0)) - - # MoE params - n_experts = self.hparams.get("num_local_experts", self.hparams.get("num_experts")) - if n_experts is not None: - self.gguf_writer.add_expert_count(n_experts) - # Support both num_experts_per_tok and num_experts_per_token - n_experts_used = self.hparams.get("num_experts_per_tok", self.hparams.get("num_experts_per_token")) - if n_experts_used is not None: - self.gguf_writer.add_expert_used_count(n_experts_used) - # moe_intermediate_size (1024 for Kimi) moe_intermediate_size = self.hparams.get("moe_intermediate_size") if moe_intermediate_size is not None: @@ -5079,16 +5061,6 @@ class KimiLinearModel(TextModel): if first_k_dense_replace is not None: self.gguf_writer.add_leading_dense_block_count(first_k_dense_replace) - # Expert gating function (sigmoid for Kimi) - moe_router_activation_func = self.hparams.get("moe_router_activation_func", "sigmoid") - if moe_router_activation_func == "sigmoid": - self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) - elif moe_router_activation_func == "softmax": - self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX) - else: - logger.warning(f"Unknown moe_router_activation_func: {moe_router_activation_func}, defaulting to sigmoid") - self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) - # Routed scaling factor (expert_weights_scale = 
2.446 for Kimi) routed_scaling_factor = self.hparams.get("routed_scaling_factor") if routed_scaling_factor is not None: @@ -5220,9 +5192,8 @@ class KimiLinearModel(TextModel): logger.info(f"A_log {name}: numpy {tuple(data_torch.shape)} -> ggml ne={list(reversed(data_torch.shape))}") # Kimi specific bias - if name.endswith("block_sparse_moe.gate.e_score_correction_bias"): - new_name = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_EXP_PROBS_B, bid) - return [(new_name, data_torch)] + if name.endswith("e_score_correction_bias"): + name = name.replace("e_score_correction_bias", "e_score_correction.bias") # process the experts separately if name.find("block_sparse_moe.experts") != -1: @@ -5257,18 +5228,6 @@ class KimiLinearModel(TextModel): logger.info(f"Returning {mapped_name}: shape after = {tuple(data_torch.shape)}") return [(mapped_name, data_torch)] - def get_vocab_base(self) -> tuple[list[str], list[int], str]: - # This method is not used when set_vocab is overridden - # But adding it for completeness in case it's called elsewhere - logger.warning("get_vocab_base called, but set_vocab is already overridden") - vocab_size = self.hparams.get("vocab_size", 100) - tokens = [f"" for i in range(vocab_size)] - tokens[0] = "" - tokens[1] = "" - tokens[2] = "" - toktypes = [gguf.TokenType.NORMAL] * vocab_size - return tokens, toktypes, "gpt-2" - @ModelBase.register("InternLM2ForCausalLM") class InternLM2Model(TextModel): model_arch = gguf.MODEL_ARCH.INTERNLM2 diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 2b8489c591..485c41abfb 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -446,6 +446,7 @@ class MODEL_ARCH(IntEnum): RND1 = auto() PANGU_EMBED = auto() MISTRAL3 = auto() + KIMI_LINEAR = auto() # Kimi-Linear (hybrid MLA+KDA) class VISION_PROJECTOR_TYPE(IntEnum): @@ -535,6 +536,16 @@ class MODEL_TENSOR(IntEnum): SSM_NORM = auto() SSM_OUT = auto() SSM_BETA_ALPHA = auto() # qwen3next + SSM_CONV1D_Q = auto() # Kimi Linear + SSM_CONV1D_K = auto() # Kimi Linear + SSM_CONV1D_V = auto() # Kimi Linear + SSM_F_A = auto() # Kimi Linear + SSM_F_B = auto() # Kimi Linear + SSM_BETA = auto() # Kimi Linear + SSM_A_LOG = auto() # Kimi Linear + SSM_G_A = auto() # Kimi Linear + SSM_G_B = auto() # Kimi Linear + SSM_DT_B = auto() # Kimi Linear TIME_MIX_W0 = auto() TIME_MIX_W1 = auto() TIME_MIX_W2 = auto() @@ -820,6 +831,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.RND1: "rnd1", MODEL_ARCH.PANGU_EMBED: "pangu-embedded", MODEL_ARCH.MISTRAL3: "mistral3", + MODEL_ARCH.KIMI_LINEAR: "kimi-linear", } VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = { @@ -907,6 +919,16 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = { MODEL_TENSOR.SSM_NORM: "blk.{bid}.ssm_norm", MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out", MODEL_TENSOR.SSM_BETA_ALPHA: "blk.{bid}.ssm_ba", + MODEL_TENSOR.SSM_CONV1D_Q: "blk.{bid}.ssm_conv1d_q", # Kimi Linear + MODEL_TENSOR.SSM_CONV1D_K: "blk.{bid}.ssm_conv1d_k", # Kimi Linear + MODEL_TENSOR.SSM_CONV1D_V: "blk.{bid}.ssm_conv1d_v", # Kimi Linear + MODEL_TENSOR.SSM_F_A: "blk.{bid}.ssm_f_a", # Kimi Linear + MODEL_TENSOR.SSM_F_B: "blk.{bid}.ssm_f_b", # Kimi Linear + MODEL_TENSOR.SSM_BETA: "blk.{bid}.ssm_beta", # Kimi Linear + MODEL_TENSOR.SSM_A_LOG: "blk.{bid}.ssm_a", # Kimi Linear + MODEL_TENSOR.SSM_G_A: "blk.{bid}.ssm_g_a", # Kimi Linear + MODEL_TENSOR.SSM_G_B: "blk.{bid}.ssm_g_b", # Kimi Linear + MODEL_TENSOR.SSM_DT_B: "blk.{bid}.ssm_dt", # Kimi Linear MODEL_TENSOR.TIME_MIX_W0: "blk.{bid}.time_mix_w0", MODEL_TENSOR.TIME_MIX_W1: 
"blk.{bid}.time_mix_w1", MODEL_TENSOR.TIME_MIX_W2: "blk.{bid}.time_mix_w2", @@ -3094,6 +3116,45 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_UP_EXP, ], + MODEL_ARCH.KIMI_LINEAR: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_Q_A, + MODEL_TENSOR.ATTN_Q_B, + MODEL_TENSOR.ATTN_KV_A_MQA, + MODEL_TENSOR.ATTN_KV_B, + MODEL_TENSOR.ATTN_Q_A_NORM, + MODEL_TENSOR.ATTN_KV_A_NORM, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.SSM_CONV1D_Q, + MODEL_TENSOR.SSM_CONV1D_K, + MODEL_TENSOR.SSM_CONV1D_V, + MODEL_TENSOR.SSM_F_A, + MODEL_TENSOR.SSM_F_B, + MODEL_TENSOR.SSM_BETA, + MODEL_TENSOR.SSM_A_LOG, + MODEL_TENSOR.SSM_G_A, + MODEL_TENSOR.SSM_G_B, + MODEL_TENSOR.SSM_NORM, + MODEL_TENSOR.SSM_DT_B, + MODEL_TENSOR.FFN_EXP_PROBS_B, + MODEL_TENSOR.FFN_GATE_SHEXP, + MODEL_TENSOR.FFN_DOWN_SHEXP, + MODEL_TENSOR.FFN_UP_SHEXP, + ], # TODO } diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index a7b0973979..cfe541fc41 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -389,6 +389,7 @@ class TensorNameMap: "model.layers.{bid}.mlp.expert_bias", # afmoe "model.layers.{bid}.feed_forward.expert_bias", # lfm2moe "model.layers.{bid}.block_sparse_moe.e_score_correction", # minimax-m2 + "model.layers.{bid}.block_sparse_moe.gate.e_score_correction", # kimi ), # Feed-forward up @@ -450,6 +451,7 @@ class TensorNameMap: "model.layers.{bid}.feed_forward.shared_expert.up_proj", # llama4 "model.layers.{bid}.feed_forward.down_proj", "model.layers.{bid}.mlp.shared_mlp.up_proj", # hunyuan + "model.layers.{bid}.block_sparse_moe.shared_experts.up_proj", # kimi ), MODEL_TENSOR.FFN_UP_CHEXP: ( @@ -500,6 +502,7 @@ class TensorNameMap: MODEL_TENSOR.FFN_GATE_CHEXP: ( "model.layers.{bid}.mlp.chunk_experts.gate_proj", # grovemoe + "model.layers.{bid}.block_sparse_moe.shared_experts.gate_proj", # kimi ), # Feed-forward down @@ -557,6 +560,7 @@ class TensorNameMap: "model.layers.{bid}.feed_forward.shared_expert.down_proj", # llama4 "model.layers.{bid}.shared_mlp.output_linear", # granitemoe "model.layers.{bid}.mlp.shared_mlp.down_proj", # hunyuan + "model.layers.{bid}.block_sparse_moe.shared_experts.down_proj", # kimi ), MODEL_TENSOR.FFN_DOWN_CHEXP: ( @@ -738,6 +742,7 @@ class TensorNameMap: "model.layers.{bid}.mamba.norm", # falcon-h1 granite-hybrid "model.layers.{bid}.linear_attn.norm", # qwen3next "backbone.layers.{bid}.mixer.norm", # mamba2 + "model.layers.{bid}.self_attn.o_norm", # kimi ), MODEL_TENSOR.SSM_OUT: ( @@ -1569,6 +1574,38 @@ class TensorNameMap: "audio.multi_modal_projector.ln_mid", # ultravox ), + # Kimi Linear KDA (using SSM_ prefix for consistency) + MODEL_TENSOR.SSM_CONV1D_Q: ( + "model.layers.{bid}.self_attn.q_conv1d", + ), + MODEL_TENSOR.SSM_CONV1D_K: ( + "model.layers.{bid}.self_attn.k_conv1d", + ), + MODEL_TENSOR.SSM_CONV1D_V: ( + "model.layers.{bid}.self_attn.v_conv1d", + ), + MODEL_TENSOR.SSM_F_A: ( + "model.layers.{bid}.self_attn.f_a_proj", + ), + MODEL_TENSOR.SSM_F_B: ( + "model.layers.{bid}.self_attn.f_b_proj", + ), + MODEL_TENSOR.SSM_BETA: ( + "model.layers.{bid}.self_attn.b_proj", + ), + MODEL_TENSOR.SSM_A_LOG: ( + "model.layers.{bid}.self_attn.A_log", + 
), + MODEL_TENSOR.SSM_G_A: ( + "model.layers.{bid}.self_attn.g_a_proj", + ), + MODEL_TENSOR.SSM_G_B: ( + "model.layers.{bid}.self_attn.g_b_proj", + ), + MODEL_TENSOR.SSM_DT_B: ( + "model.layers.{bid}.self_attn.dt_bias", + ), + # NextN/MTP tensors for GLM4_MOE MODEL_TENSOR.NEXTN_EH_PROJ: ( "model.layers.{bid}.eh_proj", diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index fbd538109b..fbfcf05c70 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -82,6 +82,7 @@ add_library(llama models/internlm2.cpp models/jais.cpp models/jamba.cpp + models/kimi-linear.cpp models/lfm2.cpp models/llada-moe.cpp models/llada.cpp From 6167f39e08173783b8564bdeafb295aa0ddc4574 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Tue, 2 Dec 2025 11:14:34 +0800 Subject: [PATCH 04/58] Kimi Linear ggml.h --- ggml/include/ggml.h | 23 +++++++++++++++++++++++ gguf-py/gguf/tensor_mapping.py | 1 + 2 files changed, 24 insertions(+) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 48da68fe7e..888f00c2e8 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -539,6 +539,7 @@ extern "C" { GGML_OP_FLASH_ATTN_BACK, GGML_OP_SSM_CONV, GGML_OP_SSM_SCAN, + GGML_OP_KDA_SCAN, GGML_OP_WIN_PART, GGML_OP_WIN_UNPART, GGML_OP_GET_REL_POS, @@ -2337,6 +2338,28 @@ extern "C" { struct ggml_tensor * C, struct ggml_tensor * ids); + // KDA (Kimi Delta Attention) scan + // Delta attention recurrence: + // h[t] = exp(g[t]) * h[t-1] + k[t]^T * (beta[t] * (v[t] - h[t-1] @ k[t])) + // o[t] = q[t]^T @ h[t] + // Parameters: + // h: hidden state {head_dim, head_dim, n_head, n_seqs+} + // q: query {head_dim, n_head, n_seq_tokens, n_seqs} + // k: key {head_dim, n_head, n_seq_tokens, n_seqs} + // v: value {head_dim, n_head, n_seq_tokens, n_seqs} + // g: gate {head_dim, n_head, n_seq_tokens, n_seqs} + // beta: mixing {n_head, n_seq_tokens, n_seqs} + // ids: seq indices {n_seqs} + GGML_API struct ggml_tensor * ggml_kda_scan( + struct ggml_context * ctx, + struct ggml_tensor * h, + struct ggml_tensor * q, + struct ggml_tensor * k, + struct ggml_tensor * v, + struct ggml_tensor * g, + struct ggml_tensor * beta, + struct ggml_tensor * ids); + // partition into non-overlapping windows with padding if needed // example: // a: 768 64 64 1 diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index cfe541fc41..8774ef2dfd 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -498,6 +498,7 @@ class TensorNameMap: "model.layers.{bid}.mlp.shared_experts.gate_proj", # deepseek deepseek2 "model.layers.{bid}.feed_forward.shared_expert.gate_proj", # llama4 "model.layers.{bid}.mlp.shared_mlp.gate_proj", # hunyuan + "model.layers.{bid}.block_sparse_moe.shared_experts.gate_proj", # kimi ), MODEL_TENSOR.FFN_GATE_CHEXP: ( From 26a6553155cb735c67b1db01f3901404ee0b8c9e Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Tue, 2 Dec 2025 11:20:46 +0800 Subject: [PATCH 05/58] kimi linear ggml-cpu --- ggml/src/ggml-cpu/ggml-cpu.c | 5 + ggml/src/ggml-cpu/ops.cpp | 196 +++++++++++++++++++++++++++++++++++ ggml/src/ggml-cpu/ops.h | 1 + 3 files changed, 202 insertions(+) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 3247af8bb0..7b40f1e8c2 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -1962,6 +1962,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_ssm_scan(params, tensor); } break; + case GGML_OP_KDA_SCAN: + { + ggml_compute_forward_kda_scan(params, tensor); + } break; case 
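// Illustrative only: a minimal host-side sketch of wiring the new GGML_OP_KDA_SCAN op
// into a graph, assuming this patch series is applied (ggml_kda_scan is declared in
// ggml.h above). Tensor shapes follow the op documentation; the sizes below and the
// function name are placeholders, not part of the patch.
#include "ggml.h"
#include <string.h>

static void kda_scan_example(void) {
    const int64_t D = 128, H = 4, T = 8, S = 1; // head_dim, n_head, n_seq_tokens, n_seqs

    struct ggml_init_params ip = { /*.mem_size =*/ 256u*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * h    = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, D, D, H, S); // recurrent state
    struct ggml_tensor * q    = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, D, H, T, S);
    struct ggml_tensor * k    = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, D, H, T, S);
    struct ggml_tensor * v    = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, D, H, T, S);
    struct ggml_tensor * g    = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, D, H, T, S); // log-decay per channel
    struct ggml_tensor * beta = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, H, T, S);
    struct ggml_tensor * ids  = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, S);

    memset(h->data, 0, ggml_nbytes(h));     // start from an empty recurrent state
    ((int32_t *) ids->data)[0] = 0;         // sequence 0 reads slot 0 of the state tensor
    // ... fill q, k, v, g, beta with real activations here ...

    // dst packs the per-token outputs first, followed by the updated states
    struct ggml_tensor * out = ggml_kda_scan(ctx, h, q, k, v, g, beta, ids);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, out);
    // ... evaluate the graph with the CPU backend, then read back out->data ...

    ggml_free(ctx);
}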
GGML_OP_WIN_PART: { ggml_compute_forward_win_part(params, tensor); @@ -2320,6 +2324,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { case GGML_OP_FLASH_ATTN_BACK: case GGML_OP_SSM_CONV: case GGML_OP_SSM_SCAN: + case GGML_OP_KDA_SCAN: case GGML_OP_RWKV_WKV6: case GGML_OP_GATED_LINEAR_ATTN: case GGML_OP_RWKV_WKV7: diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 608e82af69..9c93e0c101 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -8686,6 +8686,9 @@ static void ggml_compute_forward_ssm_conv_f32( const int ir1 = MIN(ir0 + dr, nr); const int ir = ir1 - ir0; + static int conv_debug_count = 0; + bool do_conv_debug = false; // (ith == 0 && conv_debug_count++ < 3); + for (int i3 = 0; i3 < n_s; ++i3) { for (int i2 = 0; i2 < n_t; ++i2) { // {d_conv - 1 + n_t, d_inner, n_seqs} @@ -8706,6 +8709,13 @@ static void ggml_compute_forward_ssm_conv_f32( sumf += s[i0 + i1*ncs] * c[i0 + i1*nc]; } x[i1] = sumf; + + // Debug output + if (do_conv_debug && i1 == 0 && i2 == 0 && i3 == 0) { + fprintf(stderr, "DEBUG SSM_CONV: nc=%d, nr=%d, n_t=%d, n_s=%d\n", nc, nr, n_t, n_s); + fprintf(stderr, "DEBUG SSM_CONV: s[0..3]=%f,%f,%f,%f, c[0..3]=%f,%f,%f,%f, x[0]=%f\n", + s[0], s[1], s[2], s[3], c[0], c[1], c[2], c[3], x[0]); + } } } } @@ -8956,6 +8966,192 @@ void ggml_compute_forward_ssm_scan( } } +// ggml_compute_forward_kda_scan +// KDA (Kimi Delta Attention) recurrence: +// h[t] = exp(g[t]) * h[t-1] + k[t]^T * (beta[t] * (v[t] - h[t-1] @ k[t])) +// o[t] = q[t]^T @ h[t] + +static void ggml_compute_forward_kda_scan_f32( + const ggml_compute_params * params, + ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; // h {head_dim, head_dim, n_head, n_seqs+} + const ggml_tensor * src1 = dst->src[1]; // q {head_dim, n_head, n_seq_tokens, n_seqs} + const ggml_tensor * src2 = dst->src[2]; // k {head_dim, n_head, n_seq_tokens, n_seqs} + const ggml_tensor * src3 = dst->src[3]; // v {head_dim, n_head, n_seq_tokens, n_seqs} + const ggml_tensor * src4 = dst->src[4]; // g {head_dim, n_head, n_seq_tokens, n_seqs} + const ggml_tensor * src5 = dst->src[5]; // beta {n_head, n_seq_tokens, n_seqs} + const ggml_tensor * src6 = dst->src[6]; // ids {n_seqs} + + const int ith = params->ith; + const int nth = params->nth; + + const int64_t head_dim = src0->ne[0]; + const int64_t n_head = src1->ne[1]; + const int64_t n_seq_tokens = src1->ne[2]; + const int64_t n_seqs = src1->ne[3]; + + // Output offset for hidden state + const int64_t y_off = ggml_nelements(src1) * sizeof(float); + + GGML_ASSERT(src0->nb[0] == sizeof(float)); + GGML_ASSERT(src1->nb[0] == sizeof(float)); + GGML_ASSERT(src2->nb[0] == sizeof(float)); + GGML_ASSERT(src3->nb[0] == sizeof(float)); + GGML_ASSERT(src4->nb[0] == sizeof(float)); + GGML_ASSERT(src5->nb[0] == sizeof(float)); + GGML_ASSERT(src6->nb[0] == sizeof(int32_t)); + + // Parallelize over heads + const int dh = (n_head + nth - 1) / nth; + const int ih0 = dh * ith; + const int ih1 = MIN(ih0 + dh, (int)n_head); + + const int32_t * ids = (const int32_t *) src6->data; + + // Temporary buffer for h @ k computation + float * hk_buf = (float *) malloc(head_dim * sizeof(float)); + + static int debug_count = 0; + bool do_debug = false; // (ith == 0 && debug_count++ < 20); + + for (int i3 = 0; i3 < n_seqs; ++i3) { + // Get initial hidden state for this sequence + const float * h0 = (const float *) ((const char *) src0->data + ids[i3] * src0->nb[3]); + // Output hidden state location + float * h_out = (float *) ((char *) dst->data + i3 * 
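// Layout sketch for dst as used by this kernel (assumption: src0 is a contiguous F32
// state tensor, so src0->nb[3] == head_dim*head_dim*n_head*sizeof(float)): the first
// ggml_nelements(src1) floats hold the per-token outputs, the updated states follow.
// Helper names are illustrative only and mirror the pointer arithmetic below.
static inline int64_t kda_y_index(int64_t j, int64_t head, int64_t token, int64_t seq,
                                  int64_t head_dim, int64_t n_head, int64_t n_seq_tokens) {
    return ((seq * n_seq_tokens + token) * n_head + head) * head_dim + j;
}
static inline int64_t kda_state_index(int64_t i, int64_t j, int64_t head, int64_t seq,
                                      int64_t head_dim, int64_t n_head,
                                      int64_t n_y_floats /* = head_dim*n_head*n_seq_tokens*n_seqs */) {
    return n_y_floats + ((seq * n_head + head) * head_dim + i) * head_dim + j;
}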
src0->nb[3] + y_off); + + for (int ih = ih0; ih < ih1; ++ih) { + // Per-head hidden state: [head_dim, head_dim] + // Copy initial state to output (will be updated in place) + const float * h_in = h0 + ih * head_dim * head_dim; + float * h = h_out + ih * head_dim * head_dim; + + // Copy initial state, but check for invalid values and clear if needed + bool need_clear = false; + for (int i = 0; i < head_dim * head_dim && !need_clear; ++i) { + if (!isfinite(h_in[i]) || fabsf(h_in[i]) > 1e6f) { + need_clear = true; + } + } + for (int i = 0; i < head_dim * head_dim; ++i) { + h[i] = need_clear ? 0.0f : h_in[i]; + } + + for (int it = 0; it < n_seq_tokens; ++it) { + const float * q_raw = (const float *) ((const char *) src1->data + + it * src1->nb[2] + i3 * src1->nb[3]) + ih * head_dim; + const float * k_raw = (const float *) ((const char *) src2->data + + it * src2->nb[2] + i3 * src2->nb[3]) + ih * head_dim; + const float * v = (const float *) ((const char *) src3->data + + it * src3->nb[2] + i3 * src3->nb[3]) + ih * head_dim; + const float * g = (const float *) ((const char *) src4->data + + it * src4->nb[2] + i3 * src4->nb[3]) + ih * head_dim; + const float beta = ((const float *) ((const char *) src5->data + + it * src5->nb[1] + i3 * src5->nb[2]))[ih]; + + float * y = (float *) dst->data + + it * n_head * head_dim + i3 * n_seq_tokens * n_head * head_dim + ih * head_dim; + + // L2 normalize q and k (critical for KDA stability) + float q_norm = 0.0f, k_norm = 0.0f; + for (int i = 0; i < head_dim; ++i) { + q_norm += q_raw[i] * q_raw[i]; + k_norm += k_raw[i] * k_raw[i]; + } + q_norm = sqrtf(q_norm + 1e-6f); + k_norm = sqrtf(k_norm + 1e-6f); + + // Debug output + if (do_debug && ih == 0 && it == 0 && i3 == 0) { + fprintf(stderr, "DEBUG KDA: q_raw[0]=%f, k_raw[0]=%f, v[0]=%f, g[0]=%f, beta=%f\n", + q_raw[0], k_raw[0], v[0], g[0], beta); + fprintf(stderr, "DEBUG KDA: q_norm=%f, k_norm=%f, exp(g[0])=%f, scale=%f\n", + q_norm, k_norm, expf(g[0]), 1.0f / sqrtf((float)head_dim)); + } + + // Normalized q and k with scale = 1/sqrt(head_dim) + // Note: scale is applied only to q after L2 normalization + const float scale = 1.0f / sqrtf((float)head_dim); + float q[128], k[128]; // assume head_dim <= 128 + for (int i = 0; i < head_dim; ++i) { + // L2 normalize then scale q + q[i] = (q_raw[i] / q_norm) * scale; + // L2 normalize k (no scale) + k[i] = k_raw[i] / k_norm; + } + + // KDA recurrence: h[t] = exp(g[t]) * h[t-1] + k[t]^T * (beta[t] * (v[t] - h[t-1] @ k[t])) + // Note: Apply decay first, then compute retrieval and update + + // Step 1: Apply decay to h first: h = h * exp(g) + for (int i = 0; i < head_dim; ++i) { + const float exp_gi = expf(g[i]); + for (int j = 0; j < head_dim; ++j) { + h[i * head_dim + j] *= exp_gi; + } + } + + // Step 2: Compute h^T @ k -> hk_buf [head_dim] + // hk_buf[j] = sum_i (h[i,j] * k[i]) which is column j of h dotted with k + for (int j = 0; j < head_dim; ++j) { + float sum = 0.0f; + for (int i = 0; i < head_dim; ++i) { + sum += h[i * head_dim + j] * k[i]; + } + hk_buf[j] = sum; + } + + // Step 3: Compute delta = beta * (v - hk) and update h + // h = h + outer(k, delta) where outer(k,delta)[i,j] = k[i] * delta[j] + for (int i = 0; i < head_dim; ++i) { + for (int j = 0; j < head_dim; ++j) { + const float delta_j = beta * (v[j] - hk_buf[j]); + h[i * head_dim + j] += k[i] * delta_j; + } + } + + // Step 4: Compute output y = h^T @ q -> [head_dim] + // vLLM: b_o = tl.sum(b_h * b_q[:, None], 0) means o[j] = sum_i(h[i,j] * q[i]) + for (int j = 0; j < head_dim; ++j) { + float sum = 
0.0f; + for (int i = 0; i < head_dim; ++i) { + sum += h[i * head_dim + j] * q[i]; + } + y[j] = sum; + } + + // Debug output + if (do_debug && ih == 0 && it == 0 && i3 == 0) { + // Find max abs value in h for stability check + float h_max = 0.0f; + for (int i = 0; i < head_dim * head_dim; i++) { + if (fabsf(h[i]) > h_max) h_max = fabsf(h[i]); + } + fprintf(stderr, "DEBUG KDA: y[0]=%.6f, h_max=%.6f, exp(g[0])=%.6f\n", + y[0], h_max, expf(g[0])); + } + } + } + } + + free(hk_buf); +} + +void ggml_compute_forward_kda_scan( + const ggml_compute_params * params, + ggml_tensor * dst) { + switch (dst->src[0]->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_kda_scan_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + // ggml_compute_forward_win_part static void ggml_compute_forward_win_part_f32( diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h index 0fdfee7976..080cf6e090 100644 --- a/ggml/src/ggml-cpu/ops.h +++ b/ggml/src/ggml-cpu/ops.h @@ -92,6 +92,7 @@ void ggml_compute_forward_flash_attn_back( struct ggml_tensor * dst); void ggml_compute_forward_ssm_conv(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_ssm_scan(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_kda_scan(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_win_part(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_win_unpart(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_unary(const struct ggml_compute_params * params, struct ggml_tensor * dst); From bf42bc06069f67a520bb9c4d5793e93000df6df9 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Tue, 2 Dec 2025 11:24:37 +0800 Subject: [PATCH 06/58] Kimi Linear ggml-cuda --- ggml/src/ggml-cuda/ggml-cuda.cu | 9 ++ ggml/src/ggml-cuda/kda-scan.cu | 209 ++++++++++++++++++++++++++++++++ ggml/src/ggml-cuda/kda-scan.cuh | 3 + 3 files changed, 221 insertions(+) create mode 100644 ggml/src/ggml-cuda/kda-scan.cu create mode 100644 ggml/src/ggml-cuda/kda-scan.cuh diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index eb2e273110..8e0d7d916e 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -41,6 +41,7 @@ #include "ggml-cuda/softmax.cuh" #include "ggml-cuda/ssm-conv.cuh" #include "ggml-cuda/ssm-scan.cuh" +#include "ggml-cuda/kda-scan.cuh" #include "ggml-cuda/sum.cuh" #include "ggml-cuda/sumrows.cuh" #include "ggml-cuda/mean.cuh" @@ -2692,6 +2693,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_SSM_SCAN: ggml_cuda_op_ssm_scan(ctx, dst); break; + case GGML_OP_KDA_SCAN: + ggml_cuda_op_kda_scan(ctx, dst); + break; case GGML_OP_ARGSORT: ggml_cuda_op_argsort(ctx, dst); break; @@ -4503,6 +4507,11 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g return op->src[0]->ne[0] == 16 && op->src[0]->ne[1] == 1 && op->src[0]->ne[2] % 128 == 0 && op->src[4]->ne[1] == 1; } } + case GGML_OP_KDA_SCAN: { + // KDA scan kernel supports head_dim 64 or 128 + const int64_t head_dim = op->src[0]->ne[0]; + return head_dim == 64 || head_dim == 128; + } case GGML_OP_SSM_CONV: { // assumes d_inner % threads == 0 return op->src[0]->ne[1] % 128 == 0; diff --git a/ggml/src/ggml-cuda/kda-scan.cu b/ggml/src/ggml-cuda/kda-scan.cu new file mode 100644 index 0000000000..5763f1cc90 --- /dev/null +++ 
b/ggml/src/ggml-cuda/kda-scan.cu @@ -0,0 +1,209 @@ +#include "kda-scan.cuh" + +// KDA (Kimi Delta Attention) scan CUDA kernel +// Recurrence: +// h[t] = exp(g[t]) * h[t-1] + k[t]^T * (beta[t] * (v[t] - h[t-1] @ k[t])) +// o[t] = q[t]^T @ h[t] +// +// This kernel uses global memory for the hidden state to avoid shared memory limits. +// Each block processes one head for one sequence. + +__global__ void kda_scan_f32_kernel( + const float * __restrict__ src0, // h: [head_dim, head_dim, n_head, n_seqs+] + const float * __restrict__ src1, // q: [head_dim, n_head, n_seq_tokens, n_seqs] + const float * __restrict__ src2, // k: [head_dim, n_head, n_seq_tokens, n_seqs] + const float * __restrict__ src3, // v: [head_dim, n_head, n_seq_tokens, n_seqs] + const float * __restrict__ src4, // g: [head_dim, n_head, n_seq_tokens, n_seqs] + const float * __restrict__ src5, // beta: [n_head, n_seq_tokens, n_seqs] + const int32_t * __restrict__ src6, // ids: [n_seqs] + float * __restrict__ dst, + const int64_t head_dim, + const int64_t n_head, + const int64_t n_seq_tokens, + const int64_t n_seqs, + const int64_t y_off) // offset to state output in dst (in floats) +{ + // Each block handles one head for one sequence + const int seq_idx = blockIdx.x / n_head; + const int head_idx = blockIdx.x % n_head; + const int tid = threadIdx.x; + const int n_threads = blockDim.x; + + if (seq_idx >= n_seqs || head_idx >= n_head) return; + + // Get sequence ID for initial state + const int src_seq = src6[seq_idx]; + + // Shared memory for temporary buffers + extern __shared__ float smem[]; + float * hk_buf = smem; // [head_dim] - h @ k buffer + float * q_norm = smem + head_dim; // [head_dim] - normalized q + float * k_norm = q_norm + head_dim; // [head_dim] - normalized k + float * warp_sums = k_norm + head_dim; // [64] - for reductions + + // Pointers to input/output data for this head + const int64_t h_stride_head = head_dim * head_dim; + const int64_t h_stride_seq = h_stride_head * n_head; + const int64_t qkv_stride_head = head_dim; + const int64_t qkv_stride_token = head_dim * n_head; + const int64_t qkv_stride_seq = qkv_stride_token * n_seq_tokens; + const int64_t beta_stride_token = n_head; + const int64_t beta_stride_seq = beta_stride_token * n_seq_tokens; + + const float * h_in = src0 + src_seq * h_stride_seq + head_idx * h_stride_head; + float * h_out = dst + y_off + seq_idx * h_stride_seq + head_idx * h_stride_head; + float * y_out = dst + seq_idx * qkv_stride_seq + head_idx * qkv_stride_head; + + // Copy initial state to output (we'll update in place) + for (int i = tid; i < head_dim * head_dim; i += n_threads) { + float val = h_in[i]; + if (!isfinite(val) || fabsf(val) > 1e6f) { + val = 0.0f; + } + h_out[i] = val; + } + __syncthreads(); + + const float scale = 1.0f / sqrtf((float)head_dim); + + // Process each token sequentially + for (int t = 0; t < n_seq_tokens; ++t) { + const float * q_raw = src1 + t * qkv_stride_token + seq_idx * qkv_stride_seq + head_idx * qkv_stride_head; + const float * k_raw = src2 + t * qkv_stride_token + seq_idx * qkv_stride_seq + head_idx * qkv_stride_head; + const float * v = src3 + t * qkv_stride_token + seq_idx * qkv_stride_seq + head_idx * qkv_stride_head; + const float * g = src4 + t * qkv_stride_token + seq_idx * qkv_stride_seq + head_idx * qkv_stride_head; + const float beta = src5[t * beta_stride_token + seq_idx * beta_stride_seq + head_idx]; + float * y = y_out + t * qkv_stride_token; + + // Step 1: L2 normalize q and k + float q_sq_sum = 0.0f, k_sq_sum = 0.0f; + for (int i = 
tid; i < head_dim; i += n_threads) { + q_sq_sum += q_raw[i] * q_raw[i]; + k_sq_sum += k_raw[i] * k_raw[i]; + } + + // Warp reduction + for (int offset = warpSize/2; offset > 0; offset /= 2) { + q_sq_sum += __shfl_down_sync(0xffffffff, q_sq_sum, offset); + k_sq_sum += __shfl_down_sync(0xffffffff, k_sq_sum, offset); + } + + // Cross-warp reduction + int warp_id = tid / warpSize; + int lane_id = tid % warpSize; + if (lane_id == 0 && warp_id < 32) { + warp_sums[warp_id] = q_sq_sum; + warp_sums[32 + warp_id] = k_sq_sum; + } + __syncthreads(); + + if (tid == 0) { + float total_q = 0.0f, total_k = 0.0f; + for (int i = 0; i < (n_threads + warpSize - 1) / warpSize; ++i) { + total_q += warp_sums[i]; + total_k += warp_sums[32 + i]; + } + warp_sums[0] = rsqrtf(total_q + 1e-6f) * scale; + warp_sums[1] = rsqrtf(total_k + 1e-6f); + } + __syncthreads(); + + float q_norm_factor = warp_sums[0]; + float k_norm_factor = warp_sums[1]; + + // Store normalized q and k + for (int i = tid; i < head_dim; i += n_threads) { + q_norm[i] = q_raw[i] * q_norm_factor; + k_norm[i] = k_raw[i] * k_norm_factor; + } + __syncthreads(); + + // KDA recurrence: h[t] = exp(g[t]) * h[t-1] + k[t]^T * (beta[t] * (v[t] - h[t-1] @ k[t])) + // Apply decay first, then compute retrieval and update + + // Step 2: Apply decay to h: h = h * exp(g) + for (int idx = tid; idx < head_dim * head_dim; idx += n_threads) { + int i = idx / head_dim; + float exp_gi = expf(g[i]); + h_out[idx] *= exp_gi; + } + __syncthreads(); + + // Step 3: Compute h^T @ k -> hk_buf + for (int j = tid; j < head_dim; j += n_threads) { + float sum = 0.0f; + for (int i = 0; i < head_dim; ++i) { + sum += h_out[i * head_dim + j] * k_norm[i]; + } + hk_buf[j] = sum; + } + __syncthreads(); + + // Step 4: Update h: h = h + outer(k, beta * (v - hk)) + for (int idx = tid; idx < head_dim * head_dim; idx += n_threads) { + int i = idx / head_dim; + int j = idx % head_dim; + float delta_j = beta * (v[j] - hk_buf[j]); + h_out[idx] += k_norm[i] * delta_j; + } + __syncthreads(); + + // Step 5: Compute output y = h^T @ q + for (int j = tid; j < head_dim; j += n_threads) { + float sum = 0.0f; + for (int i = 0; i < head_dim; ++i) { + sum += h_out[i * head_dim + j] * q_norm[i]; + } + y[j] = sum; + } + __syncthreads(); + } +} + +void ggml_cuda_op_kda_scan(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; // h + const ggml_tensor * src1 = dst->src[1]; // q + const ggml_tensor * src2 = dst->src[2]; // k + const ggml_tensor * src3 = dst->src[3]; // v + const ggml_tensor * src4 = dst->src[4]; // g + const ggml_tensor * src5 = dst->src[5]; // beta + const ggml_tensor * src6 = dst->src[6]; // ids + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT(src2->type == GGML_TYPE_F32); + GGML_ASSERT(src3->type == GGML_TYPE_F32); + GGML_ASSERT(src4->type == GGML_TYPE_F32); + GGML_ASSERT(src5->type == GGML_TYPE_F32); + GGML_ASSERT(src6->type == GGML_TYPE_I32); + + const int64_t head_dim = src0->ne[0]; + const int64_t n_head = src1->ne[1]; + const int64_t n_seq_tokens = src1->ne[2]; + const int64_t n_seqs = src1->ne[3]; + + // Output offset for hidden state (after attention output) - in floats + const int64_t y_off = ggml_nelements(src1); + + const float * h_d = (const float *)src0->data; + const float * q_d = (const float *)src1->data; + const float * k_d = (const float *)src2->data; + const float * v_d = (const float *)src3->data; + const float * g_d = (const float *)src4->data; + const float * beta_d = (const 
float *)src5->data; + const int32_t * ids_d = (const int32_t *)src6->data; + float * dst_d = (float *)dst->data; + + cudaStream_t stream = ctx.stream(); + + // Launch kernel: one block per (sequence, head) pair + const int n_blocks = n_seqs * n_head; + const int n_threads = 128; + + // Shared memory: hk_buf[head_dim] + q_norm[head_dim] + k_norm[head_dim] + warp_sums[64] + size_t smem_size = (3 * head_dim + 64) * sizeof(float); + + kda_scan_f32_kernel<<<n_blocks, n_threads, smem_size, stream>>>( + h_d, q_d, k_d, v_d, g_d, beta_d, ids_d, dst_d, + head_dim, n_head, n_seq_tokens, n_seqs, y_off); +} diff --git a/ggml/src/ggml-cuda/kda-scan.cuh b/ggml/src/ggml-cuda/kda-scan.cuh new file mode 100644 index 0000000000..55783fb82b --- /dev/null +++ b/ggml/src/ggml-cuda/kda-scan.cuh @@ -0,0 +1,3 @@ +#include "common.cuh" + +void ggml_cuda_op_kda_scan(ggml_backend_cuda_context & ctx, ggml_tensor * dst); From d73d3e51a51d3a58e62471c058f58a1dd4dbae2b Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Tue, 2 Dec 2025 11:27:57 +0800 Subject: [PATCH 07/58] Kimi Linear ggml.c --- ggml/src/ggml.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 66 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 17cf4d84bb..8bf562e8b1 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -999,6 +999,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "FLASH_ATTN_BACK", "SSM_CONV", "SSM_SCAN", + "KDA_SCAN", "WIN_PART", "WIN_UNPART", "GET_REL_POS", @@ -1024,7 +1025,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "GLU", }; -static_assert(GGML_OP_COUNT == 95, "GGML_OP_COUNT != 95"); +static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -5434,6 +5435,70 @@ struct ggml_tensor * ggml_ssm_scan( return result; } +// ggml_kda_scan + +struct ggml_tensor * ggml_kda_scan( + struct ggml_context * ctx, + struct ggml_tensor * h, + struct ggml_tensor * q, + struct ggml_tensor * k, + struct ggml_tensor * v, + struct ggml_tensor * g, + struct ggml_tensor * beta, + struct ggml_tensor * ids) { + GGML_ASSERT(ggml_is_contiguous(h)); + GGML_ASSERT(ggml_is_contiguous(q)); + GGML_ASSERT(ggml_is_contiguous(k)); + GGML_ASSERT(ggml_is_contiguous(v)); + GGML_ASSERT(ggml_is_contiguous(g)); + GGML_ASSERT(ggml_is_contiguous(beta)); + GGML_ASSERT(ids->type == GGML_TYPE_I32); + + { + const int64_t head_dim = h->ne[0]; + const int64_t n_head = q->ne[1]; + const int64_t n_seq_tokens = q->ne[2]; + const int64_t n_seqs = q->ne[3]; + + GGML_ASSERT(h->ne[0] == head_dim); + GGML_ASSERT(h->ne[1] == head_dim); + GGML_ASSERT(h->ne[2] == n_head); + GGML_ASSERT(q->ne[0] == head_dim); + GGML_ASSERT(k->ne[0] == head_dim); + GGML_ASSERT(v->ne[0] == head_dim); + GGML_ASSERT(g->ne[0] == head_dim); + GGML_ASSERT(ggml_are_same_shape(q, k)); + GGML_ASSERT(ggml_are_same_shape(q, v)); + GGML_ASSERT(ggml_are_same_shape(q, g)); + GGML_ASSERT(beta->ne[0] == n_head); + GGML_ASSERT(beta->ne[1] == n_seq_tokens); + GGML_ASSERT(beta->ne[2] == n_seqs); + GGML_ASSERT(ids->ne[0] == n_seqs); + GGML_ASSERT(ggml_is_vector(ids)); + } + + // Output: y (attention output) + updated hidden states + // y: {head_dim, n_head, n_seq_tokens, n_seqs} + // h_new: {head_dim, head_dim, n_head, n_seqs} + const int64_t head_dim = h->ne[0]; + const int64_t n_head = q->ne[1]; + const int64_t n_seq_tokens = q->ne[2]; + const int64_t n_seqs = q->ne[3]; + struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, + ggml_nelements(q) + head_dim * head_dim * n_head * n_seqs); + + result->op = GGML_OP_KDA_SCAN;
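+ // the flat F32 result packs the attention output first, followed by the updated recurrent state:
+ //   y     : first ggml_nelements(q) floats, viewed as {head_dim, n_head, n_seq_tokens, n_seqs}
+ //   h_new : next head_dim*head_dim*n_head*n_seqs floats, starting at the float offset that the
+ //           CPU and CUDA backends call y_off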
+ result->src[0] = h; + result->src[1] = q; + result->src[2] = k; + result->src[3] = v; + result->src[4] = g; + result->src[5] = beta; + result->src[6] = ids; + + return result; +} + // ggml_win_part struct ggml_tensor * ggml_win_part( From e308026f647342f0553cf734d028b8be61a8ed48 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Tue, 2 Dec 2025 12:02:35 +0800 Subject: [PATCH 08/58] kimi linear src/llama --- src/llama-arch.cpp | 64 ++++++++++++++ src/llama-arch.h | 12 +++ src/llama-context.cpp | 2 +- src/llama-graph.cpp | 13 +-- src/llama-hparams.cpp | 14 +++ src/llama-hparams.h | 4 + src/llama-model.cpp | 195 ++++++++++++++++++++++++++++++++++++++++++ src/llama-model.h | 18 ++++ src/llama-quant.cpp | 4 +- src/llama-vocab.cpp | 41 +++++---- 10 files changed, 343 insertions(+), 24 deletions(-) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index e12c8b9250..ab09bb7eb7 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -112,6 +112,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_RND1, "rnd1" }, { LLM_ARCH_PANGU_EMBED, "pangu-embedded" }, { LLM_ARCH_MISTRAL3, "mistral3" }, + { LLM_ARCH_KIMI_LINEAR, "kimi-linear" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -2540,6 +2541,54 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, }, }, + { + LLM_ARCH_KIMI_LINEAR, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + // Dense FFN (layer 0 only) + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + // MoE FFN (layers 1+) + { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, + { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, + { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, + { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, + { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" }, + // Shared experts + { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" }, + { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" }, + { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, + // KDA (using SSM_ enum prefix, keeping GGUF names for backward compat) + { LLM_TENSOR_SSM_CONV1D_Q, "blk.%d.ssm_conv1d_q" }, + { LLM_TENSOR_SSM_CONV1D_K, "blk.%d.ssm_conv1d_k" }, + { LLM_TENSOR_SSM_CONV1D_V, "blk.%d.ssm_conv1d_v" }, + { LLM_TENSOR_SSM_F_A, "blk.%d.ssm_f_a" }, + { LLM_TENSOR_SSM_F_B, "blk.%d.ssm_f_b" }, + { LLM_TENSOR_SSM_BETA, "blk.%d.ssm_beta" }, + { LLM_TENSOR_SSM_A_LOG, "blk.%d.ssm_a" }, + { LLM_TENSOR_SSM_DT_B, "blk.%d.ssm_dt" }, + { LLM_TENSOR_SSM_G_A, "blk.%d.ssm_g_a" }, + { LLM_TENSOR_SSM_G_B, "blk.%d.ssm_g_b" }, + { LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" }, + // MLA + { LLM_TENSOR_ATTN_Q_A, "blk.%d.attn_q_a" }, + { LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" }, + { LLM_TENSOR_ATTN_Q_A_NORM, "blk.%d.attn_q_a_norm" }, + { LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" }, + { LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" }, + { LLM_TENSOR_ATTN_KV_A_NORM, "blk.%d.attn_kv_a_norm" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -2644,6 +2693,17 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_SSM_C_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_SSM_D, {LLM_TENSOR_LAYER_REPEATING, 
GGML_OP_MUL}}, {LLM_TENSOR_SSM_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + // Kimi KDA - Conv tensors are 4D [d_conv, 1, d_inner, 1], reshaped to 2D at runtime + {LLM_TENSOR_SSM_CONV1D_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_SSM_CONV1D_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_SSM_CONV1D_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_SSM_F_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_SSM_F_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_SSM_BETA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_SSM_A_LOG, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_SSM_DT_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, + {LLM_TENSOR_SSM_G_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_SSM_G_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_TIME_MIX_LERP_X, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_TIME_MIX_LN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_CHANNEL_MIX_LERP_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, @@ -2801,6 +2861,7 @@ bool llm_arch_is_recurrent(const llm_arch & arch) { case LLM_ARCH_RWKV6QWEN2: case LLM_ARCH_RWKV7: case LLM_ARCH_ARWKV7: + case LLM_ARCH_KIMI_LINEAR: // KDA layers use delta attention with recurrent state return true; default: return false; @@ -2817,6 +2878,9 @@ bool llm_arch_is_hybrid(const llm_arch & arch) { case LLM_ARCH_LFM2MOE: case LLM_ARCH_NEMOTRON_H: case LLM_ARCH_QWEN3NEXT: + // Kimi: Currently using recurrent-only mode since MLA doesn't use KV cache + // TODO: Enable hybrid when MLA KV caching is implemented + // case LLM_ARCH_KIMI_LINEAR: return true; default: return false; diff --git a/src/llama-arch.h b/src/llama-arch.h index 438963cef0..2b965850c5 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -116,6 +116,7 @@ enum llm_arch { LLM_ARCH_RND1, LLM_ARCH_PANGU_EMBED, LLM_ARCH_MISTRAL3, + LLM_ARCH_KIMI_LINEAR, LLM_ARCH_UNKNOWN, }; @@ -385,6 +386,17 @@ enum llm_tensor { LLM_TENSOR_SSM_NORM, LLM_TENSOR_SSM_OUT, LLM_TENSOR_SSM_BETA_ALPHA, // qwen3next + // Kimi Linear KDA (using SSM_ prefix for consistency) + LLM_TENSOR_SSM_CONV1D_Q, // kimi: Q conv1d weight + LLM_TENSOR_SSM_CONV1D_K, // kimi: K conv1d weight + LLM_TENSOR_SSM_CONV1D_V, // kimi: V conv1d weight + LLM_TENSOR_SSM_F_A, // kimi: forget gate projection A + LLM_TENSOR_SSM_F_B, // kimi: forget gate projection B + LLM_TENSOR_SSM_BETA, // kimi: beta mixing coefficient + LLM_TENSOR_SSM_A_LOG, // kimi: A_log (pre-converted in GGUF) + LLM_TENSOR_SSM_DT_B, // kimi: dt bias + LLM_TENSOR_SSM_G_A, // kimi: output gate projection A + LLM_TENSOR_SSM_G_B, // kimi: output gate projection B LLM_TENSOR_TIME_MIX_W0, LLM_TENSOR_TIME_MIX_W1, LLM_TENSOR_TIME_MIX_W2, diff --git a/src/llama-context.cpp b/src/llama-context.cpp index e04f0fc4f9..3278cf2ef8 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1387,7 +1387,7 @@ void llama_context::output_reorder() { // uint32_t llama_context::graph_max_nodes() const { - if (model.arch == LLM_ARCH_QWEN3NEXT) { + if (model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_KIMI_LINEAR) { return std::max(8192u, 32u*model.n_tensors()); } return std::max(1024u, 8u*model.n_tensors()); diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 42ccb5b76a..e41d65398f 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1816,11 +1816,14 @@ ggml_tensor * llm_graph_context::build_rs( ggml_build_forward_expand(gf, output_states); // copy extra states which won't be changed further 
(between n_seqs and n_rs) - ggml_tensor * states_extra = ggml_get_rows(ctx0, states, state_copy_extra); - ggml_build_forward_expand(gf, - ggml_cpy(ctx0, - states_extra, - ggml_view_1d(ctx0, s, state_size*(n_rs - n_seqs), (rs_head + n_seqs)*state_size*ggml_element_size(s)))); + // Skip if there are no extra states to copy (n_rs == n_seqs) + if (arch != LLM_ARCH_KIMI_LINEAR || n_rs > n_seqs) { // arch check for backward compat + ggml_tensor * states_extra = ggml_get_rows(ctx0, states, state_copy_extra); + ggml_build_forward_expand(gf, + ggml_cpy(ctx0, + states_extra, + ggml_view_1d(ctx0, s, state_size*(n_rs - n_seqs), (rs_head + n_seqs)*state_size*ggml_element_size(s)))); + } return output_states; } diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index 8cdbaf69fc..88d266b8da 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -133,6 +133,13 @@ uint32_t llama_hparams::n_embd_r() const { return n_embd * (n_shortconv_l_cache - 1); } + if (kda_head_dim != 0) { + // for Kimi KDA layers + // Conv state for Q, K, V: 3 * (d_conv - 1) * n_head * head_dim + const uint32_t d_inner = n_head() * kda_head_dim; // 32 * 128 = 4096 + return 3 * (kda_d_conv > 0 ? kda_d_conv - 1 : 3) * d_inner; + } + // TODO: maybe support other convolution strides than 1 // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed // Corresponds to Mamba's conv_states size @@ -145,6 +152,13 @@ uint32_t llama_hparams::n_embd_s() const { return n_embd * wkv_head_size; } + if (kda_head_dim != 0) { + // for Kimi KDA layers + // Full recurrent state: head_dim * head_dim * n_head + // h tensor shape for delta attention: [head_dim, head_dim, n_head] + return kda_head_dim * kda_head_dim * n_head(); // 128 * 128 * 32 = 524288 + } + // corresponds to Mamba's ssm_states size return ssm_d_state * ssm_d_inner; } diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 6eff334a5f..80170650eb 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -133,6 +133,10 @@ struct llama_hparams { uint32_t ssm_dt_rank = 0; uint32_t ssm_n_group = 0; + // for Kimi Delta Attention (KDA) + uint32_t kda_head_dim = 0; // head_dim for KDA layers (128 for Kimi) + uint32_t kda_d_conv = 0; // conv kernel size for KDA (4 for Kimi) + // for hybrid state space models std::array recurrent_layer_arr; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 584efbf3c8..763f0dfecb 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2283,6 +2283,54 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; + case LLM_ARCH_KIMI_LINEAR: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla, false); + ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false); + ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv, false); + ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false); + + // KDA (Delta Attention) parameters + hparams.kda_head_dim = 128; // linear_attn_config.head_dim + hparams.kda_d_conv = 4; // linear_attn_config.short_conv_kernel_size + + // MLA qk_rope_head_dim (for reference) + // qk_rope_head_dim = 64, qk_nope_head_dim = 128, qk_head_dim = 192 + + // Mark KDA layers as recurrent using n_head_kv pattern (like Jamba) + // MLA layers are at: 3, 7, 11, 15, 19, 23, 26 (7 MLA layers total) + // KDA layers are all others: 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20, 21, 22, 24, 25 (20 
KDA layers) + // Set n_head_kv = 0 for KDA layers (recurrent), n_head_kv = n_head for MLA layers (attention) + for (uint32_t i = 0; i < hparams.n_layer; ++i) { + bool is_mla = (i == 3 || i == 7 || i == 11 || i == 15 || i == 19 || i == 23 || i == 26); + hparams.n_head_kv_arr[i] = is_mla ? hparams.n_head() : 0; + hparams.recurrent_layer_arr[i] = !is_mla; // KDA layers are recurrent + } + + // MoE parameters - Kimi uses moe_intermediate_size = 1024 + ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); + ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false); + ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared, false); + ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false); + ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false); + ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false); + + // Default values if not in GGUF + if (hparams.n_ff_exp == 0) hparams.n_ff_exp = 1024; // moe_intermediate_size + if (hparams.n_ff_shexp == 0) hparams.n_ff_shexp = 9216; // shared_expert_intermediate_size = intermediate_size + if (hparams.n_expert_shared == 0) hparams.n_expert_shared = 1; // num_shared_experts + if (hparams.n_layer_dense_lead == 0) hparams.n_layer_dense_lead = 1; // first_k_dense_replace + if (hparams.expert_weights_scale == 0.0f) hparams.expert_weights_scale = 2.446f; // routed_scaling_factor + + // MoE gating function - Kimi uses sigmoid (moe_router_activation_func: sigmoid) + if (hparams.expert_gating_func == 0) hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID; + + switch (hparams.n_layer) { + case 27: type = LLM_TYPE_48B; break; // Kimi-Linear-48B-A3B + default: type = LLM_TYPE_UNKNOWN; + } + } break; default: throw std::runtime_error("unsupported model architecture"); } @@ -6395,6 +6443,148 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0); } } break; + case LLM_ARCH_KIMI_LINEAR: + { + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + + // output + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0); + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + + // Check for KDA specific tensors to determine layer type or if it's a mixed model + // Assuming KDA layer if KDA tensors are present + + // KDA uses head_dim = 128 (from linear_attn_config.head_dim) + const int64_t n_embd_head_k_kda = 128; + const int64_t n_embd_head_v_kda = 128; + const int64_t ssm_d_conv = hparams.ssm_d_conv > 0 ? 
hparams.ssm_d_conv : 4; + + // Try loading KDA specific tensors (using SSM_ prefix) + // Conv1d weights: try 4D first, then 3D (quantization may remove trailing 1) + // 4D: [d_conv, 1, d_inner, 1], 3D: [d_conv, 1, d_inner] + layer.ssm_q_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_Q, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head, 1}, TENSOR_NOT_REQUIRED); + if (!layer.ssm_q_conv) { + layer.ssm_q_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_Q, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head}, TENSOR_NOT_REQUIRED); + } + + if (layer.ssm_q_conv) { + // KDA Layer - Conv1d weights may be 3D or 4D + layer.ssm_k_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_K, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head, 1}, TENSOR_NOT_REQUIRED); + if (!layer.ssm_k_conv) { + layer.ssm_k_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_K, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head}, 0); + } + layer.ssm_v_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_V, "weight", i), {ssm_d_conv, 1, n_embd_head_v_kda * n_head, 1}, TENSOR_NOT_REQUIRED); + if (!layer.ssm_v_conv) { + layer.ssm_v_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_V, "weight", i), {ssm_d_conv, 1, n_embd_head_v_kda * n_head}, 0); + } + + // Conv bias may not exist in all models - make optional + layer.ssm_q_conv_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_Q, "bias", i), {n_embd_head_k_kda * n_head}, TENSOR_NOT_REQUIRED); + layer.ssm_k_conv_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_K, "bias", i), {n_embd_head_k_kda * n_head}, TENSOR_NOT_REQUIRED); + layer.ssm_v_conv_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_V, "bias", i), {n_embd_head_v_kda * n_head}, TENSOR_NOT_REQUIRED); + + // q, k, v projections + // Python: q_proj, k_proj, v_proj + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k_kda * n_head}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_head_k_kda * n_head}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_head_v_kda * n_head}, 0); + + // KDA specific projections + // f_a_proj, f_b_proj + layer.ssm_f_a = create_tensor(tn(LLM_TENSOR_SSM_F_A, "weight", i), {n_embd, n_embd_head_k_kda}, 0); // head_dim + layer.ssm_f_b = create_tensor(tn(LLM_TENSOR_SSM_F_B, "weight", i), {n_embd_head_k_kda, n_embd_head_k_kda * n_head}, 0); // projection_size + + // b_proj (beta mixing coefficient) + layer.ssm_beta = create_tensor(tn(LLM_TENSOR_SSM_BETA, "weight", i), {n_embd, n_head}, 0); + + // A_log - Shape in GGUF: [1, num_heads, 1, 1] (4D) or [1, num_heads] (2D after quantization) + layer.ssm_a_log = create_tensor(tn(LLM_TENSOR_SSM_A_LOG, i), {1, n_head, 1, 1}, TENSOR_NOT_REQUIRED); + if (!layer.ssm_a_log) { + layer.ssm_a_log = create_tensor(tn(LLM_TENSOR_SSM_A_LOG, i), {1, n_head}, 0); + } + + // dt_bias - shape [n_embd_head_k_kda * n_head] = [4096] + layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT_B, i), {n_embd_head_k_kda * n_head}, 0); + + // g_a_proj, g_b_proj (output gate) + layer.ssm_g_a = create_tensor(tn(LLM_TENSOR_SSM_G_A, "weight", i), {n_embd, n_embd_head_k_kda}, 0); + layer.ssm_g_b = create_tensor(tn(LLM_TENSOR_SSM_G_B, "weight", i), {n_embd_head_k_kda, n_embd_head_k_kda * n_head}, 0); + + // o_norm (reusing SSM_NORM) + layer.ssm_o_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {n_embd_head_k_kda}, 0); // FusedRMSNormGated + layer.ssm_o_norm_b = create_tensor(tn(LLM_TENSOR_SSM_NORM, "bias", i), {n_embd_head_k_kda}, TENSOR_NOT_REQUIRED); + + // o_proj + layer.wo = 
create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_v_kda * n_head, n_embd}, 0); + + } else { + // MLA Layer - use MLA-specific head dimensions + const int64_t q_lora_rank = hparams.n_lora_q; + const int64_t kv_lora_rank = hparams.n_lora_kv; + const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla > 0 ? hparams.n_embd_head_k_mla : 192; + const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla > 0 ? hparams.n_embd_head_v_mla : 128; + + layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, TENSOR_NOT_REQUIRED); + layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0); + + if (layer.attn_q_a_norm) { + layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0); + layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, 0); + } else { + // Kimi MLA without Q compression: wq = [n_embd, n_head * n_embd_head_k_mla] + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_embd_head_k_mla}, 0); + } + + // Kimi: qk_rope_head_dim = 64 (actual RoPE dimension for MLA) + // Note: hparams.n_rot may be 72 (from conversion) but actual is 64 + const int64_t qk_rope_head_dim = 64; // From config: qk_rope_head_dim + layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + qk_rope_head_dim}, 0); + layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_k_mla - qk_rope_head_dim + n_embd_head_v_mla)}, 0); + + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, 0); + } + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + + // MoE intermediate size (different from dense FFN) + const int64_t n_ff_exp = hparams.n_ff_exp > 0 ? hparams.n_ff_exp : 1024; + + // Kimi uses n_layer_dense_lead to determine which layers use dense FFN vs MoE + // first_k_dense_replace = 1 means layer 0 uses dense FFN, layers 1+ use MoE + if (i < (int) hparams.n_layer_dense_lead) { + // Dense FFN layer - use normal n_ff + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + } else { + // MoE layer - use n_ff_exp (1024) instead of n_ff (9216) + layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0); + layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0); + layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0); + layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0); + + // Shared experts use moe_intermediate_size * num_shared_experts + // Kimi: shared_expert_intermediate_size = 1024 * 1 = 1024 + // Tensors are 2D: [n_embd, n_ff_shexp] or [n_ff_shexp, n_embd] + const int64_t n_ff_shexp_actual = n_ff_exp * (hparams.n_expert_shared > 0 ? 
hparams.n_expert_shared : 1); + layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp_actual}, TENSOR_NOT_REQUIRED); + layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp_actual, n_embd}, TENSOR_NOT_REQUIRED); + layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp_actual}, TENSOR_NOT_REQUIRED); + + // exp_probs_b (e_score_correction_bias in vLLM) + // Try "bias" first (standard), then "weight" (for compatibility) + layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED); + if (!layer.ffn_exp_probs_b) { + layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "weight", i), {n_expert}, TENSOR_NOT_REQUIRED); + } + } + } + } break; case LLM_ARCH_COGVLM: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -7563,6 +7753,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { { llm = std::make_unique(*this, params); } break; + case LLM_ARCH_KIMI_LINEAR: + { + llm = std::make_unique(*this, params); + } break; default: GGML_ABORT("fatal error"); } @@ -7718,6 +7912,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_ARCTIC: case LLM_ARCH_DEEPSEEK: case LLM_ARCH_DEEPSEEK2: + case LLM_ARCH_KIMI_LINEAR: case LLM_ARCH_PLM: case LLM_ARCH_CHATGLM: case LLM_ARCH_GLM4: diff --git a/src/llama-model.h b/src/llama-model.h index f8342cf2cb..b067b686d2 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -84,6 +84,7 @@ enum llm_type { LLM_TYPE_35B, LLM_TYPE_36B, LLM_TYPE_40B, + LLM_TYPE_48B, LLM_TYPE_65B, LLM_TYPE_70B, LLM_TYPE_120B, @@ -404,6 +405,23 @@ struct llama_layer { struct ggml_tensor * ffn_act_beta = nullptr; struct ggml_tensor * ffn_act_eps = nullptr; + // Kimi Linear KDA (using ssm_ prefix for consistency) + // Note: ssm_dt_b already exists above (mamba bias), reused for Kimi dt_bias + struct ggml_tensor * ssm_q_conv = nullptr; + struct ggml_tensor * ssm_q_conv_b = nullptr; + struct ggml_tensor * ssm_k_conv = nullptr; + struct ggml_tensor * ssm_k_conv_b = nullptr; + struct ggml_tensor * ssm_v_conv = nullptr; + struct ggml_tensor * ssm_v_conv_b = nullptr; + struct ggml_tensor * ssm_f_a = nullptr; + struct ggml_tensor * ssm_f_b = nullptr; + struct ggml_tensor * ssm_beta = nullptr; + struct ggml_tensor * ssm_a_log = nullptr; + struct ggml_tensor * ssm_g_a = nullptr; + struct ggml_tensor * ssm_g_b = nullptr; + struct ggml_tensor * ssm_o_norm = nullptr; + struct ggml_tensor * ssm_o_norm_b = nullptr; + struct llama_layer_posnet posnet; struct llama_layer_convnext convnext; diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 0b23eaef3a..7b8bf6e524 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -724,7 +724,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer; // sanity checks for models that have attention layers - if (qs.n_attention_wv != 0 && !is_clip_model) + // Skip this check for Kimi models which have hybrid KDA+MLA architecture + // (only MLA layers have attn_kv_b weights, KDA layers don't) + if (qs.n_attention_wv != 0 && !is_clip_model && model.arch != LLM_ARCH_KIMI_LINEAR) { const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin(); // attention layers have a non-zero number of kv heads diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 
a73c4c448b..7af74b0218 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -1738,26 +1738,33 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { // read bpe merges and populate bpe ranks const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str()); + // Kimi-K2 uses custom tokenization without traditional BPE merges + const bool is_kimi_k2 = (tokenizer_pre == "kimi-k2"); + if (merges_keyidx == -1) { - throw std::runtime_error("cannot find tokenizer merges in model file\n"); - } - - const int n_merges = gguf_get_arr_n(ctx, merges_keyidx); - for (int i = 0; i < n_merges; i++) { - const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i); - //GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0); - - std::string first; - std::string second; - - const size_t pos = word.find(' ', 1); - - if (pos != std::string::npos) { - first = word.substr(0, pos); - second = word.substr(pos + 1); + if (!is_kimi_k2) { + throw std::runtime_error("cannot find tokenizer merges in model file\n"); } + // Kimi-K2 doesn't need merges, skip + LLAMA_LOG_INFO("%s: Kimi-K2 tokenizer detected, skipping BPE merges\n", __func__); + } else { + const int n_merges = gguf_get_arr_n(ctx, merges_keyidx); + for (int i = 0; i < n_merges; i++) { + const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i); + //GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0); - bpe_ranks.emplace(std::make_pair(first, second), i); + std::string first; + std::string second; + + const size_t pos = word.find(' ', 1); + + if (pos != std::string::npos) { + first = word.substr(0, pos); + second = word.substr(pos + 1); + } + + bpe_ranks.emplace(std::make_pair(first, second), i); + } } // default special tokens From 139548d07011c27a719c0ab24f79073fe1cd0e1f Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Tue, 2 Dec 2025 12:11:15 +0800 Subject: [PATCH 09/58] remove "const int64_t n_seq_tokens = q->ne[2];" to get rid of unused variable warning --- ggml/src/ggml.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 8bf562e8b1..1703cad5ab 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1134,7 +1134,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "glu(x)", }; -static_assert(GGML_OP_COUNT == 95, "GGML_OP_COUNT != 95"); +static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -5482,7 +5482,6 @@ struct ggml_tensor * ggml_kda_scan( // h_new: {head_dim, head_dim, n_head, n_seqs} const int64_t head_dim = h->ne[0]; const int64_t n_head = q->ne[1]; - const int64_t n_seq_tokens = q->ne[2]; const int64_t n_seqs = q->ne[3]; struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ggml_nelements(q) + head_dim * head_dim * n_head * n_seqs); From 83d328d0d38ce81abb92c77f89b8e1567833ad1f Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Tue, 2 Dec 2025 14:09:02 +0800 Subject: [PATCH 10/58] remove type mismatch warning --- src/llama-graph.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index e41d65398f..f877267289 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1817,7 +1817,7 @@ ggml_tensor * llm_graph_context::build_rs( // copy extra states which won't be changed further (between n_seqs and n_rs) // Skip if there are no extra states to copy (n_rs == n_seqs) - if (arch != LLM_ARCH_KIMI_LINEAR || n_rs > n_seqs) { // arch check for backward compat + if (arch != 
LLM_ARCH_KIMI_LINEAR || n_rs > (u_int32_t) n_seqs) { // arch check for backward compat ggml_tensor * states_extra = ggml_get_rows(ctx0, states, state_copy_extra); ggml_build_forward_expand(gf, ggml_cpy(ctx0, From 772ca88070eb6ae079641adf6e3e8eb289952104 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Tue, 2 Dec 2025 20:16:24 +0800 Subject: [PATCH 11/58] read MoE params --- convert_hf_to_gguf.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index ba21124d6f..2808b72d76 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5046,6 +5046,17 @@ class KimiLinearModel(TextModel): head_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(head_dim) + self.gguf_writer.add_rope_freq_base(self.hparams.get("rope_theta", 10000.0)) + + # MoE params + n_experts = self.hparams.get("num_local_experts", self.hparams.get("num_experts")) + if n_experts is not None: + self.gguf_writer.add_expert_count(n_experts) + # Support both num_experts_per_tok and num_experts_per_token + n_experts_used = self.hparams.get("num_experts_per_tok", self.hparams.get("num_experts_per_token")) + if n_experts_used is not None: + self.gguf_writer.add_expert_used_count(n_experts_used) + # moe_intermediate_size (1024 for Kimi) moe_intermediate_size = self.hparams.get("moe_intermediate_size") if moe_intermediate_size is not None: From 9f1265fec16598cc9c24ae31ae38c3ae7aaa3bde Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Fri, 5 Dec 2025 19:51:02 +0800 Subject: [PATCH 12/58] removed some hard coded code --- convert_hf_to_gguf.py | 25 +++++++++++++++-- src/llama-model.cpp | 21 +++----------- src/models/kimi-linear.cpp | 57 +++++++++++++++++++++++--------------- 3 files changed, 61 insertions(+), 42 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 2808b72d76..9c36c84189 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4990,7 +4990,9 @@ class KimiLinearModel(TextModel): def set_gguf_parameters(self): super().set_gguf_parameters() self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) - + self.gguf_writer.add_leading_dense_block_count(self.hparams["first_k_dense_replace"]) + self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) + # Use find_hparam for context length # Kimi uses model_max_length n_ctx = self.find_hparam(["max_position_embeddings", "model_max_length", "n_ctx", "n_positions"], optional=True) @@ -5004,6 +5006,18 @@ class KimiLinearModel(TextModel): # KDA & MLA params # Get ssm_d_conv from linear_attn_config.short_conv_kernel_size or ssm_d_conv linear_attn_config = self.hparams.get("linear_attn_config", {}) + # n_head == 0 for KDA layers, n_head > 0 for MLA layers + # full_attention_layers list will be used to distingush layer type + _num_kv_heads = list() + _full_attn_layers = linear_attn_config["full_attn_layers"] + for il in range(self.hparams["num_hidden_layers"]): + if il+1 in _full_attn_layers: + _num_kv_heads.append(linear_attn_config["num_heads"]) + else: + _num_kv_heads.append(0) + assert(len(_num_kv_heads) == self.hparams["num_hidden_layers"]) + self.gguf_writer.add_head_count_kv(_num_kv_heads) + ssm_d_conv = self.hparams.get("ssm_d_conv") or linear_attn_config.get("short_conv_kernel_size") if ssm_d_conv is not None: self.gguf_writer.add_ssm_conv_kernel(ssm_d_conv) @@ -5046,7 +5060,14 @@ class KimiLinearModel(TextModel): head_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] 
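+        # NOTE: this is hidden_size // num_attention_heads (2304 // 32 = 72 for Kimi-Linear-48B),
+        # i.e. the full attention head size, not the MLA qk_rope_head_dim (64)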
self.gguf_writer.add_rope_dimension_count(head_dim) - self.gguf_writer.add_rope_freq_base(self.hparams.get("rope_theta", 10000.0)) + # Copied from Qwen2Moe as this model inherits parts of it + # YaRN is not enabled by default + # To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts + rope_scaling = self.hparams.get("rope_scaling") or {} + if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) + self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) + self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) # MoE params n_experts = self.hparams.get("num_local_experts", self.hparams.get("num_experts")) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 763f0dfecb..0f162cdd7a 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -120,6 +120,7 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_16B_A1B: return "16B.A1B"; case LLM_TYPE_21B_A3B: return "21B.A3B"; case LLM_TYPE_30B_A3B: return "30B.A3B"; + case LLM_TYPE_48B_A3B: return "48B.A3B"; case LLM_TYPE_100B_A6B: return "100B.A6B"; case LLM_TYPE_106B_A12B: return "106B.A12B"; case LLM_TYPE_230B_A10B: return "230B.A10B"; @@ -2299,13 +2300,9 @@ void llama_model::load_hparams(llama_model_loader & ml) { // qk_rope_head_dim = 64, qk_nope_head_dim = 128, qk_head_dim = 192 // Mark KDA layers as recurrent using n_head_kv pattern (like Jamba) - // MLA layers are at: 3, 7, 11, 15, 19, 23, 26 (7 MLA layers total) - // KDA layers are all others: 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20, 21, 22, 24, 25 (20 KDA layers) // Set n_head_kv = 0 for KDA layers (recurrent), n_head_kv = n_head for MLA layers (attention) for (uint32_t i = 0; i < hparams.n_layer; ++i) { - bool is_mla = (i == 3 || i == 7 || i == 11 || i == 15 || i == 19 || i == 23 || i == 26); - hparams.n_head_kv_arr[i] = is_mla ? 
hparams.n_head() : 0; - hparams.recurrent_layer_arr[i] = !is_mla; // KDA layers are recurrent + hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0; // KDA layers are recurrent } // MoE parameters - Kimi uses moe_intermediate_size = 1024 @@ -2316,18 +2313,8 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false); ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false); - // Default values if not in GGUF - if (hparams.n_ff_exp == 0) hparams.n_ff_exp = 1024; // moe_intermediate_size - if (hparams.n_ff_shexp == 0) hparams.n_ff_shexp = 9216; // shared_expert_intermediate_size = intermediate_size - if (hparams.n_expert_shared == 0) hparams.n_expert_shared = 1; // num_shared_experts - if (hparams.n_layer_dense_lead == 0) hparams.n_layer_dense_lead = 1; // first_k_dense_replace - if (hparams.expert_weights_scale == 0.0f) hparams.expert_weights_scale = 2.446f; // routed_scaling_factor - - // MoE gating function - Kimi uses sigmoid (moe_router_activation_func: sigmoid) - if (hparams.expert_gating_func == 0) hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID; - switch (hparams.n_layer) { - case 27: type = LLM_TYPE_48B; break; // Kimi-Linear-48B-A3B + case 27: type = LLM_TYPE_48B_A3B; break; // Kimi-Linear-48B-A3B default: type = LLM_TYPE_UNKNOWN; } } break; @@ -7894,6 +7881,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_ARWKV7: case LLM_ARCH_WAVTOKENIZER_DEC: case LLM_ARCH_NEMOTRON_H: + case LLM_ARCH_KIMI_LINEAR: return LLAMA_ROPE_TYPE_NONE; // use what we call a normal RoPE, operating on pairs of consecutive head values @@ -7912,7 +7900,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_ARCTIC: case LLM_ARCH_DEEPSEEK: case LLM_ARCH_DEEPSEEK2: - case LLM_ARCH_KIMI_LINEAR: case LLM_ARCH_PLM: case LLM_ARCH_CHATGLM: case LLM_ARCH_GLM4: diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index 660cd06f0e..40fbe469b3 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -339,6 +339,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll ggml_row_size(kv->type, n_embd_head_qk_nope)); k_nope = ggml_cont(ctx0, k_nope); Vcur = ggml_cont(ctx0, Vcur); + cb(Vcur, "mla_V", il); // Concatenate k_nope + k_pe (broadcast k_pe to all heads) // K = [k_nope, k_pe] where k_nope is [qk_nope_head_dim, n_head, n_tokens] @@ -349,12 +350,11 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll ggml_tensor * k_pe_repeated = ggml_repeat(ctx0, k_pe, k_pe_target); ggml_tensor * Kcur = ggml_concat(ctx0, k_nope, k_pe_repeated, 0); cb(Kcur, "mla_K", il); - cb(Vcur, "mla_V", il); // Direct softmax attention (without KV cache) // Use build_attn with inp_no_cache for proper mask handling - cur = build_attn(inp_no_cache, layer.wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale_mla, il); - cb(cur, "mla_out", il); + cur = build_attn(inp_no_cache, layer.wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale_mla, il); +// cb(cur, "mla_out", il); } else { // Unknown layer type - this should not happen @@ -375,18 +375,33 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll cur = build_norm(ffn_inp, layer.ffn_norm, NULL, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - // FFN / MoE - if (layer.ffn_gate_inp) { + if ((uint32_t) il < hparams.n_layer_dense_lead) { + // Dense FFN layer + cur = 
build_ffn(cur, + layer.ffn_up, NULL, NULL, + layer.ffn_gate, NULL, NULL, + layer.ffn_down, NULL, NULL, + NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } else { // MoE layer // Kimi uses moe_renormalize=True and routed_scaling_factor (stored as expert_weights_scale) = 2.446 - ggml_tensor * moe_out = build_moe_ffn(cur, layer.ffn_gate_inp, layer.ffn_up_exps, layer.ffn_gate_exps, layer.ffn_down_exps, - layer.ffn_exp_probs_b, hparams.n_expert, hparams.n_expert_used, - LLM_FFN_SILU, true, true, hparams.expert_weights_scale, - (llama_expert_gating_func_type) hparams.expert_gating_func, il); + ggml_tensor * moe_out = build_moe_ffn(cur, + layer.ffn_gate_inp, + layer.ffn_up_exps, + layer.ffn_gate_exps, + layer.ffn_down_exps, + layer.ffn_exp_probs_b, + hparams.n_expert, + hparams.n_expert_used, + LLM_FFN_SILU, true, + true, hparams.expert_weights_scale, + (llama_expert_gating_func_type) hparams.expert_gating_func, + il); cb(moe_out, "ffn_moe_out", il); - // Shared expert (if present) - if (layer.ffn_gate_shexp) { + // Shared expert + { ggml_tensor * ffn_shexp = build_ffn(cur, layer.ffn_up_shexp, NULL, NULL, layer.ffn_gate_shexp, NULL, NULL, @@ -396,27 +411,23 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll cur = ggml_add(ctx0, moe_out, ffn_shexp); cb(cur, "ffn_out", il); - } else { - cur = moe_out; } - } else if (layer.ffn_gate) { - // Dense FFN layer - cur = build_ffn(cur, layer.ffn_up, NULL, NULL, layer.ffn_gate, NULL, NULL, - layer.ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); - cb(cur, "ffn_out", il); - } else { - // No FFN - this should not happen in Kimi - GGML_ABORT("Kimi layer missing FFN tensors"); } - // Residual cur = ggml_add(ctx0, cur, ffn_inp); + + cur = build_cvec(cur, il); + cb(cur, "l_out", il); + inpL = cur; } + cur = inpL; // Final Norm - cur = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1); + cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res->t_embd = cur; // Output cur = ggml_mul_mat(ctx0, model.output, cur); From a0269af2928ff17468f54ee837693d20f3d0647d Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Sat, 6 Dec 2025 11:51:16 +0800 Subject: [PATCH 13/58] removed all hard code --- convert_hf_to_gguf.py | 10 +++++++++- gguf-py/gguf/constants.py | 6 ++++++ gguf-py/gguf/gguf_writer.py | 3 +++ src/llama-arch.cpp | 2 ++ src/llama-arch.h | 2 ++ src/llama-hparams.cpp | 2 +- src/llama-hparams.h | 5 ++--- src/llama-model.cpp | 20 +++++++++----------- src/llama-model.h | 2 +- src/models/kimi-linear.cpp | 14 +++++++------- 10 files changed, 42 insertions(+), 24 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 9c36c84189..45538fcabb 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5021,6 +5021,13 @@ class KimiLinearModel(TextModel): ssm_d_conv = self.hparams.get("ssm_d_conv") or linear_attn_config.get("short_conv_kernel_size") if ssm_d_conv is not None: self.gguf_writer.add_ssm_conv_kernel(ssm_d_conv) + + kda_head_dim = self.hparams.get("kda_head_dim") or linear_attn_config.get("head_dim") + + if kda_head_dim is not None: + self.gguf_writer.add_kda_head_dim(kda_head_dim) + + # MLA params - use add_* methods that handle arch substitution # MLA params - use add_* methods that handle arch substitution # Support both HuggingFace naming (q_lora_rank, kv_lora_rank) and internal naming (n_lora_q, n_lora_kv) @@ -5035,8 +5042,9 @@ class KimiLinearModel(TextModel): # MLA head dimensions # Support HuggingFace naming: 
qk_nope_head_dim, qk_rope_head_dim, v_head_dim qk_nope_head_dim = self.hparams.get("qk_nope_head_dim") - qk_rope_head_dim = self.hparams.get("qk_rope_head_dim", self.hparams.get("n_rot")) + qk_rope_head_dim = self.hparams.get("qk_rope_head_dim") v_head_dim = self.hparams.get("v_head_dim") + self.gguf_writer.add_rope_dimension_count(self.hparams["qk_rope_head_dim"]) # Calculate n_embd_head_k_mla = qk_nope_head_dim + qk_rope_head_dim if "n_embd_head_k_mla" in self.hparams: diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 485c41abfb..fe9785918b 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -205,6 +205,9 @@ class Keys: GROUP_COUNT = "{arch}.ssm.group_count" DT_B_C_RMS = "{arch}.ssm.dt_b_c_rms" + class KDA: + HEAD_DIM = "{arch}.kda.head_dim" + class WKV: HEAD_SIZE = "{arch}.wkv.head_size" @@ -3475,6 +3478,9 @@ KEY_SSM_TIME_STEP_RANK = Keys.SSM.TIME_STEP_RANK KEY_SSM_GROUP_COUNT = Keys.SSM.GROUP_COUNT KEY_SSM_DT_B_C_RMS = Keys.SSM.DT_B_C_RMS +# KDA +KEY_KDA_HEAD_DIM = Keys.KDA.HEAD_DIM + # tokenization KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL KEY_TOKENIZER_PRE = Keys.Tokenizer.PRE diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 9e6ff3ac77..3b2dfef479 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -970,6 +970,9 @@ class GGUFWriter: def add_ssm_dt_b_c_rms(self, value: bool) -> None: self.add_bool(Keys.SSM.DT_B_C_RMS.format(arch=self.arch), value) + def add_kda_head_dim(self, value: int) -> None: + self.add_uint32(Keys.KDA.HEAD_DIM.format(arch=self.arch), value) + def add_tokenizer_model(self, model: str) -> None: self.add_string(Keys.Tokenizer.MODEL, model) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index ab09bb7eb7..6aabdb7f7d 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -236,6 +236,8 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_SSM_GROUP_COUNT, "%s.ssm.group_count" }, { LLM_KV_SSM_DT_B_C_RMS, "%s.ssm.dt_b_c_rms" }, + { LLM_KV_KDA_HEAD_DIM, "%s.kda.head_dim" }, + { LLM_KV_WKV_HEAD_SIZE, "%s.wkv.head_size" }, { LLM_KV_POSNET_EMBEDDING_LENGTH, "%s.posnet.embedding_length" }, diff --git a/src/llama-arch.h b/src/llama-arch.h index 2b965850c5..d68af214a7 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -240,6 +240,8 @@ enum llm_kv { LLM_KV_SSM_GROUP_COUNT, LLM_KV_SSM_DT_B_C_RMS, + LLM_KV_KDA_HEAD_DIM, + LLM_KV_WKV_HEAD_SIZE, LLM_KV_TOKENIZER_MODEL, diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index 88d266b8da..75ddeeba09 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -137,7 +137,7 @@ uint32_t llama_hparams::n_embd_r() const { // for Kimi KDA layers // Conv state for Q, K, V: 3 * (d_conv - 1) * n_head * head_dim const uint32_t d_inner = n_head() * kda_head_dim; // 32 * 128 = 4096 - return 3 * (kda_d_conv > 0 ? kda_d_conv - 1 : 3) * d_inner; + return 3 * (ssm_d_conv > 0 ? 
ssm_d_conv - 1 : 3) * d_inner; } // TODO: maybe support other convolution strides than 1 diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 80170650eb..c90ed12b90 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -133,9 +133,8 @@ struct llama_hparams { uint32_t ssm_dt_rank = 0; uint32_t ssm_n_group = 0; - // for Kimi Delta Attention (KDA) - uint32_t kda_head_dim = 0; // head_dim for KDA layers (128 for Kimi) - uint32_t kda_d_conv = 0; // conv kernel size for KDA (4 for Kimi) + // for Kimi Linear KDA + uint32_t kda_head_dim = 0; // for hybrid state space models std::array recurrent_layer_arr; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 0f162cdd7a..2e3cb9d78c 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2291,10 +2291,8 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false); ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv, false); ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false); - - // KDA (Delta Attention) parameters - hparams.kda_head_dim = 128; // linear_attn_config.head_dim - hparams.kda_d_conv = 4; // linear_attn_config.short_conv_kernel_size + ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv, false); + ml.get_key(LLM_KV_KDA_HEAD_DIM, hparams.kda_head_dim, false); // MLA qk_rope_head_dim (for reference) // qk_rope_head_dim = 64, qk_nope_head_dim = 128, qk_head_dim = 192 @@ -6447,9 +6445,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // Assuming KDA layer if KDA tensors are present // KDA uses head_dim = 128 (from linear_attn_config.head_dim) - const int64_t n_embd_head_k_kda = 128; - const int64_t n_embd_head_v_kda = 128; - const int64_t ssm_d_conv = hparams.ssm_d_conv > 0 ? hparams.ssm_d_conv : 4; + const int64_t n_embd_head_k_kda = hparams.kda_head_dim; + const int64_t n_embd_head_v_kda = hparams.kda_head_dim; + const int64_t ssm_d_conv = hparams.ssm_d_conv; // Try loading KDA specific tensors (using SSM_ prefix) // Conv1d weights: try 4D first, then 3D (quantization may remove trailing 1) @@ -6513,8 +6511,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // MLA Layer - use MLA-specific head dimensions const int64_t q_lora_rank = hparams.n_lora_q; const int64_t kv_lora_rank = hparams.n_lora_kv; - const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla > 0 ? hparams.n_embd_head_k_mla : 192; - const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla > 0 ? 
hparams.n_embd_head_v_mla : 128; + const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla; + const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla; layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, TENSOR_NOT_REQUIRED); layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0); @@ -6529,7 +6527,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // Kimi: qk_rope_head_dim = 64 (actual RoPE dimension for MLA) // Note: hparams.n_rot may be 72 (from conversion) but actual is 64 - const int64_t qk_rope_head_dim = 64; // From config: qk_rope_head_dim + const int64_t qk_rope_head_dim = hparams.n_rot; // From config: qk_rope_head_dim layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + qk_rope_head_dim}, 0); layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_k_mla - qk_rope_head_dim + n_embd_head_v_mla)}, 0); @@ -6539,7 +6537,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); // MoE intermediate size (different from dense FFN) - const int64_t n_ff_exp = hparams.n_ff_exp > 0 ? hparams.n_ff_exp : 1024; + const int64_t n_ff_exp = hparams.n_ff_exp; // Kimi uses n_layer_dense_lead to determine which layers use dense FFN vs MoE // first_k_dense_replace = 1 means layer 0 uses dense FFN, layers 1+ use MoE diff --git a/src/llama-model.h b/src/llama-model.h index b067b686d2..7081423588 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -84,7 +84,6 @@ enum llm_type { LLM_TYPE_35B, LLM_TYPE_36B, LLM_TYPE_40B, - LLM_TYPE_48B, LLM_TYPE_65B, LLM_TYPE_70B, LLM_TYPE_120B, @@ -114,6 +113,7 @@ enum llm_type { LLM_TYPE_16B_A1B, LLM_TYPE_21B_A3B, // Ernie MoE small LLM_TYPE_30B_A3B, + LLM_TYPE_48B_A3B, // Kimi Linear LLM_TYPE_80B_A3B, // Qwen3 Next LLM_TYPE_100B_A6B, LLM_TYPE_106B_A12B, // GLM-4.5-Air diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index 40fbe469b3..d025eab5f3 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -21,8 +21,8 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // Kimi dimension constants const int64_t n_head = hparams.n_head(); - const int64_t head_dim = hparams.kda_head_dim > 0 ? hparams.kda_head_dim : 128; - const int64_t d_conv = hparams.kda_d_conv > 0 ? hparams.kda_d_conv : 4; + const int64_t head_dim = hparams.kda_head_dim; + const int64_t d_conv = hparams.ssm_d_conv; const int64_t d_inner = n_head * head_dim; // 32 * 128 = 4096 const int64_t n_seqs = ubatch.n_seqs; const int64_t n_seq_tokens = ubatch.n_seq_tokens; @@ -33,12 +33,12 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); // MLA params - const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla > 0 ? hparams.n_embd_head_k_mla : 192; - const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla > 0 ? hparams.n_embd_head_v_mla : 128; - const int64_t kv_lora_rank = hparams.n_lora_kv > 0 ? 
hparams.n_lora_kv : 512; - // qk_rope_head_dim = 64 (from Kimi config), NOT hparams.n_rot (which is 72) + const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla; + const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla; + const int64_t kv_lora_rank = hparams.n_lora_kv; + // qk_rope_head_dim = 64 (from Kimi config) which is hparams.n_rot // Confirmed from tensor shape: wkv_a_mqa [2304, 576] = [n_embd, kv_lora_rank + qk_rope_head_dim] - const int64_t n_embd_head_qk_rope = 64; // config.qk_rope_head_dim + const int64_t n_embd_head_qk_rope = hparams.n_rot; // config.qk_rope_head_dim const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope; // 192 - 64 = 128 // Attention scale for KDA (1/sqrt(head_dim)) From ef5bc3054484c4696ead7289395e617893086050 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Sun, 14 Dec 2025 17:43:30 +0800 Subject: [PATCH 14/58] use DeepseekV2 tokenizer --- convert_hf_to_gguf.py | 104 +++++++++++++++++++----------------------- 1 file changed, 46 insertions(+), 58 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 45538fcabb..ac353c7dda 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5107,71 +5107,59 @@ class KimiLinearModel(TextModel): self.gguf_writer.add_expert_weights_scale(routed_scaling_factor) def set_vocab(self): - # Kimi uses TikToken tokenizer - load via transformers + try: + self._set_vocab_gpt2() + return + except Exception: + pass + from transformers import AutoTokenizer - - dir_model = self.dir_model - vocab_size = self.hparams["vocab_size"] - - logger.info(f"Loading TikToken tokenizer from {dir_model}") - tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) - - tokens: list[str] = [] - toktypes: list[int] = [] - - # Get tokenizer pre string + tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) tokpre = self.get_vocab_base_pre(tokenizer) - - # Build vocab from tokenizer - merges = [] - vocab = {} - - # TikToken stores vocab in mergeable_ranks - if hasattr(tokenizer, 'mergeable_ranks'): - mergeable_ranks = tokenizer.mergeable_ranks + + if tokpre == "kimi-k2": + # Build merges list using the approach similar to HunYuanMoE + merges = [] + vocab = {} + mergeable_ranks = tokenizer.model._mergeable_ranks for token, rank in mergeable_ranks.items(): - vocab[self._token_bytes_to_string(token)] = rank + vocab[QwenModel.token_bytes_to_string(token)] = rank if len(token) == 1: continue - # Build merges - merged = self._bpe(mergeable_ranks, token, max_rank=rank) + merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) if len(merged) == 2: - merges.append(' '.join(map(self._token_bytes_to_string, merged))) + merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) + + # Build token list + vocab_size = self.hparams["vocab_size"] + special_tokens = tokenizer.special_tokens + reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()} + tokens: list[str] = [] + toktypes: list[int] = [] + + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.UNUSED) + else: + token = reverse_vocab[i] + tokens.append(token) + if i in special_tokens.values(): + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.NORMAL) + + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + 
self.gguf_writer.add_token_merges(merges) + + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) + special_vocab.add_to_gguf(self.gguf_writer) else: - # Fallback: get vocab directly - vocab = {tok: idx for tok, idx in tokenizer.get_vocab().items()} - - # Get special tokens - added_vocab = {} - if hasattr(tokenizer, 'special_tokens'): - added_vocab = tokenizer.special_tokens - elif hasattr(tokenizer, 'added_tokens_encoder'): - added_vocab = tokenizer.added_tokens_encoder - - # Combine vocab - reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()} - - for i in range(vocab_size): - if i not in reverse_vocab: - tokens.append(f"[PAD{i}]") - toktypes.append(gguf.TokenType.UNUSED) - elif i in added_vocab.values() if added_vocab else False: - tokens.append(reverse_vocab[i]) - toktypes.append(gguf.TokenType.CONTROL) - else: - tokens.append(reverse_vocab[i]) - toktypes.append(gguf.TokenType.NORMAL) - - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(dir_model, load_merges=False) - special_vocab.merges = merges - special_vocab.add_to_gguf(self.gguf_writer) - logger.info(f"Loaded {len(tokens)} tokens, {len(merges)} merges") - + raise NotImplementedError(f"Deepseek pre-tokenizer {tokpre!r} is not supported yet!") + @staticmethod def _token_bytes_to_string(b: bytes) -> str: """Convert bytes to string representation for tokenizer""" From ae9771d1dca927ab4b6e37510d4dd36cd40fe77a Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Thu, 18 Dec 2025 08:14:15 +0800 Subject: [PATCH 15/58] removed unnecessary internal methods called by the old set_vocab of KimiLinear --- convert_hf_to_gguf.py | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index ac353c7dda..c6724f2ed5 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5160,28 +5160,6 @@ class KimiLinearModel(TextModel): else: raise NotImplementedError(f"Deepseek pre-tokenizer {tokpre!r} is not supported yet!") - @staticmethod - def _token_bytes_to_string(b: bytes) -> str: - """Convert bytes to string representation for tokenizer""" - return ''.join([chr(byte) if byte < 128 else f'<0x{byte:02X}>' for byte in b]) - - @staticmethod - def _bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]: - """BPE tokenization for merges extraction""" - parts = [bytes([b]) for b in token] - while True: - min_idx = None - min_rank = None - for i, pair in enumerate(zip(parts[:-1], parts[1:])): - rank = mergeable_ranks.get(pair[0] + pair[1]) - if rank is not None and (min_rank is None or rank < min_rank): - min_idx = i - min_rank = rank - if min_rank is None or (max_rank is not None and min_rank >= max_rank): - break - parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:] - return parts - def prepare_tensors(self): super().prepare_tensors() if self._experts is not None: From f9a11d7758924a3ab2bf59eab97f4379ef7086d7 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Thu, 18 Dec 2025 20:46:10 +0800 Subject: [PATCH 16/58] rewrite get_vocab for KimiLinear. 
Removed all kda_scan code --- ggml/src/ggml-cpu/ggml-cpu.c | 4 - ggml/src/ggml-cpu/ops.cpp | 187 ---------------------------- ggml/src/ggml-cpu/ops.h | 1 - ggml/src/ggml-cuda/ggml-cuda.cu | 4 - ggml/src/ggml-cuda/kda-scan.cu | 209 -------------------------------- ggml/src/ggml-cuda/kda-scan.cuh | 3 - ggml/src/ggml.c | 63 ---------- 7 files changed, 471 deletions(-) delete mode 100644 ggml/src/ggml-cuda/kda-scan.cu delete mode 100644 ggml/src/ggml-cuda/kda-scan.cuh diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 7b40f1e8c2..4cc15b0981 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -1962,10 +1962,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_ssm_scan(params, tensor); } break; - case GGML_OP_KDA_SCAN: - { - ggml_compute_forward_kda_scan(params, tensor); - } break; case GGML_OP_WIN_PART: { ggml_compute_forward_win_part(params, tensor); diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 9c93e0c101..a5ecd35e34 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -8686,7 +8686,6 @@ static void ggml_compute_forward_ssm_conv_f32( const int ir1 = MIN(ir0 + dr, nr); const int ir = ir1 - ir0; - static int conv_debug_count = 0; bool do_conv_debug = false; // (ith == 0 && conv_debug_count++ < 3); for (int i3 = 0; i3 < n_s; ++i3) { @@ -8966,192 +8965,6 @@ void ggml_compute_forward_ssm_scan( } } -// ggml_compute_forward_kda_scan -// KDA (Kimi Delta Attention) recurrence: -// h[t] = exp(g[t]) * h[t-1] + k[t]^T * (beta[t] * (v[t] - h[t-1] @ k[t])) -// o[t] = q[t]^T @ h[t] - -static void ggml_compute_forward_kda_scan_f32( - const ggml_compute_params * params, - ggml_tensor * dst) { - const ggml_tensor * src0 = dst->src[0]; // h {head_dim, head_dim, n_head, n_seqs+} - const ggml_tensor * src1 = dst->src[1]; // q {head_dim, n_head, n_seq_tokens, n_seqs} - const ggml_tensor * src2 = dst->src[2]; // k {head_dim, n_head, n_seq_tokens, n_seqs} - const ggml_tensor * src3 = dst->src[3]; // v {head_dim, n_head, n_seq_tokens, n_seqs} - const ggml_tensor * src4 = dst->src[4]; // g {head_dim, n_head, n_seq_tokens, n_seqs} - const ggml_tensor * src5 = dst->src[5]; // beta {n_head, n_seq_tokens, n_seqs} - const ggml_tensor * src6 = dst->src[6]; // ids {n_seqs} - - const int ith = params->ith; - const int nth = params->nth; - - const int64_t head_dim = src0->ne[0]; - const int64_t n_head = src1->ne[1]; - const int64_t n_seq_tokens = src1->ne[2]; - const int64_t n_seqs = src1->ne[3]; - - // Output offset for hidden state - const int64_t y_off = ggml_nelements(src1) * sizeof(float); - - GGML_ASSERT(src0->nb[0] == sizeof(float)); - GGML_ASSERT(src1->nb[0] == sizeof(float)); - GGML_ASSERT(src2->nb[0] == sizeof(float)); - GGML_ASSERT(src3->nb[0] == sizeof(float)); - GGML_ASSERT(src4->nb[0] == sizeof(float)); - GGML_ASSERT(src5->nb[0] == sizeof(float)); - GGML_ASSERT(src6->nb[0] == sizeof(int32_t)); - - // Parallelize over heads - const int dh = (n_head + nth - 1) / nth; - const int ih0 = dh * ith; - const int ih1 = MIN(ih0 + dh, (int)n_head); - - const int32_t * ids = (const int32_t *) src6->data; - - // Temporary buffer for h @ k computation - float * hk_buf = (float *) malloc(head_dim * sizeof(float)); - - static int debug_count = 0; - bool do_debug = false; // (ith == 0 && debug_count++ < 20); - - for (int i3 = 0; i3 < n_seqs; ++i3) { - // Get initial hidden state for this sequence - const float * h0 = (const float *) ((const char *) src0->data + ids[i3] * 
src0->nb[3]); - // Output hidden state location - float * h_out = (float *) ((char *) dst->data + i3 * src0->nb[3] + y_off); - - for (int ih = ih0; ih < ih1; ++ih) { - // Per-head hidden state: [head_dim, head_dim] - // Copy initial state to output (will be updated in place) - const float * h_in = h0 + ih * head_dim * head_dim; - float * h = h_out + ih * head_dim * head_dim; - - // Copy initial state, but check for invalid values and clear if needed - bool need_clear = false; - for (int i = 0; i < head_dim * head_dim && !need_clear; ++i) { - if (!isfinite(h_in[i]) || fabsf(h_in[i]) > 1e6f) { - need_clear = true; - } - } - for (int i = 0; i < head_dim * head_dim; ++i) { - h[i] = need_clear ? 0.0f : h_in[i]; - } - - for (int it = 0; it < n_seq_tokens; ++it) { - const float * q_raw = (const float *) ((const char *) src1->data + - it * src1->nb[2] + i3 * src1->nb[3]) + ih * head_dim; - const float * k_raw = (const float *) ((const char *) src2->data + - it * src2->nb[2] + i3 * src2->nb[3]) + ih * head_dim; - const float * v = (const float *) ((const char *) src3->data + - it * src3->nb[2] + i3 * src3->nb[3]) + ih * head_dim; - const float * g = (const float *) ((const char *) src4->data + - it * src4->nb[2] + i3 * src4->nb[3]) + ih * head_dim; - const float beta = ((const float *) ((const char *) src5->data + - it * src5->nb[1] + i3 * src5->nb[2]))[ih]; - - float * y = (float *) dst->data + - it * n_head * head_dim + i3 * n_seq_tokens * n_head * head_dim + ih * head_dim; - - // L2 normalize q and k (critical for KDA stability) - float q_norm = 0.0f, k_norm = 0.0f; - for (int i = 0; i < head_dim; ++i) { - q_norm += q_raw[i] * q_raw[i]; - k_norm += k_raw[i] * k_raw[i]; - } - q_norm = sqrtf(q_norm + 1e-6f); - k_norm = sqrtf(k_norm + 1e-6f); - - // Debug output - if (do_debug && ih == 0 && it == 0 && i3 == 0) { - fprintf(stderr, "DEBUG KDA: q_raw[0]=%f, k_raw[0]=%f, v[0]=%f, g[0]=%f, beta=%f\n", - q_raw[0], k_raw[0], v[0], g[0], beta); - fprintf(stderr, "DEBUG KDA: q_norm=%f, k_norm=%f, exp(g[0])=%f, scale=%f\n", - q_norm, k_norm, expf(g[0]), 1.0f / sqrtf((float)head_dim)); - } - - // Normalized q and k with scale = 1/sqrt(head_dim) - // Note: scale is applied only to q after L2 normalization - const float scale = 1.0f / sqrtf((float)head_dim); - float q[128], k[128]; // assume head_dim <= 128 - for (int i = 0; i < head_dim; ++i) { - // L2 normalize then scale q - q[i] = (q_raw[i] / q_norm) * scale; - // L2 normalize k (no scale) - k[i] = k_raw[i] / k_norm; - } - - // KDA recurrence: h[t] = exp(g[t]) * h[t-1] + k[t]^T * (beta[t] * (v[t] - h[t-1] @ k[t])) - // Note: Apply decay first, then compute retrieval and update - - // Step 1: Apply decay to h first: h = h * exp(g) - for (int i = 0; i < head_dim; ++i) { - const float exp_gi = expf(g[i]); - for (int j = 0; j < head_dim; ++j) { - h[i * head_dim + j] *= exp_gi; - } - } - - // Step 2: Compute h^T @ k -> hk_buf [head_dim] - // hk_buf[j] = sum_i (h[i,j] * k[i]) which is column j of h dotted with k - for (int j = 0; j < head_dim; ++j) { - float sum = 0.0f; - for (int i = 0; i < head_dim; ++i) { - sum += h[i * head_dim + j] * k[i]; - } - hk_buf[j] = sum; - } - - // Step 3: Compute delta = beta * (v - hk) and update h - // h = h + outer(k, delta) where outer(k,delta)[i,j] = k[i] * delta[j] - for (int i = 0; i < head_dim; ++i) { - for (int j = 0; j < head_dim; ++j) { - const float delta_j = beta * (v[j] - hk_buf[j]); - h[i * head_dim + j] += k[i] * delta_j; - } - } - - // Step 4: Compute output y = h^T @ q -> [head_dim] - // vLLM: b_o = tl.sum(b_h * 
b_q[:, None], 0) means o[j] = sum_i(h[i,j] * q[i]) - for (int j = 0; j < head_dim; ++j) { - float sum = 0.0f; - for (int i = 0; i < head_dim; ++i) { - sum += h[i * head_dim + j] * q[i]; - } - y[j] = sum; - } - - // Debug output - if (do_debug && ih == 0 && it == 0 && i3 == 0) { - // Find max abs value in h for stability check - float h_max = 0.0f; - for (int i = 0; i < head_dim * head_dim; i++) { - if (fabsf(h[i]) > h_max) h_max = fabsf(h[i]); - } - fprintf(stderr, "DEBUG KDA: y[0]=%.6f, h_max=%.6f, exp(g[0])=%.6f\n", - y[0], h_max, expf(g[0])); - } - } - } - } - - free(hk_buf); -} - -void ggml_compute_forward_kda_scan( - const ggml_compute_params * params, - ggml_tensor * dst) { - switch (dst->src[0]->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_kda_scan_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - // ggml_compute_forward_win_part static void ggml_compute_forward_win_part_f32( diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h index 080cf6e090..0fdfee7976 100644 --- a/ggml/src/ggml-cpu/ops.h +++ b/ggml/src/ggml-cpu/ops.h @@ -92,7 +92,6 @@ void ggml_compute_forward_flash_attn_back( struct ggml_tensor * dst); void ggml_compute_forward_ssm_conv(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_ssm_scan(const struct ggml_compute_params * params, struct ggml_tensor * dst); -void ggml_compute_forward_kda_scan(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_win_part(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_win_unpart(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_unary(const struct ggml_compute_params * params, struct ggml_tensor * dst); diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 8e0d7d916e..bbd7810dd2 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -41,7 +41,6 @@ #include "ggml-cuda/softmax.cuh" #include "ggml-cuda/ssm-conv.cuh" #include "ggml-cuda/ssm-scan.cuh" -#include "ggml-cuda/kda-scan.cuh" #include "ggml-cuda/sum.cuh" #include "ggml-cuda/sumrows.cuh" #include "ggml-cuda/mean.cuh" @@ -2693,9 +2692,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_SSM_SCAN: ggml_cuda_op_ssm_scan(ctx, dst); break; - case GGML_OP_KDA_SCAN: - ggml_cuda_op_kda_scan(ctx, dst); - break; case GGML_OP_ARGSORT: ggml_cuda_op_argsort(ctx, dst); break; diff --git a/ggml/src/ggml-cuda/kda-scan.cu b/ggml/src/ggml-cuda/kda-scan.cu deleted file mode 100644 index 5763f1cc90..0000000000 --- a/ggml/src/ggml-cuda/kda-scan.cu +++ /dev/null @@ -1,209 +0,0 @@ -#include "kda-scan.cuh" - -// KDA (Kimi Delta Attention) scan CUDA kernel -// Recurrence: -// h[t] = exp(g[t]) * h[t-1] + k[t]^T * (beta[t] * (v[t] - h[t-1] @ k[t])) -// o[t] = q[t]^T @ h[t] -// -// This kernel uses global memory for the hidden state to avoid shared memory limits. -// Each block processes one head for one sequence. 
- -__global__ void kda_scan_f32_kernel( - const float * __restrict__ src0, // h: [head_dim, head_dim, n_head, n_seqs+] - const float * __restrict__ src1, // q: [head_dim, n_head, n_seq_tokens, n_seqs] - const float * __restrict__ src2, // k: [head_dim, n_head, n_seq_tokens, n_seqs] - const float * __restrict__ src3, // v: [head_dim, n_head, n_seq_tokens, n_seqs] - const float * __restrict__ src4, // g: [head_dim, n_head, n_seq_tokens, n_seqs] - const float * __restrict__ src5, // beta: [n_head, n_seq_tokens, n_seqs] - const int32_t * __restrict__ src6, // ids: [n_seqs] - float * __restrict__ dst, - const int64_t head_dim, - const int64_t n_head, - const int64_t n_seq_tokens, - const int64_t n_seqs, - const int64_t y_off) // offset to state output in dst (in floats) -{ - // Each block handles one head for one sequence - const int seq_idx = blockIdx.x / n_head; - const int head_idx = blockIdx.x % n_head; - const int tid = threadIdx.x; - const int n_threads = blockDim.x; - - if (seq_idx >= n_seqs || head_idx >= n_head) return; - - // Get sequence ID for initial state - const int src_seq = src6[seq_idx]; - - // Shared memory for temporary buffers - extern __shared__ float smem[]; - float * hk_buf = smem; // [head_dim] - h @ k buffer - float * q_norm = smem + head_dim; // [head_dim] - normalized q - float * k_norm = q_norm + head_dim; // [head_dim] - normalized k - float * warp_sums = k_norm + head_dim; // [64] - for reductions - - // Pointers to input/output data for this head - const int64_t h_stride_head = head_dim * head_dim; - const int64_t h_stride_seq = h_stride_head * n_head; - const int64_t qkv_stride_head = head_dim; - const int64_t qkv_stride_token = head_dim * n_head; - const int64_t qkv_stride_seq = qkv_stride_token * n_seq_tokens; - const int64_t beta_stride_token = n_head; - const int64_t beta_stride_seq = beta_stride_token * n_seq_tokens; - - const float * h_in = src0 + src_seq * h_stride_seq + head_idx * h_stride_head; - float * h_out = dst + y_off + seq_idx * h_stride_seq + head_idx * h_stride_head; - float * y_out = dst + seq_idx * qkv_stride_seq + head_idx * qkv_stride_head; - - // Copy initial state to output (we'll update in place) - for (int i = tid; i < head_dim * head_dim; i += n_threads) { - float val = h_in[i]; - if (!isfinite(val) || fabsf(val) > 1e6f) { - val = 0.0f; - } - h_out[i] = val; - } - __syncthreads(); - - const float scale = 1.0f / sqrtf((float)head_dim); - - // Process each token sequentially - for (int t = 0; t < n_seq_tokens; ++t) { - const float * q_raw = src1 + t * qkv_stride_token + seq_idx * qkv_stride_seq + head_idx * qkv_stride_head; - const float * k_raw = src2 + t * qkv_stride_token + seq_idx * qkv_stride_seq + head_idx * qkv_stride_head; - const float * v = src3 + t * qkv_stride_token + seq_idx * qkv_stride_seq + head_idx * qkv_stride_head; - const float * g = src4 + t * qkv_stride_token + seq_idx * qkv_stride_seq + head_idx * qkv_stride_head; - const float beta = src5[t * beta_stride_token + seq_idx * beta_stride_seq + head_idx]; - float * y = y_out + t * qkv_stride_token; - - // Step 1: L2 normalize q and k - float q_sq_sum = 0.0f, k_sq_sum = 0.0f; - for (int i = tid; i < head_dim; i += n_threads) { - q_sq_sum += q_raw[i] * q_raw[i]; - k_sq_sum += k_raw[i] * k_raw[i]; - } - - // Warp reduction - for (int offset = warpSize/2; offset > 0; offset /= 2) { - q_sq_sum += __shfl_down_sync(0xffffffff, q_sq_sum, offset); - k_sq_sum += __shfl_down_sync(0xffffffff, k_sq_sum, offset); - } - - // Cross-warp reduction - int warp_id = tid / warpSize; - int 
lane_id = tid % warpSize; - if (lane_id == 0 && warp_id < 32) { - warp_sums[warp_id] = q_sq_sum; - warp_sums[32 + warp_id] = k_sq_sum; - } - __syncthreads(); - - if (tid == 0) { - float total_q = 0.0f, total_k = 0.0f; - for (int i = 0; i < (n_threads + warpSize - 1) / warpSize; ++i) { - total_q += warp_sums[i]; - total_k += warp_sums[32 + i]; - } - warp_sums[0] = rsqrtf(total_q + 1e-6f) * scale; - warp_sums[1] = rsqrtf(total_k + 1e-6f); - } - __syncthreads(); - - float q_norm_factor = warp_sums[0]; - float k_norm_factor = warp_sums[1]; - - // Store normalized q and k - for (int i = tid; i < head_dim; i += n_threads) { - q_norm[i] = q_raw[i] * q_norm_factor; - k_norm[i] = k_raw[i] * k_norm_factor; - } - __syncthreads(); - - // KDA recurrence: h[t] = exp(g[t]) * h[t-1] + k[t]^T * (beta[t] * (v[t] - h[t-1] @ k[t])) - // Apply decay first, then compute retrieval and update - - // Step 2: Apply decay to h: h = h * exp(g) - for (int idx = tid; idx < head_dim * head_dim; idx += n_threads) { - int i = idx / head_dim; - float exp_gi = expf(g[i]); - h_out[idx] *= exp_gi; - } - __syncthreads(); - - // Step 3: Compute h^T @ k -> hk_buf - for (int j = tid; j < head_dim; j += n_threads) { - float sum = 0.0f; - for (int i = 0; i < head_dim; ++i) { - sum += h_out[i * head_dim + j] * k_norm[i]; - } - hk_buf[j] = sum; - } - __syncthreads(); - - // Step 4: Update h: h = h + outer(k, beta * (v - hk)) - for (int idx = tid; idx < head_dim * head_dim; idx += n_threads) { - int i = idx / head_dim; - int j = idx % head_dim; - float delta_j = beta * (v[j] - hk_buf[j]); - h_out[idx] += k_norm[i] * delta_j; - } - __syncthreads(); - - // Step 5: Compute output y = h^T @ q - for (int j = tid; j < head_dim; j += n_threads) { - float sum = 0.0f; - for (int i = 0; i < head_dim; ++i) { - sum += h_out[i * head_dim + j] * q_norm[i]; - } - y[j] = sum; - } - __syncthreads(); - } -} - -void ggml_cuda_op_kda_scan(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - const ggml_tensor * src0 = dst->src[0]; // h - const ggml_tensor * src1 = dst->src[1]; // q - const ggml_tensor * src2 = dst->src[2]; // k - const ggml_tensor * src3 = dst->src[3]; // v - const ggml_tensor * src4 = dst->src[4]; // g - const ggml_tensor * src5 = dst->src[5]; // beta - const ggml_tensor * src6 = dst->src[6]; // ids - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT(src2->type == GGML_TYPE_F32); - GGML_ASSERT(src3->type == GGML_TYPE_F32); - GGML_ASSERT(src4->type == GGML_TYPE_F32); - GGML_ASSERT(src5->type == GGML_TYPE_F32); - GGML_ASSERT(src6->type == GGML_TYPE_I32); - - const int64_t head_dim = src0->ne[0]; - const int64_t n_head = src1->ne[1]; - const int64_t n_seq_tokens = src1->ne[2]; - const int64_t n_seqs = src1->ne[3]; - - // Output offset for hidden state (after attention output) - in floats - const int64_t y_off = ggml_nelements(src1); - - const float * h_d = (const float *)src0->data; - const float * q_d = (const float *)src1->data; - const float * k_d = (const float *)src2->data; - const float * v_d = (const float *)src3->data; - const float * g_d = (const float *)src4->data; - const float * beta_d = (const float *)src5->data; - const int32_t * ids_d = (const int32_t *)src6->data; - float * dst_d = (float *)dst->data; - - cudaStream_t stream = ctx.stream(); - - // Launch kernel: one block per (sequence, head) pair - const int n_blocks = n_seqs * n_head; - const int n_threads = 128; - - // Shared memory: hk_buf[head_dim] + q_norm[head_dim] + k_norm[head_dim] + warp_sums[64] - size_t 
smem_size = (3 * head_dim + 64) * sizeof(float); - - kda_scan_f32_kernel<<>>( - h_d, q_d, k_d, v_d, g_d, beta_d, ids_d, dst_d, - head_dim, n_head, n_seq_tokens, n_seqs, y_off); -} diff --git a/ggml/src/ggml-cuda/kda-scan.cuh b/ggml/src/ggml-cuda/kda-scan.cuh deleted file mode 100644 index 55783fb82b..0000000000 --- a/ggml/src/ggml-cuda/kda-scan.cuh +++ /dev/null @@ -1,3 +0,0 @@ -#include "common.cuh" - -void ggml_cuda_op_kda_scan(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 1703cad5ab..a167d6a574 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -5435,69 +5435,6 @@ struct ggml_tensor * ggml_ssm_scan( return result; } -// ggml_kda_scan - -struct ggml_tensor * ggml_kda_scan( - struct ggml_context * ctx, - struct ggml_tensor * h, - struct ggml_tensor * q, - struct ggml_tensor * k, - struct ggml_tensor * v, - struct ggml_tensor * g, - struct ggml_tensor * beta, - struct ggml_tensor * ids) { - GGML_ASSERT(ggml_is_contiguous(h)); - GGML_ASSERT(ggml_is_contiguous(q)); - GGML_ASSERT(ggml_is_contiguous(k)); - GGML_ASSERT(ggml_is_contiguous(v)); - GGML_ASSERT(ggml_is_contiguous(g)); - GGML_ASSERT(ggml_is_contiguous(beta)); - GGML_ASSERT(ids->type == GGML_TYPE_I32); - - { - const int64_t head_dim = h->ne[0]; - const int64_t n_head = q->ne[1]; - const int64_t n_seq_tokens = q->ne[2]; - const int64_t n_seqs = q->ne[3]; - - GGML_ASSERT(h->ne[0] == head_dim); - GGML_ASSERT(h->ne[1] == head_dim); - GGML_ASSERT(h->ne[2] == n_head); - GGML_ASSERT(q->ne[0] == head_dim); - GGML_ASSERT(k->ne[0] == head_dim); - GGML_ASSERT(v->ne[0] == head_dim); - GGML_ASSERT(g->ne[0] == head_dim); - GGML_ASSERT(ggml_are_same_shape(q, k)); - GGML_ASSERT(ggml_are_same_shape(q, v)); - GGML_ASSERT(ggml_are_same_shape(q, g)); - GGML_ASSERT(beta->ne[0] == n_head); - GGML_ASSERT(beta->ne[1] == n_seq_tokens); - GGML_ASSERT(beta->ne[2] == n_seqs); - GGML_ASSERT(ids->ne[0] == n_seqs); - GGML_ASSERT(ggml_is_vector(ids)); - } - - // Output: y (attention output) + updated hidden states - // y: {head_dim, n_head, n_seq_tokens, n_seqs} - // h_new: {head_dim, head_dim, n_head, n_seqs} - const int64_t head_dim = h->ne[0]; - const int64_t n_head = q->ne[1]; - const int64_t n_seqs = q->ne[3]; - struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, - ggml_nelements(q) + head_dim * head_dim * n_head * n_seqs); - - result->op = GGML_OP_KDA_SCAN; - result->src[0] = h; - result->src[1] = q; - result->src[2] = k; - result->src[3] = v; - result->src[4] = g; - result->src[5] = beta; - result->src[6] = ids; - - return result; -} - // ggml_win_part struct ggml_tensor * ggml_win_part( From 776294c04e506ce7d29ff1e0481ea3d371838668 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Fri, 19 Dec 2025 07:36:06 +0800 Subject: [PATCH 17/58] removed all traces of kda_scan --- ggml/include/ggml.h | 23 ----------------------- ggml/src/ggml-cpu/ggml-cpu.c | 1 - ggml/src/ggml-cuda/ggml-cuda.cu | 5 ----- ggml/src/ggml.c | 1 - 4 files changed, 30 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 888f00c2e8..48da68fe7e 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -539,7 +539,6 @@ extern "C" { GGML_OP_FLASH_ATTN_BACK, GGML_OP_SSM_CONV, GGML_OP_SSM_SCAN, - GGML_OP_KDA_SCAN, GGML_OP_WIN_PART, GGML_OP_WIN_UNPART, GGML_OP_GET_REL_POS, @@ -2338,28 +2337,6 @@ extern "C" { struct ggml_tensor * C, struct ggml_tensor * ids); - // KDA (Kimi Delta Attention) scan - // Delta attention recurrence: - // h[t] = exp(g[t]) * h[t-1] + k[t]^T * (beta[t] * (v[t] - h[t-1] @ 
k[t])) - // o[t] = q[t]^T @ h[t] - // Parameters: - // h: hidden state {head_dim, head_dim, n_head, n_seqs+} - // q: query {head_dim, n_head, n_seq_tokens, n_seqs} - // k: key {head_dim, n_head, n_seq_tokens, n_seqs} - // v: value {head_dim, n_head, n_seq_tokens, n_seqs} - // g: gate {head_dim, n_head, n_seq_tokens, n_seqs} - // beta: mixing {n_head, n_seq_tokens, n_seqs} - // ids: seq indices {n_seqs} - GGML_API struct ggml_tensor * ggml_kda_scan( - struct ggml_context * ctx, - struct ggml_tensor * h, - struct ggml_tensor * q, - struct ggml_tensor * k, - struct ggml_tensor * v, - struct ggml_tensor * g, - struct ggml_tensor * beta, - struct ggml_tensor * ids); - // partition into non-overlapping windows with padding if needed // example: // a: 768 64 64 1 diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 4cc15b0981..3247af8bb0 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -2320,7 +2320,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { case GGML_OP_FLASH_ATTN_BACK: case GGML_OP_SSM_CONV: case GGML_OP_SSM_SCAN: - case GGML_OP_KDA_SCAN: case GGML_OP_RWKV_WKV6: case GGML_OP_GATED_LINEAR_ATTN: case GGML_OP_RWKV_WKV7: diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index bbd7810dd2..eb2e273110 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -4503,11 +4503,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g return op->src[0]->ne[0] == 16 && op->src[0]->ne[1] == 1 && op->src[0]->ne[2] % 128 == 0 && op->src[4]->ne[1] == 1; } } - case GGML_OP_KDA_SCAN: { - // KDA scan kernel supports head_dim 64 or 128 - const int64_t head_dim = op->src[0]->ne[0]; - return head_dim == 64 || head_dim == 128; - } case GGML_OP_SSM_CONV: { // assumes d_inner % threads == 0 return op->src[0]->ne[1] % 128 == 0; diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index a167d6a574..173ec6b98f 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -999,7 +999,6 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "FLASH_ATTN_BACK", "SSM_CONV", "SSM_SCAN", - "KDA_SCAN", "WIN_PART", "WIN_UNPART", "GET_REL_POS", From f67a42d57297c93b742bc4a85886e367ccabbd09 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Fri, 19 Dec 2025 07:37:33 +0800 Subject: [PATCH 18/58] reduce OP count by 1 due to removal of kda_scan --- ggml/src/ggml.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 173ec6b98f..17cf4d84bb 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1024,7 +1024,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "GLU", }; -static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96"); +static_assert(GGML_OP_COUNT == 95, "GGML_OP_COUNT != 95"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -1133,7 +1133,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "glu(x)", }; -static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96"); +static_assert(GGML_OP_COUNT == 95, "GGML_OP_COUNT != 95"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); From f85e5c73b91370d605f4835c2f6112fb729006cd Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Fri, 2 Jan 2026 21:20:34 +0800 Subject: [PATCH 19/58] Move KIMI_LINEAR to llm_arch_is_hybrid to enable KV cache --- src/llama-arch.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 6aabdb7f7d..cf5ea1177f 100644 --- a/src/llama-arch.cpp +++ 
b/src/llama-arch.cpp @@ -2863,7 +2863,6 @@ bool llm_arch_is_recurrent(const llm_arch & arch) { case LLM_ARCH_RWKV6QWEN2: case LLM_ARCH_RWKV7: case LLM_ARCH_ARWKV7: - case LLM_ARCH_KIMI_LINEAR: // KDA layers use delta attention with recurrent state return true; default: return false; @@ -2880,9 +2879,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) { case LLM_ARCH_LFM2MOE: case LLM_ARCH_NEMOTRON_H: case LLM_ARCH_QWEN3NEXT: - // Kimi: Currently using recurrent-only mode since MLA doesn't use KV cache - // TODO: Enable hybrid when MLA KV caching is implemented - // case LLM_ARCH_KIMI_LINEAR: + case LLM_ARCH_KIMI_LINEAR: return true; default: return false; From 8bd617eb1c1afc0b267e0e8d72c7db3bc801db7d Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Sat, 3 Jan 2026 08:26:41 +0800 Subject: [PATCH 20/58] set n_embd_head_k/v to ensure kv cache works --- convert_hf_to_gguf.py | 114 +++++++++++++++++++++--------------------- 1 file changed, 58 insertions(+), 56 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index c6724f2ed5..0a59dc2c5f 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4987,10 +4987,65 @@ class KimiLinearModel(TextModel): _experts: list[dict[str, Tensor]] | None = None + def set_vocab(self): + try: + self._set_vocab_gpt2() + return + except Exception: + pass + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) + tokpre = self.get_vocab_base_pre(tokenizer) + + if tokpre == "kimi-k2": + # Build merges list using the approach similar to HunYuanMoE + merges = [] + vocab = {} + mergeable_ranks = tokenizer.model._mergeable_ranks + for token, rank in mergeable_ranks.items(): + vocab[QwenModel.token_bytes_to_string(token)] = rank + if len(token) == 1: + continue + merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) + if len(merged) == 2: + merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) + + # Build token list + vocab_size = self.hparams["vocab_size"] + special_tokens = tokenizer.special_tokens + reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()} + tokens: list[str] = [] + toktypes: list[int] = [] + + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.UNUSED) + else: + token = reverse_vocab[i] + tokens.append(token) + if i in special_tokens.values(): + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.NORMAL) + + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + self.gguf_writer.add_token_merges(merges) + + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) + special_vocab.add_to_gguf(self.gguf_writer) + # override eos id in config.json with tiktoken eos id + self.gguf_writer.add_eos_token_id(tokenizer.eos_id) + else: + raise NotImplementedError(f"Deepseek pre-tokenizer {tokpre!r} is not supported yet!") + def set_gguf_parameters(self): super().set_gguf_parameters() self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) - self.gguf_writer.add_leading_dense_block_count(self.hparams["first_k_dense_replace"]) self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) # Use find_hparam for context length @@ -5043,8 +5098,9 @@ class KimiLinearModel(TextModel): # Support HuggingFace naming: qk_nope_head_dim, qk_rope_head_dim, v_head_dim 
qk_nope_head_dim = self.hparams.get("qk_nope_head_dim") qk_rope_head_dim = self.hparams.get("qk_rope_head_dim") + self.gguf_writer.add_key_length(qk_nope_head_dim + qk_rope_head_dim) v_head_dim = self.hparams.get("v_head_dim") - self.gguf_writer.add_rope_dimension_count(self.hparams["qk_rope_head_dim"]) + self.gguf_writer.add_value_length(v_head_dim) # Calculate n_embd_head_k_mla = qk_nope_head_dim + qk_rope_head_dim if "n_embd_head_k_mla" in self.hparams: @@ -5106,60 +5162,6 @@ class KimiLinearModel(TextModel): if routed_scaling_factor is not None: self.gguf_writer.add_expert_weights_scale(routed_scaling_factor) - def set_vocab(self): - try: - self._set_vocab_gpt2() - return - except Exception: - pass - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) - tokpre = self.get_vocab_base_pre(tokenizer) - - if tokpre == "kimi-k2": - # Build merges list using the approach similar to HunYuanMoE - merges = [] - vocab = {} - mergeable_ranks = tokenizer.model._mergeable_ranks - for token, rank in mergeable_ranks.items(): - vocab[QwenModel.token_bytes_to_string(token)] = rank - if len(token) == 1: - continue - merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) - if len(merged) == 2: - merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) - - # Build token list - vocab_size = self.hparams["vocab_size"] - special_tokens = tokenizer.special_tokens - reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()} - tokens: list[str] = [] - toktypes: list[int] = [] - - for i in range(vocab_size): - if i not in reverse_vocab: - tokens.append(f"[PAD{i}]") - toktypes.append(gguf.TokenType.UNUSED) - else: - token = reverse_vocab[i] - tokens.append(token) - if i in special_tokens.values(): - toktypes.append(gguf.TokenType.CONTROL) - else: - toktypes.append(gguf.TokenType.NORMAL) - - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - self.gguf_writer.add_token_merges(merges) - - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) - special_vocab.add_to_gguf(self.gguf_writer) - else: - raise NotImplementedError(f"Deepseek pre-tokenizer {tokpre!r} is not supported yet!") - def prepare_tensors(self): super().prepare_tensors() if self._experts is not None: From a4020d867f55bf4721ddd1c5df713f4ade14ad49 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Sat, 3 Jan 2026 08:27:29 +0800 Subject: [PATCH 21/58] don't quantize conv1d of Kimi Linear --- src/llama-quant.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 7b8bf6e524..bae907f92c 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -869,9 +869,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight"); quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight"); - // do not quantize Mamba's small yet 2D weights + // do not quantize Mamba /Kimi's small conv1d weights // NOTE: can't use LLM_TN here because the layer number is not known - quantize &= name.find("ssm_conv1d.weight") == std::string::npos; + quantize &= name.find("ssm_conv1d") == std::string::npos; quantize &= name.find("shortconv.conv.weight") == std::string::npos; // do not quantize RWKV's small yet 2D weights From 
66c0c5d8d400296357b8a4df4438c0acd74c3a05 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Mon, 5 Jan 2026 16:35:19 +0800 Subject: [PATCH 22/58] Kimi Linear backend agnostic --- src/models/kimi-linear.cpp | 493 ++++++++++++++++++++++++++++++++----- src/models/models.h | 21 ++ 2 files changed, 450 insertions(+), 64 deletions(-) diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index d025eab5f3..32a723b80a 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -1,24 +1,35 @@ #include "models.h" +#include "ggml.h" +#include "llama-impl.h" + +#define CHUNK_SIZE 64 llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params), model(model) { ggml_tensor * cur; ggml_tensor * inpL; inpL = build_inp_embd(model.tok_embd); - + cb(inpL, "model.embed_tokens", -1); + // Note: Kimi MLA does NOT use RoPE (rotary_emb=None in vLLM) // So we don't need inp_pos - // Only use recurrent state input for KDA layers - // MLA layers use direct softmax attention without KV cache - auto * inp_rs = build_rs_inp(); - - // Input for MLA layers (no KV cache) - auto * inp_no_cache = build_attn_inp_no_cache(); + auto * inp = build_inp_mem_hybrid(); + auto * inp_rs = inp->get_recr(); + auto * inp_attn = inp->get_attn(); // Output ids for selecting which tokens to output ggml_tensor * inp_out_ids = build_inp_out_ids(); + ggml_tensor * causal_mask = + ggml_tri(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, ubatch.n_seq_tokens, ubatch.n_seq_tokens), 1.0f), + GGML_TRI_TYPE_LOWER); + + ggml_tensor * identity = ggml_diag(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, ubatch.n_seq_tokens), 1.0f)); + + ggml_build_forward_expand(gf, causal_mask); + ggml_build_forward_expand(gf, identity); + // Kimi dimension constants const int64_t n_head = hparams.n_head(); const int64_t head_dim = hparams.kda_head_dim; @@ -40,10 +51,6 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // Confirmed from tensor shape: wkv_a_mqa [2304, 576] = [n_embd, kv_lora_rank + qk_rope_head_dim] const int64_t n_embd_head_qk_rope = hparams.n_rot; // config.qk_rope_head_dim const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope; // 192 - 64 = 128 - - // Attention scale for KDA (1/sqrt(head_dim)) - const float kq_scale_kda = 1.0f / sqrtf((float)head_dim); - // Attention scale for MLA const float kq_scale_mla = 1.0f / sqrtf((float)n_embd_head_k_mla); @@ -51,6 +58,8 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll const auto & layer = model.layers[il]; ggml_tensor * inpSA = inpL; + if (!layer.attn_norm) + LLAMA_LOG_INFO("Empty attn_norm at layer %d\n", il); // Attention Norm cur = build_norm(inpL, layer.attn_norm, NULL, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); @@ -69,6 +78,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // Get conv states from r_l tensor (Q, K, V each have separate state) ggml_tensor * conv_states_all = mctx_cur->get_r_l(il); + cb(conv_states_all, "conv_states_all", il); const int64_t conv_state_size = (d_conv - 1) * d_inner; const int64_t n_embd_r_total = 3 * conv_state_size; // Q + K + V ggml_tensor * conv_state_all = build_rs(inp_rs, conv_states_all, hparams.n_embd_r(), n_seqs); @@ -143,12 +153,14 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // ggml_ssm_conv output: {d_inner, n_seq_tokens, n_seqs} Qcur = 
ggml_ssm_conv(ctx0, conv_q, conv_weight); + cb(Qcur, "Q conv1d", il); // Reshape to 2D for bias add: {d_inner, n_tokens} Qcur = ggml_reshape_2d(ctx0, Qcur, d_inner, n_tokens); if (layer.ssm_q_conv_b) { Qcur = ggml_add(ctx0, Qcur, layer.ssm_q_conv_b); } Qcur = ggml_silu(ctx0, Qcur); + cb(Qcur, "Q conv1d b", il); } else { GGML_ABORT("KDA layer missing Q conv weight"); } @@ -173,11 +185,13 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll } ggml_tensor * k_conv_weight = ggml_reshape_2d(ctx0, k_conv_f32, d_conv, d_inner); Kcur = ggml_ssm_conv(ctx0, conv_k, k_conv_weight); + cb(Kcur, "K conv1d", il); Kcur = ggml_reshape_2d(ctx0, Kcur, d_inner, n_tokens); if (layer.ssm_k_conv_b) { Kcur = ggml_add(ctx0, Kcur, layer.ssm_k_conv_b); } Kcur = ggml_silu(ctx0, Kcur); + cb(Kcur, "K conv1d b", il); } else { GGML_ABORT("KDA layer missing K conv weight"); } @@ -202,11 +216,13 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll } ggml_tensor * v_conv_weight = ggml_reshape_2d(ctx0, v_conv_f32, d_conv, d_inner); Vcur = ggml_ssm_conv(ctx0, conv_v, v_conv_weight); + cb(Vcur, "V conv1d", il); Vcur = ggml_reshape_2d(ctx0, Vcur, d_inner, n_tokens); if (layer.ssm_v_conv_b) { Vcur = ggml_add(ctx0, Vcur, layer.ssm_v_conv_b); } Vcur = ggml_silu(ctx0, Vcur); + cb(Vcur, "V conv1d b", il); } else { GGML_ABORT("KDA layer missing V conv weight"); } @@ -215,6 +231,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // g1 = -exp(A_log) * softplus(f_b(f_a(x)) + dt_bias) ggml_tensor * f_a = ggml_mul_mat(ctx0, layer.ssm_f_a, cur); ggml_tensor * g1 = ggml_mul_mat(ctx0, layer.ssm_f_b, f_a); + cb(g1, "g1 f_b(f_a(cur))", il); g1 = ggml_add(ctx0, g1, layer.ssm_dt_b); g1 = ggml_softplus(ctx0, g1); g1 = ggml_reshape_3d(ctx0, g1, head_dim, n_head, n_tokens); @@ -229,7 +246,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // Step 4: Compute beta (mixing coefficient) ggml_tensor * beta = ggml_mul_mat(ctx0, layer.ssm_beta, cur); - beta = ggml_sigmoid(ctx0, beta); + beta = ggml_cont_4d(ctx0, beta, n_head, 1, n_seq_tokens, n_seqs); cb(beta, "kda_beta", il); // Step 5: Reshape for KDA recurrence @@ -240,49 +257,56 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll Kcur = ggml_cont(ctx0, ggml_reshape_4d(ctx0, Kcur, head_dim, n_head, n_seq_tokens, n_seqs)); Vcur = ggml_cont(ctx0, ggml_reshape_4d(ctx0, Vcur, head_dim, n_head, n_seq_tokens, n_seqs)); g1 = ggml_cont(ctx0, ggml_reshape_4d(ctx0, g1, head_dim, n_head, n_seq_tokens, n_seqs)); - beta = ggml_cont(ctx0, ggml_reshape_3d(ctx0, beta, n_head, n_seq_tokens, n_seqs)); - cb(Qcur, "kda_Q", il); cb(Kcur, "kda_K", il); cb(Vcur, "kda_V", il); - + // Step 6: Get SSM state and compute KDA recurrence using ggml_kda_scan ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il); - - // Use build_rs with lambda pattern (like Mamba SSM scan) - auto get_kda_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) { - ggml_tensor * h_state = ggml_reshape_4d(ctx, states, head_dim, head_dim, n_head, mctx_cur->get_size()); - // Call ggml_kda_scan which implements the correct KDA recurrence - return ggml_kda_scan(ctx, h_state, Qcur, Kcur, Vcur, g1, beta, ids); - }; - - ggml_tensor * y_kda = build_rs(inp_rs, ssm_states_all, hparams.n_embd_s(), n_seqs, get_kda_rows); - cb(y_kda, "kda_scan_out", il); - - // Store updated state back - // y_kda contains: [attention_output (head_dim * n_head * n_seq_tokens * n_seqs), new_state (head_dim * 
head_dim * n_head * n_seqs)] - const int64_t attn_out_size = head_dim * n_head * n_seq_tokens * n_seqs; - const int64_t state_size = head_dim * head_dim * n_head; - ggml_build_forward_expand(gf, - ggml_cpy(ctx0, - ggml_view_1d(ctx0, y_kda, state_size * n_seqs, attn_out_size * ggml_element_size(y_kda)), - ggml_view_1d(ctx0, ssm_states_all, state_size * n_seqs, kv_head * state_size * ggml_element_size(ssm_states_all)))); - - // Extract attention output - ggml_tensor * attn_out = ggml_view_1d(ctx0, y_kda, attn_out_size, 0); - attn_out = ggml_reshape_3d(ctx0, attn_out, head_dim, n_head, n_seq_tokens * n_seqs); - cb(attn_out, "kda_attn_out", il); - + ggml_tensor * state = build_rs(inp_rs, ssm_states_all, hparams.n_embd_s(), n_seqs); + state = ggml_reshape_4d(ctx0, state, head_dim, head_dim, n_head, n_seqs); + // Choose between build_kda_chunking and build_kda_recurrent based on n_tokens + // TODO: Currently only build_kda_recurrent is implemented + ggml_tensor * attn_out = n_seq_tokens > CHUNK_SIZE ? + build_kda_recurrent(Qcur, Kcur, Vcur, g1, beta, state, causal_mask, identity, il) : + build_kda_recurrent(Qcur, Kcur, Vcur, g1, beta, state, causal_mask, identity, il); + cb(attn_out, "attn_out", il); + + // The tensors were concatenated 1d, so we need to extract them 1d as well + const int64_t output_flat_size = head_dim * n_head * n_seq_tokens * n_seqs; + ggml_tensor * attn_out_1d = ggml_view_1d(ctx0, attn_out, output_flat_size, 0); + cb(attn_out_1d, "attn_out_1d", il); + + ggml_tensor * attn_out_final = ggml_reshape_3d(ctx0, attn_out_1d, head_dim, n_head, n_seq_tokens * n_seqs); + cb(attn_out_final, "attn_out_reshaped", il); + // Extract the state part (second part of the concatenated tensor) + // State starts after n_tokens elements along dimension 1 + const int64_t state_flat_size = head_dim * head_dim * n_head * n_seqs; + + ggml_tensor * state_1d = + ggml_view_1d(ctx0, attn_out, state_flat_size, output_flat_size * ggml_element_size(attn_out)); + cb(state_1d, "state_1d", il); + + // Update the recurrent states + ggml_build_forward_expand(gf, + ggml_cpy(ctx0, state_1d, + ggml_view_1d(ctx0, ssm_states_all, hparams.n_embd_s() * n_seqs, + kv_head * hparams.n_embd_s() * ggml_element_size(ssm_states_all)))); + + GGML_ASSERT(ggml_nelements(attn_out_1d) + ggml_nelements(state_1d) == ggml_nelements(attn_out)); + // Step 7: Output gating g2 = g_b(g_a(x)) ggml_tensor * cur_2d = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs); ggml_tensor * g_a = ggml_mul_mat(ctx0, layer.ssm_g_a, cur_2d); ggml_tensor * g2 = ggml_mul_mat(ctx0, layer.ssm_g_b, g_a); + cb(g2, "g2 g_b(g_a(cur_2d))", il); g2 = ggml_reshape_3d(ctx0, g2, head_dim, n_head, n_seq_tokens * n_seqs); // Step 8: Apply o_norm with sigmoid gating // Note: Kimi model uses sigmoid gating, not SiLU (despite FusedRMSNormGated default being swish) // Formula: output = RMSNorm(x) * sigmoid(g) - ggml_tensor * normed = build_norm(attn_out, layer.ssm_o_norm, layer.ssm_o_norm_b, LLM_NORM_RMS, il); + ggml_tensor * normed = build_norm(attn_out_final, layer.ssm_o_norm, layer.ssm_o_norm_b, LLM_NORM_RMS, il); + cb(normed, "kda_normed", il); ggml_tensor * gate = ggml_sigmoid(ctx0, g2); ggml_tensor * gated = ggml_mul(ctx0, normed, gate); @@ -290,11 +314,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll gated = ggml_cont_2d(ctx0, gated, d_inner, n_tokens); cur = ggml_mul_mat(ctx0, layer.wo, gated); cb(cur, "kda_out", il); - - - GGML_UNUSED(d_conv); - GGML_UNUSED(kq_scale_kda); - + } else if (is_mla) { // === MLA Layer 
(Multi-head Latent Attention) without KV Cache === // Reference: vLLM mla.py @@ -308,25 +328,25 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll cb(Qcur, "mla_Q", il); // Step 2: KV compression - // kv_lora = kv_a_proj_with_mqa(hidden_states) -> [kv_lora_rank + qk_rope_head_dim, n_tokens] - ggml_tensor * kv_lora = ggml_mul_mat(ctx0, layer.wkv_a_mqa, cur); + // kv_cmpr_pe = kv_a_proj_with_mqa(hidden_states) -> [kv_lora_rank + qk_rope_head_dim, n_tokens] + ggml_tensor * kv_cmpr_pe = ggml_mul_mat(ctx0, layer.wkv_a_mqa, cur); - // Split: kv_c = kv_lora[:kv_lora_rank], k_pe = kv_lora[kv_lora_rank:] - ggml_tensor * kv_c = ggml_view_2d(ctx0, kv_lora, kv_lora_rank, n_tokens, - ggml_row_size(kv_lora->type, kv_lora_rank + n_embd_head_qk_rope), 0); - ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_lora, n_embd_head_qk_rope, 1, n_tokens, - ggml_row_size(kv_lora->type, kv_lora_rank + n_embd_head_qk_rope), - ggml_row_size(kv_lora->type, kv_lora_rank + n_embd_head_qk_rope), - ggml_row_size(kv_lora->type, kv_lora_rank)); + // Split: kv_cmpr = kv_lora[:kv_lora_rank], k_pe = kv_lora[kv_lora_rank:] + ggml_tensor * kv_cmpr = ggml_view_2d(ctx0, kv_cmpr_pe, kv_lora_rank, n_tokens, + ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope), 0); + ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_cmpr_pe, n_embd_head_qk_rope, 1, n_tokens, + ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope), + ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope), + ggml_row_size(kv_cmpr_pe->type, kv_lora_rank)); // Note: Kimi MLA does NOT apply RoPE (rotary_emb=None in vLLM) // k_pe is used directly without RoPE // Normalize kv_c - kv_c = build_norm(kv_c, layer.attn_kv_a_norm, nullptr, LLM_NORM_RMS, il); + kv_cmpr = build_norm(kv_cmpr, layer.attn_kv_a_norm, nullptr, LLM_NORM_RMS, il); // KV decompression: kv = kv_b_proj(kv_c_normed) - ggml_tensor * kv = ggml_mul_mat(ctx0, layer.wkv_b, kv_c); + ggml_tensor * kv = ggml_mul_mat(ctx0, layer.wkv_b, kv_cmpr); const int64_t kv_per_head = n_embd_head_qk_nope + n_embd_head_v_mla; // Split kv into k_nope and v @@ -344,17 +364,16 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // Concatenate k_nope + k_pe (broadcast k_pe to all heads) // K = [k_nope, k_pe] where k_nope is [qk_nope_head_dim, n_head, n_tokens] // and k_pe is [qk_rope_head_dim, 1, n_tokens] broadcast to all heads - k_pe = ggml_cont(ctx0, k_pe); // Need to broadcast k_pe from [qk_rope, 1, n_tokens] to [qk_rope, n_head, n_tokens] ggml_tensor * k_pe_target = ggml_new_tensor_3d(ctx0, k_pe->type, n_embd_head_qk_rope, n_head, n_tokens); ggml_tensor * k_pe_repeated = ggml_repeat(ctx0, k_pe, k_pe_target); ggml_tensor * Kcur = ggml_concat(ctx0, k_nope, k_pe_repeated, 0); cb(Kcur, "mla_K", il); - // Direct softmax attention (without KV cache) - // Use build_attn with inp_no_cache for proper mask handling - cur = build_attn(inp_no_cache, layer.wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale_mla, il); -// cb(cur, "mla_out", il); + // Direct softmax attention (with KV cache) + // Use build_attn with inp_attn for proper mask handling + cur = build_attn(inp_attn, layer.wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale_mla, il); + cb(cur, "mla_out", il); } else { // Unknown layer type - this should not happen @@ -435,6 +454,352 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll res->t_logits = cur; ggml_build_forward_expand(gf, cur); - - GGML_UNUSED(n_embd_head_qk_nope); } + +/* + 
IMPORTANT: Currently build_kda_chunking is not implemented nor called +*/ +ggml_tensor * llm_build_kimi_linear::build_kda_chunking( + ggml_tensor * q, + ggml_tensor * k, + ggml_tensor * v, + ggml_tensor * gk, + ggml_tensor * beta, + ggml_tensor * state, + ggml_tensor * causal_mask, + ggml_tensor * identity, + int il) { + GGML_ASSERT(ggml_is_contiguous(q)); + GGML_ASSERT(ggml_is_contiguous(k)); + GGML_ASSERT(ggml_is_contiguous(v)); + GGML_ASSERT(ggml_is_contiguous(gk)); + GGML_ASSERT(ggml_is_contiguous(beta)); + GGML_ASSERT(ggml_is_contiguous(state)); + + const int64_t S_k = q->ne[0]; + const int64_t H_k = q->ne[1]; + const int64_t n_tokens = q->ne[2]; + const int64_t n_seqs = q->ne[3]; + + const int64_t S_v = v->ne[0]; + const int64_t H_v = v->ne[1]; + + GGML_ASSERT(v->ne[2] == n_tokens); + GGML_ASSERT(k->ne[2] == n_tokens); + GGML_ASSERT(gk->ne[0] == S_v && gk->ne[1] == H_v && gk->ne[2] == n_tokens && gk->ne[3] == n_seqs); + GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs); + GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v * H_v && state->ne[2] == 1 && state->ne[3] == n_seqs); + + GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs); + GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs); + + GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case + + // TODO: can this ever be false? + const bool use_qk_l2norm = true; + + if (use_qk_l2norm) { + const float eps_norm = hparams.f_norm_rms_eps; + + q = ggml_l2_norm(ctx0, q, eps_norm); + k = ggml_l2_norm(ctx0, k, eps_norm); + } + + const float scale = 1.0f / sqrtf(S_v); + + q = ggml_scale(ctx0, q, scale); + + beta = ggml_sigmoid(ctx0, beta); + + cb(q, "q_in", il); + cb(k, "k_in", il); + cb(v, "v_in", il); + cb(beta, "beta_in", il); + cb(gk, "gk_in", il); + + q = ggml_cont_4d(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); + k = ggml_cont_4d(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); + v = ggml_cont_4d(ctx0, ggml_permute(ctx0, v, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); + gk = ggml_cont_4d(ctx0, ggml_permute(ctx0, gk, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); + + beta = ggml_cont(ctx0, ggml_permute(ctx0, beta, 2, 0, 1, 3)); + state = ggml_reshape_4d(ctx0, state, S_v, S_v, H_v, n_seqs); + + ggml_tensor * causal_diag_mask = ggml_add(ctx0, causal_mask, identity); + + cb(q, "q_perm", il); + cb(k, "k_perm", il); + cb(v, "v_perm", il); + cb(beta, "beta_perm", il); + cb(gk, "gk_perm", il); + cb(state, "state_in", il); + cb(causal_diag_mask, "causal_diag_mask", il); + + GGML_ASSERT(q->ne[1] == n_tokens && q->ne[0] == S_k && q->ne[2] == H_k && q->ne[3] == n_seqs); + GGML_ASSERT(k->ne[1] == n_tokens && k->ne[0] == S_k && k->ne[2] == H_k && k->ne[3] == n_seqs); + GGML_ASSERT(v->ne[1] == n_tokens && v->ne[0] == S_v && v->ne[2] == H_k && v->ne[3] == n_seqs); + GGML_ASSERT(beta->ne[1] == n_tokens && beta->ne[2] == H_k && beta->ne[0] == 1 && beta->ne[3] == n_seqs); + + ggml_tensor * v_beta = ggml_mul(ctx0, v, beta); + ggml_tensor * k_beta = ggml_mul(ctx0, k, beta); + + cb(k_beta, "k_beta", il); + cb(v_beta, "v_beta", il); + + return nullptr; +} + +ggml_tensor * llm_build_kimi_linear::build_kda_recurrent( + ggml_tensor * q, + ggml_tensor * k, + ggml_tensor * v, + ggml_tensor * gk, + ggml_tensor * beta, + ggml_tensor * state, + ggml_tensor * causal_mask, + ggml_tensor * identity, + int il) { + GGML_ASSERT(ggml_is_contiguous(q)); + GGML_ASSERT(ggml_is_contiguous(k)); + 
GGML_ASSERT(ggml_is_contiguous(v)); + GGML_ASSERT(ggml_is_contiguous(gk)); + GGML_ASSERT(ggml_is_contiguous(beta)); + GGML_ASSERT(ggml_is_contiguous(state)); + + const int64_t S_k = q->ne[0]; + const int64_t H_k = q->ne[1]; + const int64_t n_tokens = q->ne[2]; + const int64_t n_seqs = q->ne[3]; + + const int64_t S_v = v->ne[0]; + const int64_t H_v = v->ne[1]; + + GGML_ASSERT(v->ne[2] == n_tokens); + GGML_ASSERT(k->ne[2] == n_tokens); + GGML_ASSERT(gk->ne[0] == S_k && gk->ne[1] == H_v && gk->ne[2] == n_tokens && gk->ne[3] == n_seqs); + GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs); + GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v && state->ne[2] == H_v && state->ne[3] == n_seqs); + + GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs); + GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs); + + GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case + + // TODO: can this ever be false? + const bool use_qk_l2norm = true; + + if (use_qk_l2norm) { + const float eps_norm = hparams.f_norm_rms_eps; + + q = ggml_l2_norm(ctx0, q, eps_norm); + k = ggml_l2_norm(ctx0, k, eps_norm); + } + + const float scale = 1.0f / sqrtf(S_v); + + beta = ggml_sigmoid(ctx0, beta); + + ggml_tensor * causal_diag_mask = ggml_add(ctx0, causal_mask, identity); + + cb(q, "q_in", il); + cb(k, "k_in", il); + cb(v, "v_in", il); + cb(beta, "beta_in", il); + cb(gk, "gk_in", il); + + q = ggml_cont_4d(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); + k = ggml_cont_4d(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); + v = ggml_cont_4d(ctx0, ggml_permute(ctx0, v, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); + gk = ggml_cont_4d(ctx0, ggml_permute(ctx0, gk, 1, 2, 0, 3), n_tokens, S_k, H_k, n_seqs); + + beta = ggml_cont(ctx0, ggml_permute(ctx0, beta, 2, 0, 1, 3)); + state = ggml_reshape_4d(ctx0, state, S_v, S_v, H_v, n_seqs); + + cb(q, "q_perm", il); + cb(k, "k_perm", il); + cb(v, "v_perm", il); + cb(beta, "beta_perm", il); + cb(gk, "gk_perm", il); + cb(state, "state_in", il); + + GGML_ASSERT(q->ne[1] == n_tokens && q->ne[0] == S_k && q->ne[2] == H_k && q->ne[3] == n_seqs); + GGML_ASSERT(k->ne[1] == n_tokens && k->ne[0] == S_k && k->ne[2] == H_k && k->ne[3] == n_seqs); + GGML_ASSERT(v->ne[1] == n_tokens && v->ne[0] == S_v && v->ne[2] == H_k && v->ne[3] == n_seqs); + GGML_ASSERT(beta->ne[1] == n_tokens && beta->ne[2] == H_k && beta->ne[0] == 1 && beta->ne[3] == n_seqs); + + // ========================================================================= + // Compute cumulative sum of gk per key dimension + // gk_cumsum: [S_k, n_tokens, H_k, n_seqs] - cumsum along dim 1 (tokens) + // ========================================================================= + ggml_tensor * gk_cumsum = ggml_cumsum(ctx0, gk); + cb(gk_cumsum, "gk_cumsum", il); + + // Scale k and k_beta + + ggml_tensor * k_beta = ggml_mul(ctx0, k, beta); + ggml_tensor * v_beta = ggml_mul(ctx0, v, beta); + + cb(k_beta, "k_beta", il); + cb(v_beta, "v_beta", il); + + +/* + https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/kda/naive.py + + for i in range(T): + k_i = k[..., i, :] + g_i = g[..., i:i+1, :] + A[..., i] = torch.einsum('... c d, ... d -> ... 
c', k * (g - g_i).exp(), k_i) +*/ + const int64_t HB = H_k * n_seqs; + ggml_tensor * k_per = ggml_cont(ctx0, ggml_permute(ctx0, k, 1, 0, 2, 3)); + ggml_tensor * k_i = ggml_reshape_4d(ctx0, k_per, n_tokens, 1, S_k, HB); + ggml_tensor * k_i_bc = ggml_repeat_4d(ctx0, k_i, n_tokens, n_tokens, S_k, HB); + ggml_tensor * g_i = ggml_reshape_4d(ctx0, gk_cumsum, n_tokens, 1, S_k, HB); + ggml_tensor * g_i_bc = ggml_repeat_4d(ctx0, g_i, n_tokens, n_tokens, S_k, HB); // [S_k, chunk_size, 1, HB] -> [S_k, chunk_size, chunk_size, HB] + + ggml_tensor * k_j = ggml_reshape_4d(ctx0, k_per, 1, n_tokens, S_k, HB); + ggml_tensor * k_j_bc = ggml_repeat_4d(ctx0, k_j, n_tokens, n_tokens, S_k, HB); + + ggml_tensor * g_j = ggml_reshape_4d(ctx0, gk_cumsum, 1, n_tokens, S_k, HB); + ggml_tensor * g_j_bc = ggml_repeat_4d(ctx0, g_j, n_tokens, n_tokens, S_k, HB); // [S_k, 1, chunk_size, HB] -> [S_k, chunk_size, chunk_size, HB] + + ggml_tensor * decay_mask = ggml_sub(ctx0, g_j_bc, g_i_bc); + cb(decay_mask, "decay_mask", il); + decay_mask = ggml_mul(ctx0, decay_mask, causal_diag_mask); + decay_mask = ggml_exp(ctx0, decay_mask); + decay_mask = ggml_mul(ctx0, decay_mask, causal_diag_mask); + cb(decay_mask, "decay_mask_exp", il); + + ggml_tensor * Akk = ggml_mul(ctx0, decay_mask, k_j_bc); + Akk = ggml_mul(ctx0, Akk, k_i_bc); + + Akk = ggml_cont(ctx0, ggml_permute(ctx0, Akk, 1, 2, 0, 3)); + Akk = ggml_sum_rows(ctx0, Akk); + + Akk = ggml_reshape_4d(ctx0, Akk, n_tokens, n_tokens, H_k, n_seqs); + + Akk = ggml_mul(ctx0, Akk, beta); + Akk = ggml_neg(ctx0, ggml_mul(ctx0, Akk, causal_mask)); + + cb(Akk, "attn_pre_rec", il); + + // for i in range(1, chunk_size): + // row = attn[..., i, :i].clone() + // sub = attn[..., :i, :i].clone() + // attn[..., i, :i] = row + (row.unsqueeze(-1) * sub).sum(-2) + // attn = attn + torch.eye(chunk_size, dtype=attn.dtype, device=attn.device) + // + // We reduce this to a linear triangular solve: AX = B, where B = attn, A = I - tril(A) + ggml_tensor * attn_lower = ggml_mul(ctx0, Akk, causal_mask); + ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, identity, attn_lower), attn_lower); + + ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, Akk, true, true, false); + Akk = ggml_mul(ctx0, lin_solve, causal_mask); + Akk = ggml_add(ctx0, Akk, identity); + + gk_cumsum = ggml_cont(ctx0, ggml_permute(ctx0, gk_cumsum, 1, 0, 2, 3)); // back to [S_k, n_tokens, H_k, n_seqs] + + // u = (A*beta[..., None, :]) @ v aka U_[t] + ggml_tensor * vb = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_beta)), Akk); + cb(vb, "value_beta", il); + + // k_cumdecay = attn @ (k_beta * g.exp().unsqueeze(-1)) or W_[t] + ggml_tensor * gkexp = ggml_exp(ctx0, gk_cumsum); // [S,T,H,B] + + ggml_tensor * kbeta_gkexp = ggml_mul(ctx0, k_beta, gkexp); + cb(kbeta_gkexp, "kbeta_gkexp", il); + + ggml_tensor * k_cumdecay = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, kbeta_gkexp)), Akk); + cb(k_cumdecay, "k_cumdecay", il); + +/* + https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/kda/naive.py + + for j in range(BT): + k_j = k[:, :, i, j] + g_j = g[:, :, i, j:j+1, :] + A[..., j] = torch.einsum('... c d, ... d -> ... 
c', q_i * (g_i - g_j).exp(), k_j) +*/ + ggml_tensor * q_per = ggml_cont(ctx0, ggml_permute(ctx0, q, 1, 0, 2, 3)); + ggml_tensor * q_j = ggml_reshape_4d(ctx0, q_per, 1, n_tokens, S_k, HB); + ggml_tensor * q_j_bc = ggml_repeat_4d(ctx0, q_j, n_tokens, n_tokens, S_k, HB); + ggml_tensor * kq = ggml_mul(ctx0, decay_mask, q_j_bc); + kq = ggml_mul(ctx0, kq, k_i_bc); + kq = ggml_cont(ctx0, ggml_permute(ctx0, kq, 1, 2, 0, 3)); + + ggml_tensor * Aqk = ggml_sum_rows(ctx0, kq); + Aqk = ggml_cont(ctx0, ggml_reshape_4d(ctx0, Aqk, n_tokens, n_tokens, H_k, n_seqs)); + Aqk = ggml_mul(ctx0, Aqk, ggml_add(ctx0, identity, causal_mask)); + Aqk = ggml_scale(ctx0, Aqk, scale); // scale q + cb(Aqk, "attn_decay_key", il); + + ggml_tensor * state_t = ggml_cont(ctx0, ggml_transpose(ctx0, state)); + + // v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state or W_[t] @ S_[t] + ggml_tensor * v_prime = ggml_mul_mat(ctx0, state_t, k_cumdecay); + + cb(v_prime, "v_prime", il); + + // v_new = v_i - v_prime or U_[t] - W_[t]*S_[t] + ggml_tensor * v_new = ggml_sub(ctx0, ggml_repeat(ctx0, vb, v_prime), v_prime); + + // v_new_t [T.S.H,B] + ggml_tensor * v_new_t = ggml_cont(ctx0, ggml_transpose(ctx0, v_new)); + + cb(v_new, "v_new", il); + + // attn_inter = (q_i * g[:, :, i, :, None].exp()) @ last_recurrent_state + // or Gamma_[t]*Q_]t] @ S + ggml_tensor * q_gk_exp = ggml_mul(ctx0, q, gkexp); + ggml_tensor * attn_inter = ggml_mul_mat(ctx0, state_t, q_gk_exp); + // scale q at attn_inter as suggested in chunk_gla_fwd_kernel_o of + // github.com/fla-org/flash-linear-attention/fla/ops/gla/chunk.py + attn_inter = ggml_scale(ctx0, attn_inter, scale); // scale q + + cb(attn_inter, "attn_inter", il); + + // core_attn_out[:, :, i] = attn_inter + attn @ v_new or A' @ (U_[t] - W_[t]*S_[t]) + ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, Aqk); + + cb(v_attn, "v_attn", il); + + // o[:, :, i] = (q_i * g_i.exp()) @ S + A @ v_i + ggml_tensor * core_attn_out = ggml_add(ctx0, attn_inter, v_attn); + + cb(core_attn_out, "core_attn_out", il); + + ggml_tensor * gk_cum_last = + ggml_cont(ctx0, ggml_view_4d(ctx0, gk_cumsum, gk_cumsum->ne[0], 1, gk_cumsum->ne[2], gk_cumsum->ne[3], + gk_cumsum->nb[1], gk_cumsum->nb[2], gk_cumsum->nb[3], + gk_cumsum->nb[1] * (gk_cumsum->ne[1] - 1))); + cb(gk_cum_last, "gk_cum_last", il); + + ggml_tensor * gkexp_last = ggml_exp(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, gk_cum_last))); + cb(gkexp_last, "gkexp_last", il); + + ggml_tensor * gk_diff = ggml_neg(ctx0, ggml_sub(ctx0, gk_cumsum, gk_cum_last)); + cb(gk_diff, "gk_diff", il); + + ggml_tensor * gk_diff_exp = ggml_exp(ctx0, gk_diff); + cb(gk_diff_exp, "gk_diff_exp", il); + + ggml_tensor * key_gkdiff = ggml_mul(ctx0, k, gk_diff_exp); + cb(key_gkdiff, "key_gkdiff", il); + + // rearrange((g_i[:,:,-1:] - g_i).exp()*k_i, 'b h c k -> b h k c') @ (U_[t] - W_[t] @ S) + ggml_tensor * kgkdmulvnew = ggml_mul_mat(ctx0, v_new_t, ggml_cont(ctx0, ggml_transpose(ctx0, key_gkdiff))); + cb(kgkdmulvnew, "kgkdmulvnew", il); + + state = ggml_add(ctx0, ggml_mul(ctx0, state, gkexp_last), kgkdmulvnew); + cb(state, "new_state", il); + + // flatten output + ggml_tensor * flat_output = + ggml_cont_1d(ctx0, ggml_permute(ctx0, core_attn_out, 0, 2, 1, 3), S_v * H_v * n_tokens * n_seqs); + + ggml_tensor * flat_state = ggml_cont_1d(ctx0, state, S_v * S_v * H_v * n_seqs); + + return ggml_concat(ctx0, flat_output, flat_state, 0); +} + diff --git a/src/models/models.h b/src/models/models.h index 8b7af8d7bc..bec9d39391 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -287,6 +287,27 @@ 
struct llm_build_kimi_linear : public llm_graph_context_mamba { llm_build_kimi_linear(const llama_model & model, const llm_graph_params & params); private: const llama_model & model; + ggml_tensor * build_kda_recurrent( + ggml_tensor * q, + ggml_tensor * k, + ggml_tensor * v, + ggml_tensor * g, + ggml_tensor * beta, + ggml_tensor * state, + ggml_tensor * causal_mask, + ggml_tensor * identity, + int il); + + ggml_tensor * build_kda_chunking( + ggml_tensor * q, + ggml_tensor * k, + ggml_tensor * v, + ggml_tensor * g, + ggml_tensor * beta, + ggml_tensor * state, + ggml_tensor * causal_mask, + ggml_tensor * identity, + int il); }; struct llm_build_lfm2 : public llm_graph_context { From aba181ebadd1c860201eff4fe702a89b3c9b8a1c Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Mon, 5 Jan 2026 19:21:06 +0800 Subject: [PATCH 23/58] removed LOG_INFO --- src/models/kimi-linear.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index 32a723b80a..a943dd1dce 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -1,6 +1,5 @@ #include "models.h" #include "ggml.h" -#include "llama-impl.h" #define CHUNK_SIZE 64 @@ -58,8 +57,6 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll const auto & layer = model.layers[il]; ggml_tensor * inpSA = inpL; - if (!layer.attn_norm) - LLAMA_LOG_INFO("Empty attn_norm at layer %d\n", il); // Attention Norm cur = build_norm(inpL, layer.attn_norm, NULL, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); From cfed14e31bd02c4c4dc971f9c900b9c2d39ca6fe Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Tue, 6 Jan 2026 11:23:53 +0800 Subject: [PATCH 24/58] naive chunking form implemented --- src/models/kimi-linear.cpp | 223 +++++++++++++++++++++++++++++++++++-- 1 file changed, 214 insertions(+), 9 deletions(-) diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index a943dd1dce..3fb40471a1 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -265,7 +265,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // Choose between build_kda_chunking and build_kda_recurrent based on n_tokens // TODO: Currently only build_kda_recurrent is implemented ggml_tensor * attn_out = n_seq_tokens > CHUNK_SIZE ? 
- build_kda_recurrent(Qcur, Kcur, Vcur, g1, beta, state, causal_mask, identity, il) : + build_kda_chunking(Qcur, Kcur, Vcur, g1, beta, state, causal_mask, identity, il) : build_kda_recurrent(Qcur, Kcur, Vcur, g1, beta, state, causal_mask, identity, il); cb(attn_out, "attn_out", il); @@ -485,7 +485,7 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( GGML_ASSERT(k->ne[2] == n_tokens); GGML_ASSERT(gk->ne[0] == S_v && gk->ne[1] == H_v && gk->ne[2] == n_tokens && gk->ne[3] == n_seqs); GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs); - GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v * H_v && state->ne[2] == 1 && state->ne[3] == n_seqs); + GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v && state->ne[2] == H_v && state->ne[3] == n_seqs); GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs); GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs); @@ -504,8 +504,6 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( const float scale = 1.0f / sqrtf(S_v); - q = ggml_scale(ctx0, q, scale); - beta = ggml_sigmoid(ctx0, beta); cb(q, "q_in", il); @@ -514,8 +512,8 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( cb(beta, "beta_in", il); cb(gk, "gk_in", il); - q = ggml_cont_4d(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); - k = ggml_cont_4d(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); + q = ggml_cont_4d(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3), S_k, n_tokens, H_k, n_seqs); + k = ggml_cont_4d(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3), S_k, n_tokens, H_k, n_seqs); v = ggml_cont_4d(ctx0, ggml_permute(ctx0, v, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); gk = ggml_cont_4d(ctx0, ggml_permute(ctx0, gk, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); @@ -530,20 +528,227 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( cb(beta, "beta_perm", il); cb(gk, "gk_perm", il); cb(state, "state_in", il); - cb(causal_diag_mask, "causal_diag_mask", il); GGML_ASSERT(q->ne[1] == n_tokens && q->ne[0] == S_k && q->ne[2] == H_k && q->ne[3] == n_seqs); GGML_ASSERT(k->ne[1] == n_tokens && k->ne[0] == S_k && k->ne[2] == H_k && k->ne[3] == n_seqs); GGML_ASSERT(v->ne[1] == n_tokens && v->ne[0] == S_v && v->ne[2] == H_k && v->ne[3] == n_seqs); GGML_ASSERT(beta->ne[1] == n_tokens && beta->ne[2] == H_k && beta->ne[0] == 1 && beta->ne[3] == n_seqs); + // Do padding + const int64_t chunk_size = CHUNK_SIZE; + + const int64_t pad = (chunk_size - n_tokens % chunk_size) % chunk_size; + const int64_t n_chunks = (n_tokens + pad) / chunk_size; + + q = ggml_pad(ctx0, q, 0, pad, 0, 0); + k = ggml_pad(ctx0, k, 0, pad, 0, 0); + v = ggml_pad(ctx0, v, 0, pad, 0, 0); + gk = ggml_pad(ctx0, gk, 0, pad, 0, 0); + beta = ggml_pad(ctx0, beta, 0, pad, 0, 0); + + cb(q, "q_pad", il); + cb(k, "k_pad", il); + cb(v, "v_pad", il); + cb(beta, "beta_pad", il); + cb(gk, "gk_pad", il); + ggml_tensor * v_beta = ggml_mul(ctx0, v, beta); ggml_tensor * k_beta = ggml_mul(ctx0, k, beta); - cb(k_beta, "k_beta", il); cb(v_beta, "v_beta", il); + cb(k_beta, "k_beta", il); - return nullptr; + ggml_tensor * chunked_mask = + ggml_view_4d(ctx0, causal_mask, chunk_size, + chunk_size, causal_mask->ne[2], causal_mask->ne[3], + causal_mask->nb[1], causal_mask->nb[2], causal_mask->nb[3], 0); + + ggml_tensor * chunked_diag_mask = + ggml_view_4d(ctx0, causal_diag_mask, chunk_size, + chunk_size, causal_diag_mask->ne[2], causal_diag_mask->ne[3], + causal_diag_mask->nb[1], 
causal_diag_mask->nb[2], causal_diag_mask->nb[3], 0); + + ggml_tensor * chunked_identity = + ggml_view_4d(ctx0, identity, chunk_size, + chunk_size, identity->ne[2], identity->ne[3], + identity->nb[1], identity->nb[2], identity->nb[3], 0); + + const int64_t HB = H_k * n_seqs; + + q = ggml_cont_4d(ctx0, q, S_k, chunk_size, n_chunks, HB); + k = ggml_cont_4d(ctx0, k, S_k, chunk_size, n_chunks, HB); + k_beta = ggml_cont_4d(ctx0, k_beta, S_k, chunk_size, n_chunks, HB); + v = ggml_cont_4d(ctx0, v, S_v, chunk_size, n_chunks, HB); + v_beta = ggml_cont_4d(ctx0, v_beta, S_v, chunk_size, n_chunks, HB); + + gk = ggml_cont_4d(ctx0, gk, S_k, chunk_size, n_chunks, HB); + beta = ggml_cont_4d(ctx0, beta, 1, chunk_size, n_chunks, HB); + + // switch for cumsum + gk = ggml_cont_4d(ctx0, ggml_permute(ctx0, gk, 1, 0, 2, 3), chunk_size, S_k, n_chunks, HB); + ggml_tensor * gk_cumsum = ggml_cumsum(ctx0, gk); + cb(gk_cumsum, "gk_cumsum", il); + + const int64_t CHB = n_chunks * H_v * n_seqs; + + ggml_tensor * g_i = ggml_reshape_4d(ctx0, gk_cumsum, chunk_size, 1, S_k, CHB); + ggml_tensor * g_j = ggml_reshape_4d(ctx0, gk_cumsum, 1, chunk_size, S_k, CHB); + + ggml_tensor * g_j_bc = ggml_repeat_4d(ctx0, g_j, chunk_size, chunk_size, S_k, CHB); + + ggml_tensor * decay_mask = ggml_sub(ctx0, g_j_bc, g_i); + + cb(decay_mask, "decay_mask", il); + + decay_mask = ggml_mul(ctx0, decay_mask, chunked_diag_mask); + decay_mask = ggml_exp(ctx0, decay_mask); + decay_mask = ggml_mul(ctx0, decay_mask, chunked_diag_mask); + cb(decay_mask, "decay_mask_exp", il); + +// k [S,BT,NT,H*B] k_per [BT,S,NT,H*B] + ggml_tensor * k_per = ggml_cont(ctx0, ggml_permute(ctx0, k, 1, 0, 2, 3)); + ggml_tensor * k_i = ggml_reshape_4d(ctx0, k_per, chunk_size, 1, S_k, CHB); + ggml_tensor * k_i_bc = ggml_repeat_4d(ctx0, k_i, chunk_size, chunk_size, S_k, CHB); + ggml_tensor * k_j = ggml_reshape_4d(ctx0, k_per, 1, chunk_size, S_k, CHB); + ggml_tensor * k_j_bc = ggml_repeat_4d(ctx0, k_j, chunk_size, chunk_size, S_k, CHB); + + ggml_tensor * Akk = ggml_mul(ctx0, decay_mask, k_j_bc); + Akk = ggml_mul(ctx0, Akk, k_i_bc); + + Akk = ggml_cont(ctx0, ggml_permute(ctx0, Akk, 1, 2, 0, 3)); + Akk = ggml_sum_rows(ctx0, Akk); + + Akk = ggml_reshape_4d(ctx0, Akk, chunk_size, chunk_size, n_chunks, H_k * n_seqs); + + Akk = ggml_mul(ctx0, Akk, beta); + Akk = ggml_neg(ctx0, ggml_mul(ctx0, Akk, chunked_mask)); + + cb(Akk, "attn_pre_solve", il); + + ggml_tensor * attn_lower = ggml_mul(ctx0, Akk, chunked_mask); + ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, chunked_identity, attn_lower), attn_lower); + + ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, Akk, true, true, false); + Akk = ggml_mul(ctx0, lin_solve, chunked_mask); + Akk = ggml_add(ctx0, Akk, chunked_identity); + + cb(Akk, "attn_solved", il); + + ggml_tensor * vb = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_beta)), Akk); + + gk_cumsum = ggml_cont_4d(ctx0, ggml_permute(ctx0, gk_cumsum, 1, 0, 2, 3), S_k, chunk_size, n_chunks, HB); + ggml_tensor * gkexp = ggml_exp(ctx0, gk_cumsum); + + ggml_tensor * kbeta_gkexp = ggml_mul(ctx0, k_beta, gkexp); + cb(kbeta_gkexp, "kbeta_gkexp", il); + + ggml_tensor * k_cumdecay = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, kbeta_gkexp)), Akk); + cb(k_cumdecay, "k_cumdecay", il); + + ggml_tensor * core_attn_out = nullptr; + ggml_tensor * new_state = ggml_dup(ctx0, state); + + cb(new_state, "new_state", il); + + for (int64_t chunk = 0; chunk < n_chunks; chunk++) { +// for (int64_t chunk = 0; chunk < 1; chunk++) { +// extract one chunk worth of data + auto 
chunkify = [=](ggml_tensor * t) { + return ggml_cont(ctx0, ggml_view_4d(ctx0, t, t->ne[0], chunk_size, 1, t->ne[3], + t->nb[1], t->nb[2], t->nb[3], t->nb[2] * chunk)); + }; + +// k [S,BT,NT,H*B] => k_chunk [S,BT,1,H*B] + ggml_tensor * k_chunk = chunkify(k); + ggml_tensor * q_chunk = chunkify(q); + ggml_tensor * vb_chunk = chunkify(vb); + + // Since decay_mask now has dimension of [BT,BT,S,NT*H*B], it can't be chunkified + // decay_mask_chunk needs to be recomputed +// gk_cumsum [S,BT,NT,H*B] => gk_cs_chunk [S,BT,1,H*B] + ggml_tensor * gk_cs_chunk = chunkify(gk_cumsum); + ggml_tensor * gk_cs_chunk_i = ggml_cont(ctx0, ggml_permute(ctx0, gk_cs_chunk, 2, 0, 1, 3)); + ggml_tensor * gk_cs_chunk_j = ggml_cont(ctx0, ggml_permute(ctx0, gk_cs_chunk, 2, 1, 0, 3)); + + ggml_tensor * gk_cs_chunk_j_bc = ggml_repeat_4d(ctx0, gk_cs_chunk_j, chunk_size, chunk_size, S_k, HB); + ggml_tensor * decay_mask_chunk = ggml_sub(ctx0, gk_cs_chunk_j_bc, gk_cs_chunk_i); + cb(decay_mask_chunk, "decay_mask_chunk", il); + decay_mask_chunk = ggml_mul(ctx0, decay_mask_chunk, chunked_diag_mask); + decay_mask_chunk = ggml_exp(ctx0, decay_mask_chunk); + decay_mask_chunk = ggml_mul(ctx0, decay_mask_chunk, chunked_diag_mask); + cb(decay_mask_chunk, "decay_mask_chunk_exp", il); + + ggml_tensor * k_cumdecay_chunk = chunkify(k_cumdecay); + + ggml_tensor * gkexp_chunk = ggml_exp(ctx0, gk_cs_chunk); + + ggml_tensor * k_chunk_i = ggml_cont(ctx0, ggml_permute(ctx0, k_chunk, 2, 0, 1, 3)); + ggml_tensor * k_chunk_i_bc = ggml_repeat_4d(ctx0, k_chunk_i, chunk_size, chunk_size, S_k, HB); + ggml_tensor * q_chunk_j = ggml_cont(ctx0, ggml_permute(ctx0, q_chunk, 2, 1, 0, 3)); + ggml_tensor * q_chunk_j_bc = ggml_repeat_4d(ctx0, q_chunk_j, chunk_size, chunk_size, S_k, HB); + ggml_tensor * kq = ggml_mul(ctx0, decay_mask_chunk, q_chunk_j_bc); + kq = ggml_mul(ctx0, kq, k_chunk_i_bc); + + ggml_tensor * Aqk = ggml_mul(ctx0, kq, decay_mask_chunk); + Aqk = ggml_mul(ctx0, Aqk, ggml_add(ctx0, chunked_identity, chunked_mask)); + Aqk = ggml_cont(ctx0, ggml_permute(ctx0, Aqk, 1, 2, 0, 3)); + Aqk = ggml_sum_rows(ctx0, Aqk); + Aqk = ggml_scale(ctx0, Aqk, scale); // scale q + Aqk = ggml_reshape_4d(ctx0, Aqk, chunk_size, chunk_size, 1, HB); + + ggml_tensor * state_t = ggml_cont_4d(ctx0, ggml_permute(ctx0, new_state, 1, 0, 2, 3), S_v, S_v, 1, H_v * n_seqs); + +// new_state [S,S,1,H*B] k_cumdecay_chunk [S,BT,1,H*B] + ggml_tensor * v_prime = ggml_mul_mat(ctx0, state_t, k_cumdecay_chunk); + + ggml_tensor * v_new = ggml_sub(ctx0, ggml_repeat(ctx0, vb_chunk, v_prime), v_prime); + ggml_tensor * v_new_t = ggml_cont(ctx0, ggml_transpose(ctx0, v_new)); + +// q_chunk [S,BT,1,H*B] gkexp_chunk [S,BT,1,H*B] + ggml_tensor * q_gk_exp = ggml_mul(ctx0, q_chunk, gkexp_chunk); + ggml_tensor * attn_inter = ggml_mul_mat(ctx0, state_t, q_gk_exp); + attn_inter = ggml_scale(ctx0, attn_inter, scale); // scale q + +// v_new_t [S,BT,1,H*B] Aqk [BT,BT,1,H*B] + ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, Aqk); + + ggml_tensor * core_attn_out_chunk = ggml_add(ctx0, attn_inter, v_attn); + + core_attn_out = core_attn_out == nullptr ? 
core_attn_out_chunk : ggml_concat(ctx0, core_attn_out, core_attn_out_chunk, 1); + + ggml_tensor * gk_cum_last = + ggml_cont(ctx0, ggml_view_4d(ctx0, gk_cs_chunk, gk_cs_chunk->ne[0], 1, gk_cs_chunk->ne[2], gk_cs_chunk->ne[3], + gk_cs_chunk->nb[1], gk_cs_chunk->nb[2], gk_cs_chunk->nb[3], + gk_cs_chunk->nb[1] * (gk_cs_chunk->ne[1] - 1))); + + ggml_tensor * gkexp_last = ggml_exp(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, gk_cum_last))); + + ggml_tensor * gk_diff = ggml_neg(ctx0, ggml_sub(ctx0, gk_cs_chunk, gk_cum_last)); + + ggml_tensor * gk_diff_exp = ggml_exp(ctx0, gk_diff); + + ggml_tensor * key_gkdiff = ggml_mul(ctx0, k_chunk, gk_diff_exp); + + ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, v_new_t, ggml_cont(ctx0, ggml_transpose(ctx0, key_gkdiff))); + + new_state = ggml_add(ctx0, + ggml_mul(ctx0, new_state, ggml_reshape_4d(ctx0, gkexp_last, gkexp_last->ne[0], gkexp_last->ne[1], H_v, n_seqs)), + ggml_reshape_4d(ctx0, kgdmulvnew, kgdmulvnew->ne[0], kgdmulvnew->ne[1], H_v, n_seqs)); + } + + core_attn_out = ggml_cont_4d(ctx0, core_attn_out, S_v, chunk_size * n_chunks, H_v, n_seqs); + + ggml_tensor * output_tokens = ggml_view_4d(ctx0, core_attn_out, S_v, n_tokens, H_v, n_seqs, core_attn_out->nb[1], core_attn_out->nb[2], core_attn_out->nb[3], 0); + cb(output_tokens, "output_tokens", il); + + // flatten output + ggml_tensor * flat_output = + ggml_cont_1d(ctx0, ggml_permute(ctx0, output_tokens, 0, 2, 1, 3), S_v * H_v * n_tokens * n_seqs); + + ggml_tensor * flat_state = ggml_cont_1d(ctx0, new_state, S_v * S_v * H_v * n_seqs); + cb(new_state, "output_state", il); + + return ggml_concat(ctx0, flat_output, flat_state, 0); } ggml_tensor * llm_build_kimi_linear::build_kda_recurrent( From e3542ff8a27384dc6e25d519f0336aedab9a046b Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Tue, 6 Jan 2026 11:35:25 +0800 Subject: [PATCH 25/58] fixed some comments --- src/models/kimi-linear.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index 3fb40471a1..013926e544 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -263,7 +263,6 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll ggml_tensor * state = build_rs(inp_rs, ssm_states_all, hparams.n_embd_s(), n_seqs); state = ggml_reshape_4d(ctx0, state, head_dim, head_dim, n_head, n_seqs); // Choose between build_kda_chunking and build_kda_recurrent based on n_tokens - // TODO: Currently only build_kda_recurrent is implemented ggml_tensor * attn_out = n_seq_tokens > CHUNK_SIZE ? 
build_kda_chunking(Qcur, Kcur, Vcur, g1, beta, state, causal_mask, identity, il) : build_kda_recurrent(Qcur, Kcur, Vcur, g1, beta, state, causal_mask, identity, il); @@ -315,7 +314,6 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll } else if (is_mla) { // === MLA Layer (Multi-head Latent Attention) without KV Cache === // Reference: vLLM mla.py - // TODO: Implement proper KV caching for MLA (requires custom cache format) // Step 1: Q projection and reshape // vLLM Kimi: q = q_proj(hidden_states), then view as [n_tokens, n_head, qk_head_dim] @@ -454,7 +452,8 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll } /* - IMPORTANT: Currently build_kda_chunking is not implemented nor called + This is a ggml implementation of the naive_chunk_kda function of + https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/kda/naive.py */ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( ggml_tensor * q, From 67bee56013ae0c1c68200ddbbea80f91742828b6 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Tue, 6 Jan 2026 21:15:12 +0800 Subject: [PATCH 26/58] add Kimi-K2 specific tokens to be recognized as EOG --- src/llama-vocab.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 7af74b0218..08d559e90f 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -2203,6 +2203,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { || t.first == "<|end_of_text|>" // granite || t.first == "" || t.first == "_" + || t.first == "_" + || t.first == "[EOT]" // Kimi-K2 || t.first == "<|end▁of▁sentence|>" // DeepSeek || t.first == "" // smoldocling ) { @@ -2296,6 +2298,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { || t.first == "" || t.first == "" // Granite || t.first == "" + || t.first == "[PAD]" // Kimi-K2 ) { special_fim_pad_id = t.second; if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { @@ -2368,6 +2371,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { || t.first == "<|eom_id|>" || t.first == "" || t.first == "_" + || t.first == "[EOT]" // Kimi-K2 + || t.first == "[EOS]" // Kimi-K2 || t.first == "<|end_of_text|>" || t.first == "" // smoldocling ) { From 1099cbf694a8d5d85b6ebd0852c21b53bad2ccce Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Wed, 7 Jan 2026 18:42:31 +0800 Subject: [PATCH 27/58] build_kda_autoregressive is implemented to replace build_kda_recurrent for faster inference. 
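For reference, my reading of the single-token path (a sketch in informal notation, not lifted verbatim from the fla kda kernels; S stands for the per-head recurrent state, subscript t for the current token) is that, with q/k already l2-normalized and beta passed through a sigmoid, each decode step reduces to:

    S   = exp(g_t) * S                              (per-key decay of the state)
    S   = S + outer(beta_t * k_t, v_t - k_t @ S)    (delta-rule correction)
    o_t = (q_t / sqrt(S_v)) @ S                     (readout, scaled as in the chunked path)

so when n_seq_tokens == 1 this needs only a few small mat-muls per step and skips the padding to CHUNK_SIZE and the per-chunk loop entirely.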
sync'd to b7682 --- src/models/kimi-linear.cpp | 355 +++++++++++-------------------------- src/models/models.h | 9 +- 2 files changed, 110 insertions(+), 254 deletions(-) diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index 013926e544..270f9e6e6b 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -20,14 +20,16 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // Output ids for selecting which tokens to output ggml_tensor * inp_out_ids = build_inp_out_ids(); - ggml_tensor * causal_mask = - ggml_tri(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, ubatch.n_seq_tokens, ubatch.n_seq_tokens), 1.0f), + ggml_tensor * chunked_causal_mask = + ggml_tri(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, CHUNK_SIZE, CHUNK_SIZE), 1.0f), GGML_TRI_TYPE_LOWER); - ggml_tensor * identity = ggml_diag(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, ubatch.n_seq_tokens), 1.0f)); + ggml_tensor * chunked_identity = ggml_diag(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, CHUNK_SIZE), 1.0f)); + ggml_tensor * chunked_diag_mask = ggml_add(ctx0, chunked_causal_mask, chunked_identity); - ggml_build_forward_expand(gf, causal_mask); - ggml_build_forward_expand(gf, identity); + ggml_build_forward_expand(gf, chunked_causal_mask); + ggml_build_forward_expand(gf, chunked_identity); + ggml_build_forward_expand(gf, chunked_diag_mask); // Kimi dimension constants const int64_t n_head = hparams.n_head(); @@ -263,9 +265,9 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll ggml_tensor * state = build_rs(inp_rs, ssm_states_all, hparams.n_embd_s(), n_seqs); state = ggml_reshape_4d(ctx0, state, head_dim, head_dim, n_head, n_seqs); // Choose between build_kda_chunking and build_kda_recurrent based on n_tokens - ggml_tensor * attn_out = n_seq_tokens > CHUNK_SIZE ? - build_kda_chunking(Qcur, Kcur, Vcur, g1, beta, state, causal_mask, identity, il) : - build_kda_recurrent(Qcur, Kcur, Vcur, g1, beta, state, causal_mask, identity, il); + ggml_tensor * attn_out = n_seq_tokens == 1 ? 
+ build_kda_autoregressive(Qcur, Kcur, Vcur, g1, beta, state, il) : + build_kda_chunking(Qcur, Kcur, Vcur, g1, beta, state, chunked_causal_mask, chunked_identity, chunked_diag_mask, il); cb(attn_out, "attn_out", il); // The tensors were concatenated 1d, so we need to extract them 1d as well @@ -464,6 +466,7 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( ggml_tensor * state, ggml_tensor * causal_mask, ggml_tensor * identity, + ggml_tensor * diag_mask, int il) { GGML_ASSERT(ggml_is_contiguous(q)); GGML_ASSERT(ggml_is_contiguous(k)); @@ -519,8 +522,6 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( beta = ggml_cont(ctx0, ggml_permute(ctx0, beta, 2, 0, 1, 3)); state = ggml_reshape_4d(ctx0, state, S_v, S_v, H_v, n_seqs); - ggml_tensor * causal_diag_mask = ggml_add(ctx0, causal_mask, identity); - cb(q, "q_perm", il); cb(k, "k_perm", il); cb(v, "v_perm", il); @@ -557,21 +558,6 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( cb(v_beta, "v_beta", il); cb(k_beta, "k_beta", il); - ggml_tensor * chunked_mask = - ggml_view_4d(ctx0, causal_mask, chunk_size, - chunk_size, causal_mask->ne[2], causal_mask->ne[3], - causal_mask->nb[1], causal_mask->nb[2], causal_mask->nb[3], 0); - - ggml_tensor * chunked_diag_mask = - ggml_view_4d(ctx0, causal_diag_mask, chunk_size, - chunk_size, causal_diag_mask->ne[2], causal_diag_mask->ne[3], - causal_diag_mask->nb[1], causal_diag_mask->nb[2], causal_diag_mask->nb[3], 0); - - ggml_tensor * chunked_identity = - ggml_view_4d(ctx0, identity, chunk_size, - chunk_size, identity->ne[2], identity->ne[3], - identity->nb[1], identity->nb[2], identity->nb[3], 0); - const int64_t HB = H_k * n_seqs; q = ggml_cont_4d(ctx0, q, S_k, chunk_size, n_chunks, HB); @@ -588,6 +574,14 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( ggml_tensor * gk_cumsum = ggml_cumsum(ctx0, gk); cb(gk_cumsum, "gk_cumsum", il); +/* + https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/kda/naive.py + + for i in range(T): + k_i = k[..., i, :] + g_i = g[..., i:i+1, :] + A[..., i] = torch.einsum('... c d, ... d -> ... 
c', k * (g - g_i).exp(), k_i) +*/ const int64_t CHB = n_chunks * H_v * n_seqs; ggml_tensor * g_i = ggml_reshape_4d(ctx0, gk_cumsum, chunk_size, 1, S_k, CHB); @@ -599,9 +593,9 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( cb(decay_mask, "decay_mask", il); - decay_mask = ggml_mul(ctx0, decay_mask, chunked_diag_mask); + decay_mask = ggml_mul(ctx0, decay_mask, diag_mask); decay_mask = ggml_exp(ctx0, decay_mask); - decay_mask = ggml_mul(ctx0, decay_mask, chunked_diag_mask); + decay_mask = ggml_mul(ctx0, decay_mask, diag_mask); cb(decay_mask, "decay_mask_exp", il); // k [S,BT,NT,H*B] k_per [BT,S,NT,H*B] @@ -620,19 +614,27 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( Akk = ggml_reshape_4d(ctx0, Akk, chunk_size, chunk_size, n_chunks, H_k * n_seqs); Akk = ggml_mul(ctx0, Akk, beta); - Akk = ggml_neg(ctx0, ggml_mul(ctx0, Akk, chunked_mask)); + Akk = ggml_neg(ctx0, ggml_mul(ctx0, Akk, causal_mask)); cb(Akk, "attn_pre_solve", il); - ggml_tensor * attn_lower = ggml_mul(ctx0, Akk, chunked_mask); - ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, chunked_identity, attn_lower), attn_lower); + // for i in range(1, chunk_size): + // row = attn[..., i, :i].clone() + // sub = attn[..., :i, :i].clone() + // attn[..., i, :i] = row + (row.unsqueeze(-1) * sub).sum(-2) + // attn = attn + torch.eye(chunk_size, dtype=attn.dtype, device=attn.device) + // + // We reduce this to a linear triangular solve: AX = B, where B = attn, A = I - tril(A) + ggml_tensor * attn_lower = ggml_mul(ctx0, Akk, causal_mask); + ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, identity, attn_lower), attn_lower); ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, Akk, true, true, false); - Akk = ggml_mul(ctx0, lin_solve, chunked_mask); - Akk = ggml_add(ctx0, Akk, chunked_identity); + Akk = ggml_mul(ctx0, lin_solve, causal_mask); + Akk = ggml_add(ctx0, Akk, identity); cb(Akk, "attn_solved", il); + // u = (A*beta[..., None, :]) @ v aka U_[t] ggml_tensor * vb = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_beta)), Akk); gk_cumsum = ggml_cont_4d(ctx0, ggml_permute(ctx0, gk_cumsum, 1, 0, 2, 3), S_k, chunk_size, n_chunks, HB); @@ -650,7 +652,6 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( cb(new_state, "new_state", il); for (int64_t chunk = 0; chunk < n_chunks; chunk++) { -// for (int64_t chunk = 0; chunk < 1; chunk++) { // extract one chunk worth of data auto chunkify = [=](ggml_tensor * t) { return ggml_cont(ctx0, ggml_view_4d(ctx0, t, t->ne[0], chunk_size, 1, t->ne[3], @@ -672,15 +673,22 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( ggml_tensor * gk_cs_chunk_j_bc = ggml_repeat_4d(ctx0, gk_cs_chunk_j, chunk_size, chunk_size, S_k, HB); ggml_tensor * decay_mask_chunk = ggml_sub(ctx0, gk_cs_chunk_j_bc, gk_cs_chunk_i); cb(decay_mask_chunk, "decay_mask_chunk", il); - decay_mask_chunk = ggml_mul(ctx0, decay_mask_chunk, chunked_diag_mask); + decay_mask_chunk = ggml_mul(ctx0, decay_mask_chunk, diag_mask); decay_mask_chunk = ggml_exp(ctx0, decay_mask_chunk); - decay_mask_chunk = ggml_mul(ctx0, decay_mask_chunk, chunked_diag_mask); + decay_mask_chunk = ggml_mul(ctx0, decay_mask_chunk, diag_mask); cb(decay_mask_chunk, "decay_mask_chunk_exp", il); ggml_tensor * k_cumdecay_chunk = chunkify(k_cumdecay); ggml_tensor * gkexp_chunk = ggml_exp(ctx0, gk_cs_chunk); +/* + https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/kda/naive.py + for j in range(BT): + k_j = k[:, :, i, j] + g_j = g[:, :, i, j:j+1, :] + A[..., j] = torch.einsum('... c d, ... d -> ... 
c', q_i * (g_i - g_j).exp(), k_j) +*/ ggml_tensor * k_chunk_i = ggml_cont(ctx0, ggml_permute(ctx0, k_chunk, 2, 0, 1, 3)); ggml_tensor * k_chunk_i_bc = ggml_repeat_4d(ctx0, k_chunk_i, chunk_size, chunk_size, S_k, HB); ggml_tensor * q_chunk_j = ggml_cont(ctx0, ggml_permute(ctx0, q_chunk, 2, 1, 0, 3)); @@ -689,7 +697,7 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( kq = ggml_mul(ctx0, kq, k_chunk_i_bc); ggml_tensor * Aqk = ggml_mul(ctx0, kq, decay_mask_chunk); - Aqk = ggml_mul(ctx0, Aqk, ggml_add(ctx0, chunked_identity, chunked_mask)); + Aqk = ggml_mul(ctx0, Aqk, ggml_add(ctx0, identity, causal_mask)); Aqk = ggml_cont(ctx0, ggml_permute(ctx0, Aqk, 1, 2, 0, 3)); Aqk = ggml_sum_rows(ctx0, Aqk); Aqk = ggml_scale(ctx0, Aqk, scale); // scale q @@ -697,20 +705,26 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( ggml_tensor * state_t = ggml_cont_4d(ctx0, ggml_permute(ctx0, new_state, 1, 0, 2, 3), S_v, S_v, 1, H_v * n_seqs); -// new_state [S,S,1,H*B] k_cumdecay_chunk [S,BT,1,H*B] + // new_state [S,S,1,H*B] k_cumdecay_chunk [S,BT,1,H*B] + // v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state or W_[t] @ S_[t] ggml_tensor * v_prime = ggml_mul_mat(ctx0, state_t, k_cumdecay_chunk); + // v_new = v_i - v_prime or U_[t] - W_[t]*S_[t] ggml_tensor * v_new = ggml_sub(ctx0, ggml_repeat(ctx0, vb_chunk, v_prime), v_prime); ggml_tensor * v_new_t = ggml_cont(ctx0, ggml_transpose(ctx0, v_new)); -// q_chunk [S,BT,1,H*B] gkexp_chunk [S,BT,1,H*B] + // q_chunk [S,BT,1,H*B] gkexp_chunk [S,BT,1,H*B] + // attn_inter = (q_i * g[:, :, i, :, None].exp()) @ last_recurrent_state + // or Gamma_[t]*Q_]t] @ S ggml_tensor * q_gk_exp = ggml_mul(ctx0, q_chunk, gkexp_chunk); ggml_tensor * attn_inter = ggml_mul_mat(ctx0, state_t, q_gk_exp); attn_inter = ggml_scale(ctx0, attn_inter, scale); // scale q -// v_new_t [S,BT,1,H*B] Aqk [BT,BT,1,H*B] + // v_new_t [S,BT,1,H*B] Aqk [BT,BT,1,H*B] + // core_attn_out[:, :, i] = attn_inter + attn @ v_new or A' @ (U_[t] - W_[t]*S_[t]) ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, Aqk); + // o[:, :, i] = (q_i * g_i.exp()) @ S + A @ v_i ggml_tensor * core_attn_out_chunk = ggml_add(ctx0, attn_inter, v_attn); core_attn_out = core_attn_out == nullptr ? 
core_attn_out_chunk : ggml_concat(ctx0, core_attn_out, core_attn_out_chunk, 1); @@ -728,6 +742,7 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( ggml_tensor * key_gkdiff = ggml_mul(ctx0, k_chunk, gk_diff_exp); + // rearrange((g_i[:,:,-1:] - g_i).exp()*k_i, 'b h c k -> b h k c') @ (U_[t] - W_[t] @ S) ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, v_new_t, ggml_cont(ctx0, ggml_transpose(ctx0, key_gkdiff))); new_state = ggml_add(ctx0, @@ -750,256 +765,98 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( return ggml_concat(ctx0, flat_output, flat_state, 0); } -ggml_tensor * llm_build_kimi_linear::build_kda_recurrent( +ggml_tensor * llm_build_kimi_linear::build_kda_autoregressive( ggml_tensor * q, ggml_tensor * k, ggml_tensor * v, ggml_tensor * gk, ggml_tensor * beta, - ggml_tensor * state, - ggml_tensor * causal_mask, - ggml_tensor * identity, + ggml_tensor * state, int il) { GGML_ASSERT(ggml_is_contiguous(q)); GGML_ASSERT(ggml_is_contiguous(k)); - GGML_ASSERT(ggml_is_contiguous(v)); + GGML_ASSERT(ggml_is_contiguous(v)); GGML_ASSERT(ggml_is_contiguous(gk)); GGML_ASSERT(ggml_is_contiguous(beta)); GGML_ASSERT(ggml_is_contiguous(state)); - + const int64_t S_k = q->ne[0]; const int64_t H_k = q->ne[1]; const int64_t n_tokens = q->ne[2]; const int64_t n_seqs = q->ne[3]; - + const int64_t S_v = v->ne[0]; const int64_t H_v = v->ne[1]; - + + GGML_ASSERT(n_tokens == 1); GGML_ASSERT(v->ne[2] == n_tokens); GGML_ASSERT(k->ne[2] == n_tokens); - GGML_ASSERT(gk->ne[0] == S_k && gk->ne[1] == H_v && gk->ne[2] == n_tokens && gk->ne[3] == n_seqs); + GGML_ASSERT(gk->ne[0] == S_k && gk->ne[1] == H_k && gk->ne[2] == n_tokens && gk->ne[3] == n_seqs); GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs); - GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v && state->ne[2] == H_v && state->ne[3] == n_seqs); - + GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_k && state->ne[2] == H_v && state->ne[3] == n_seqs); + GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs); GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs); - - GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case - - // TODO: can this ever be false? 
- const bool use_qk_l2norm = true; - - if (use_qk_l2norm) { - const float eps_norm = hparams.f_norm_rms_eps; - - q = ggml_l2_norm(ctx0, q, eps_norm); - k = ggml_l2_norm(ctx0, k, eps_norm); - } - - const float scale = 1.0f / sqrtf(S_v); - - beta = ggml_sigmoid(ctx0, beta); - ggml_tensor * causal_diag_mask = ggml_add(ctx0, causal_mask, identity); + GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case + + const float eps_norm = hparams.f_norm_rms_eps; + + q = ggml_l2_norm(ctx0, q, eps_norm); + k = ggml_l2_norm(ctx0, k, eps_norm); + + const float scale = 1.0f / sqrtf(S_v); + + q = ggml_scale(ctx0, q, scale); + beta = ggml_sigmoid(ctx0, beta); cb(q, "q_in", il); cb(k, "k_in", il); cb(v, "v_in", il); cb(beta, "beta_in", il); cb(gk, "gk_in", il); - - q = ggml_cont_4d(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); - k = ggml_cont_4d(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); - v = ggml_cont_4d(ctx0, ggml_permute(ctx0, v, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); - gk = ggml_cont_4d(ctx0, ggml_permute(ctx0, gk, 1, 2, 0, 3), n_tokens, S_k, H_k, n_seqs); - - beta = ggml_cont(ctx0, ggml_permute(ctx0, beta, 2, 0, 1, 3)); - state = ggml_reshape_4d(ctx0, state, S_v, S_v, H_v, n_seqs); - - cb(q, "q_perm", il); - cb(k, "k_perm", il); - cb(v, "v_perm", il); - cb(beta, "beta_perm", il); - cb(gk, "gk_perm", il); - cb(state, "state_in", il); - - GGML_ASSERT(q->ne[1] == n_tokens && q->ne[0] == S_k && q->ne[2] == H_k && q->ne[3] == n_seqs); - GGML_ASSERT(k->ne[1] == n_tokens && k->ne[0] == S_k && k->ne[2] == H_k && k->ne[3] == n_seqs); - GGML_ASSERT(v->ne[1] == n_tokens && v->ne[0] == S_v && v->ne[2] == H_k && v->ne[3] == n_seqs); - GGML_ASSERT(beta->ne[1] == n_tokens && beta->ne[2] == H_k && beta->ne[0] == 1 && beta->ne[3] == n_seqs); - // ========================================================================= - // Compute cumulative sum of gk per key dimension - // gk_cumsum: [S_k, n_tokens, H_k, n_seqs] - cumsum along dim 1 (tokens) - // ========================================================================= - ggml_tensor * gk_cumsum = ggml_cumsum(ctx0, gk); - cb(gk_cumsum, "gk_cumsum", il); +// g [H,1,B,1] g_t [1,H,B,1] => [1,1,H,B] +// gk [S,H,1,B] => [S,1,H,B] gk_t [1,S,H,B] +// beta [H,1,1,B] beta_t [1,H,1,B] => [1,1,H,B] + gk = ggml_reshape_4d(ctx0, gk, S_k, 1, H_k, n_seqs); + ggml_tensor * gk_t = ggml_cont(ctx0, ggml_transpose(ctx0, gk)); + ggml_tensor * beta_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, beta), 1, 1, H_k, n_seqs); - // Scale k and k_beta - - ggml_tensor * k_beta = ggml_mul(ctx0, k, beta); - ggml_tensor * v_beta = ggml_mul(ctx0, v, beta); - - cb(k_beta, "k_beta", il); - cb(v_beta, "v_beta", il); + // Apply exponential to gk_t + gk_t = ggml_exp(ctx0, gk_t); + // Apply the gated delta rule for the single timestep + // last_recurrent_state = last_recurrent_state * gk_t + // S = S * g_i[..., None].exp() + state = ggml_mul(ctx0, state, gk_t); - -/* - https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/kda/naive.py - - for i in range(T): - k_i = k[..., i, :] - g_i = g[..., i:i+1, :] - A[..., i] = torch.einsum('... c d, ... d -> ... 
c', k * (g - g_i).exp(), k_i) -*/ - const int64_t HB = H_k * n_seqs; - ggml_tensor * k_per = ggml_cont(ctx0, ggml_permute(ctx0, k, 1, 0, 2, 3)); - ggml_tensor * k_i = ggml_reshape_4d(ctx0, k_per, n_tokens, 1, S_k, HB); - ggml_tensor * k_i_bc = ggml_repeat_4d(ctx0, k_i, n_tokens, n_tokens, S_k, HB); - ggml_tensor * g_i = ggml_reshape_4d(ctx0, gk_cumsum, n_tokens, 1, S_k, HB); - ggml_tensor * g_i_bc = ggml_repeat_4d(ctx0, g_i, n_tokens, n_tokens, S_k, HB); // [S_k, chunk_size, 1, HB] -> [S_k, chunk_size, chunk_size, HB] - - ggml_tensor * k_j = ggml_reshape_4d(ctx0, k_per, 1, n_tokens, S_k, HB); - ggml_tensor * k_j_bc = ggml_repeat_4d(ctx0, k_j, n_tokens, n_tokens, S_k, HB); - - ggml_tensor * g_j = ggml_reshape_4d(ctx0, gk_cumsum, 1, n_tokens, S_k, HB); - ggml_tensor * g_j_bc = ggml_repeat_4d(ctx0, g_j, n_tokens, n_tokens, S_k, HB); // [S_k, 1, chunk_size, HB] -> [S_k, chunk_size, chunk_size, HB] - - ggml_tensor * decay_mask = ggml_sub(ctx0, g_j_bc, g_i_bc); - cb(decay_mask, "decay_mask", il); - decay_mask = ggml_mul(ctx0, decay_mask, causal_diag_mask); - decay_mask = ggml_exp(ctx0, decay_mask); - decay_mask = ggml_mul(ctx0, decay_mask, causal_diag_mask); - cb(decay_mask, "decay_mask_exp", il); - - ggml_tensor * Akk = ggml_mul(ctx0, decay_mask, k_j_bc); - Akk = ggml_mul(ctx0, Akk, k_i_bc); - - Akk = ggml_cont(ctx0, ggml_permute(ctx0, Akk, 1, 2, 0, 3)); - Akk = ggml_sum_rows(ctx0, Akk); - - Akk = ggml_reshape_4d(ctx0, Akk, n_tokens, n_tokens, H_k, n_seqs); - - Akk = ggml_mul(ctx0, Akk, beta); - Akk = ggml_neg(ctx0, ggml_mul(ctx0, Akk, causal_mask)); - - cb(Akk, "attn_pre_rec", il); - - // for i in range(1, chunk_size): - // row = attn[..., i, :i].clone() - // sub = attn[..., :i, :i].clone() - // attn[..., i, :i] = row + (row.unsqueeze(-1) * sub).sum(-2) - // attn = attn + torch.eye(chunk_size, dtype=attn.dtype, device=attn.device) - // - // We reduce this to a linear triangular solve: AX = B, where B = attn, A = I - tril(A) - ggml_tensor * attn_lower = ggml_mul(ctx0, Akk, causal_mask); - ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, identity, attn_lower), attn_lower); - - ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, Akk, true, true, false); - Akk = ggml_mul(ctx0, lin_solve, causal_mask); - Akk = ggml_add(ctx0, Akk, identity); - - gk_cumsum = ggml_cont(ctx0, ggml_permute(ctx0, gk_cumsum, 1, 0, 2, 3)); // back to [S_k, n_tokens, H_k, n_seqs] - - // u = (A*beta[..., None, :]) @ v aka U_[t] - ggml_tensor * vb = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_beta)), Akk); - cb(vb, "value_beta", il); - - // k_cumdecay = attn @ (k_beta * g.exp().unsqueeze(-1)) or W_[t] - ggml_tensor * gkexp = ggml_exp(ctx0, gk_cumsum); // [S,T,H,B] - - ggml_tensor * kbeta_gkexp = ggml_mul(ctx0, k_beta, gkexp); - cb(kbeta_gkexp, "kbeta_gkexp", il); - - ggml_tensor * k_cumdecay = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, kbeta_gkexp)), Akk); - cb(k_cumdecay, "k_cumdecay", il); - -/* - https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/kda/naive.py - - for j in range(BT): - k_j = k[:, :, i, j] - g_j = g[:, :, i, j:j+1, :] - A[..., j] = torch.einsum('... c d, ... d -> ... 
c', q_i * (g_i - g_j).exp(), k_j) -*/ - ggml_tensor * q_per = ggml_cont(ctx0, ggml_permute(ctx0, q, 1, 0, 2, 3)); - ggml_tensor * q_j = ggml_reshape_4d(ctx0, q_per, 1, n_tokens, S_k, HB); - ggml_tensor * q_j_bc = ggml_repeat_4d(ctx0, q_j, n_tokens, n_tokens, S_k, HB); - ggml_tensor * kq = ggml_mul(ctx0, decay_mask, q_j_bc); - kq = ggml_mul(ctx0, kq, k_i_bc); - kq = ggml_cont(ctx0, ggml_permute(ctx0, kq, 1, 2, 0, 3)); - - ggml_tensor * Aqk = ggml_sum_rows(ctx0, kq); - Aqk = ggml_cont(ctx0, ggml_reshape_4d(ctx0, Aqk, n_tokens, n_tokens, H_k, n_seqs)); - Aqk = ggml_mul(ctx0, Aqk, ggml_add(ctx0, identity, causal_mask)); - Aqk = ggml_scale(ctx0, Aqk, scale); // scale q - cb(Aqk, "attn_decay_key", il); - ggml_tensor * state_t = ggml_cont(ctx0, ggml_transpose(ctx0, state)); - - // v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state or W_[t] @ S_[t] - ggml_tensor * v_prime = ggml_mul_mat(ctx0, state_t, k_cumdecay); - cb(v_prime, "v_prime", il); +// state [S,S,H,B] k [S,1,H,B] k_state [S_v,1,H,B] + k = ggml_reshape_4d(ctx0, k, S_k, 1, H_k, n_seqs); + ggml_tensor * k_state = ggml_mul_mat(ctx0, state_t, k); - // v_new = v_i - v_prime or U_[t] - W_[t]*S_[t] - ggml_tensor * v_new = ggml_sub(ctx0, ggml_repeat(ctx0, vb, v_prime), v_prime); + // v_i - (k_i[..., None] * S).sum(-2) + v = ggml_reshape_4d(ctx0, v, S_v, 1, H_v, n_seqs); + ggml_tensor * v_diff = ggml_sub(ctx0, v, k_state); - // v_new_t [T.S.H,B] - ggml_tensor * v_new_t = ggml_cont(ctx0, ggml_transpose(ctx0, v_new)); + // b_i[..., None] * k_i + ggml_tensor * k_beta = ggml_mul(ctx0, k, beta_t); - cb(v_new, "v_new", il); + // S = S + torch.einsum('b h k, b h v -> b h k v', b_i[..., None] * k_i, v_i - (k_i[..., None] * S).sum(-2)) + // v_diff_t [1,S_v,H,B] k_beta_t [1,S_k,H,B] state [S_v,S_k,H,B] + state = ggml_add(ctx0, state, ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_diff)), ggml_cont(ctx0, ggml_transpose(ctx0, k_beta)))); - // attn_inter = (q_i * g[:, :, i, :, None].exp()) @ last_recurrent_state - // or Gamma_[t]*Q_]t] @ S - ggml_tensor * q_gk_exp = ggml_mul(ctx0, q, gkexp); - ggml_tensor * attn_inter = ggml_mul_mat(ctx0, state_t, q_gk_exp); - // scale q at attn_inter as suggested in chunk_gla_fwd_kernel_o of - // github.com/fla-org/flash-linear-attention/fla/ops/gla/chunk.py - attn_inter = ggml_scale(ctx0, attn_inter, scale); // scale q - - cb(attn_inter, "attn_inter", il); - - // core_attn_out[:, :, i] = attn_inter + attn @ v_new or A' @ (U_[t] - W_[t]*S_[t]) - ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, Aqk); - - cb(v_attn, "v_attn", il); - - // o[:, :, i] = (q_i * g_i.exp()) @ S + A @ v_i - ggml_tensor * core_attn_out = ggml_add(ctx0, attn_inter, v_attn); - - cb(core_attn_out, "core_attn_out", il); - - ggml_tensor * gk_cum_last = - ggml_cont(ctx0, ggml_view_4d(ctx0, gk_cumsum, gk_cumsum->ne[0], 1, gk_cumsum->ne[2], gk_cumsum->ne[3], - gk_cumsum->nb[1], gk_cumsum->nb[2], gk_cumsum->nb[3], - gk_cumsum->nb[1] * (gk_cumsum->ne[1] - 1))); - cb(gk_cum_last, "gk_cum_last", il); - - ggml_tensor * gkexp_last = ggml_exp(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, gk_cum_last))); - cb(gkexp_last, "gkexp_last", il); - - ggml_tensor * gk_diff = ggml_neg(ctx0, ggml_sub(ctx0, gk_cumsum, gk_cum_last)); - cb(gk_diff, "gk_diff", il); - - ggml_tensor * gk_diff_exp = ggml_exp(ctx0, gk_diff); - cb(gk_diff_exp, "gk_diff_exp", il); - - ggml_tensor * key_gkdiff = ggml_mul(ctx0, k, gk_diff_exp); - cb(key_gkdiff, "key_gkdiff", il); - - // rearrange((g_i[:,:,-1:] - g_i).exp()*k_i, 'b h c k -> b h k c') @ (U_[t] - W_[t] @ S) - ggml_tensor * 
kgkdmulvnew = ggml_mul_mat(ctx0, v_new_t, ggml_cont(ctx0, ggml_transpose(ctx0, key_gkdiff))); - cb(kgkdmulvnew, "kgkdmulvnew", il); - - state = ggml_add(ctx0, ggml_mul(ctx0, state, gkexp_last), kgkdmulvnew); + q = ggml_reshape_4d(ctx0, q, S_k, 1, H_k, n_seqs); + state_t = ggml_cont(ctx0, ggml_transpose(ctx0, state)); + ggml_tensor * core_attn_out = ggml_mul_mat(ctx0, state_t, q); + // core_attn_out should be [S_v, 1, H_v, n_seqs] after this + cb(core_attn_out, "output_tokens", il); cb(state, "new_state", il); - // flatten output - ggml_tensor * flat_output = - ggml_cont_1d(ctx0, ggml_permute(ctx0, core_attn_out, 0, 2, 1, 3), S_v * H_v * n_tokens * n_seqs); - - ggml_tensor * flat_state = ggml_cont_1d(ctx0, state, S_v * S_v * H_v * n_seqs); + // flatten output, no need to permute since n_tokens is 1 so [S_v, 1, H_v, n_seqs] and [S_v, H_v, 1, n_seqs] are equivalent memory-layout wise + ggml_tensor * flat_output = ggml_reshape_1d(ctx0, core_attn_out, S_v * H_v * n_tokens * n_seqs); + ggml_tensor * flat_state = ggml_reshape_1d(ctx0, state, S_v * S_v * H_v * n_seqs); return ggml_concat(ctx0, flat_output, flat_state, 0); } diff --git a/src/models/models.h b/src/models/models.h index ba2b905c5e..3ed00aae32 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -288,26 +288,25 @@ struct llm_build_kimi_linear : public llm_graph_context_mamba { llm_build_kimi_linear(const llama_model & model, const llm_graph_params & params); private: const llama_model & model; - ggml_tensor * build_kda_recurrent( + ggml_tensor * build_kda_autoregressive( ggml_tensor * q, ggml_tensor * k, ggml_tensor * v, - ggml_tensor * g, + ggml_tensor * gk, ggml_tensor * beta, ggml_tensor * state, - ggml_tensor * causal_mask, - ggml_tensor * identity, int il); ggml_tensor * build_kda_chunking( ggml_tensor * q, ggml_tensor * k, ggml_tensor * v, - ggml_tensor * g, + ggml_tensor * gk, ggml_tensor * beta, ggml_tensor * state, ggml_tensor * causal_mask, ggml_tensor * identity, + ggml_tensor * diag_mask, int il); }; From f99913dd5fb15027839d7986bd13b1e5dc4cb60f Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Thu, 8 Jan 2026 13:40:17 +0800 Subject: [PATCH 28/58] replaced Akk and Aqk with mul_mat and clamp --- src/models/kimi-linear.cpp | 105 +++++++++++++++---------------------- 1 file changed, 43 insertions(+), 62 deletions(-) diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index 270f9e6e6b..b229d31165 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -571,48 +571,40 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( // switch for cumsum gk = ggml_cont_4d(ctx0, ggml_permute(ctx0, gk, 1, 0, 2, 3), chunk_size, S_k, n_chunks, HB); + cb(gk, "gk", il); ggml_tensor * gk_cumsum = ggml_cumsum(ctx0, gk); cb(gk_cumsum, "gk_cumsum", il); + // switch back for downstream + gk_cumsum = ggml_cont_4d(ctx0, ggml_permute(ctx0, gk_cumsum, 1, 0, 2, 3), S_k, chunk_size, n_chunks, HB); + ggml_tensor * gkexp = ggml_exp(ctx0, gk_cumsum); + + cb(gk_cumsum, "gk_cumsum", il); /* - https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/kda/naive.py - - for i in range(T): - k_i = k[..., i, :] - g_i = g[..., i:i+1, :] + for i in range(BT): + k_i = k[..., i, :] # k_i [B,H,NT,S] + g_i = g[..., i:i+1, :] # g_i [B,H,NT,1,S] A[..., i] = torch.einsum('... c d, ... d -> ... 
c', k * (g - g_i).exp(), k_i) */ - const int64_t CHB = n_chunks * H_v * n_seqs; + // gk_ref: [S, 1, C, HB] - first token of i_block + ggml_tensor * gk_ref = ggml_view_4d(ctx0, gk_cumsum, + S_k, 1, n_chunks, HB, + gk_cumsum->nb[1], gk_cumsum->nb[2], gk_cumsum->nb[3], + 0); + cb(gk_ref, "gk_ref", il); - ggml_tensor * g_i = ggml_reshape_4d(ctx0, gk_cumsum, chunk_size, 1, S_k, CHB); - ggml_tensor * g_j = ggml_reshape_4d(ctx0, gk_cumsum, 1, chunk_size, S_k, CHB); - - ggml_tensor * g_j_bc = ggml_repeat_4d(ctx0, g_j, chunk_size, chunk_size, S_k, CHB); - - ggml_tensor * decay_mask = ggml_sub(ctx0, g_j_bc, g_i); - - cb(decay_mask, "decay_mask", il); - - decay_mask = ggml_mul(ctx0, decay_mask, diag_mask); - decay_mask = ggml_exp(ctx0, decay_mask); - decay_mask = ggml_mul(ctx0, decay_mask, diag_mask); - cb(decay_mask, "decay_mask_exp", il); - -// k [S,BT,NT,H*B] k_per [BT,S,NT,H*B] - ggml_tensor * k_per = ggml_cont(ctx0, ggml_permute(ctx0, k, 1, 0, 2, 3)); - ggml_tensor * k_i = ggml_reshape_4d(ctx0, k_per, chunk_size, 1, S_k, CHB); - ggml_tensor * k_i_bc = ggml_repeat_4d(ctx0, k_i, chunk_size, chunk_size, S_k, CHB); - ggml_tensor * k_j = ggml_reshape_4d(ctx0, k_per, 1, chunk_size, S_k, CHB); - ggml_tensor * k_j_bc = ggml_repeat_4d(ctx0, k_j, chunk_size, chunk_size, S_k, CHB); - - ggml_tensor * Akk = ggml_mul(ctx0, decay_mask, k_j_bc); - Akk = ggml_mul(ctx0, Akk, k_i_bc); - - Akk = ggml_cont(ctx0, ggml_permute(ctx0, Akk, 1, 2, 0, 3)); - Akk = ggml_sum_rows(ctx0, Akk); - - Akk = ggml_reshape_4d(ctx0, Akk, chunk_size, chunk_size, n_chunks, H_k * n_seqs); + // Compute gk_diff + ggml_tensor * gk_diff_j = ggml_sub(ctx0, gk_cumsum, ggml_repeat(ctx0, gk_ref, gk_cumsum)); + ggml_tensor * gk_diff_i = ggml_clamp(ctx0, ggml_neg(ctx0, gk_diff_j), 0.0f, 88.0f); + cb(gk_diff_j, "gk_diff_j", il); + cb(gk_diff_i, "gk_diff_i", il); + // Decay k + ggml_tensor * k_exp_j = ggml_mul(ctx0, k, ggml_exp(ctx0, gk_diff_j)); + ggml_tensor * k_exp_i = ggml_mul(ctx0, k, ggml_exp(ctx0, gk_diff_i)); + ggml_tensor * Akk = ggml_mul_mat(ctx0, k_exp_i, k_exp_j); + cb(Akk, "Akk", il); + Akk = ggml_mul(ctx0, Akk, beta); Akk = ggml_neg(ctx0, ggml_mul(ctx0, Akk, causal_mask)); @@ -637,9 +629,6 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( // u = (A*beta[..., None, :]) @ v aka U_[t] ggml_tensor * vb = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_beta)), Akk); - gk_cumsum = ggml_cont_4d(ctx0, ggml_permute(ctx0, gk_cumsum, 1, 0, 2, 3), S_k, chunk_size, n_chunks, HB); - ggml_tensor * gkexp = ggml_exp(ctx0, gk_cumsum); - ggml_tensor * kbeta_gkexp = ggml_mul(ctx0, k_beta, gkexp); cb(kbeta_gkexp, "kbeta_gkexp", il); @@ -663,23 +652,9 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( ggml_tensor * q_chunk = chunkify(q); ggml_tensor * vb_chunk = chunkify(vb); - // Since decay_mask now has dimension of [BT,BT,S,NT*H*B], it can't be chunkified - // decay_mask_chunk needs to be recomputed // gk_cumsum [S,BT,NT,H*B] => gk_cs_chunk [S,BT,1,H*B] ggml_tensor * gk_cs_chunk = chunkify(gk_cumsum); - ggml_tensor * gk_cs_chunk_i = ggml_cont(ctx0, ggml_permute(ctx0, gk_cs_chunk, 2, 0, 1, 3)); - ggml_tensor * gk_cs_chunk_j = ggml_cont(ctx0, ggml_permute(ctx0, gk_cs_chunk, 2, 1, 0, 3)); - - ggml_tensor * gk_cs_chunk_j_bc = ggml_repeat_4d(ctx0, gk_cs_chunk_j, chunk_size, chunk_size, S_k, HB); - ggml_tensor * decay_mask_chunk = ggml_sub(ctx0, gk_cs_chunk_j_bc, gk_cs_chunk_i); - cb(decay_mask_chunk, "decay_mask_chunk", il); - decay_mask_chunk = ggml_mul(ctx0, decay_mask_chunk, diag_mask); - decay_mask_chunk = ggml_exp(ctx0, 
decay_mask_chunk); - decay_mask_chunk = ggml_mul(ctx0, decay_mask_chunk, diag_mask); - cb(decay_mask_chunk, "decay_mask_chunk_exp", il); - ggml_tensor * k_cumdecay_chunk = chunkify(k_cumdecay); - ggml_tensor * gkexp_chunk = ggml_exp(ctx0, gk_cs_chunk); /* https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/kda/naive.py @@ -689,19 +664,25 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( g_j = g[:, :, i, j:j+1, :] A[..., j] = torch.einsum('... c d, ... d -> ... c', q_i * (g_i - g_j).exp(), k_j) */ - ggml_tensor * k_chunk_i = ggml_cont(ctx0, ggml_permute(ctx0, k_chunk, 2, 0, 1, 3)); - ggml_tensor * k_chunk_i_bc = ggml_repeat_4d(ctx0, k_chunk_i, chunk_size, chunk_size, S_k, HB); - ggml_tensor * q_chunk_j = ggml_cont(ctx0, ggml_permute(ctx0, q_chunk, 2, 1, 0, 3)); - ggml_tensor * q_chunk_j_bc = ggml_repeat_4d(ctx0, q_chunk_j, chunk_size, chunk_size, S_k, HB); - ggml_tensor * kq = ggml_mul(ctx0, decay_mask_chunk, q_chunk_j_bc); - kq = ggml_mul(ctx0, kq, k_chunk_i_bc); + ggml_tensor * gk_ref_chunk = ggml_view_4d(ctx0, gk_cs_chunk, + S_k, 1, 1, HB, + gk_cs_chunk->nb[1], gk_cs_chunk->nb[2], gk_cs_chunk->nb[3], + 0); + // Compute gk_diff + ggml_tensor * gk_diff_chunk_j = ggml_sub(ctx0, gk_cs_chunk, ggml_repeat(ctx0, gk_ref_chunk, gk_cs_chunk)); + ggml_tensor * gk_diff_chunk_i = ggml_clamp(ctx0, ggml_neg(ctx0, gk_diff_chunk_j), 0.0f, 88.0f); + cb(gk_diff_chunk_j, "gk_diff_chunk_j", il); + cb(gk_diff_chunk_i, "gk_diff_chunk_i", il); - ggml_tensor * Aqk = ggml_mul(ctx0, kq, decay_mask_chunk); - Aqk = ggml_mul(ctx0, Aqk, ggml_add(ctx0, identity, causal_mask)); - Aqk = ggml_cont(ctx0, ggml_permute(ctx0, Aqk, 1, 2, 0, 3)); - Aqk = ggml_sum_rows(ctx0, Aqk); + // Decay q and k + ggml_tensor * q_exp_chunk = ggml_mul(ctx0, q_chunk, ggml_exp(ctx0, gk_diff_chunk_j)); + ggml_tensor * k_exp_chunk = ggml_mul(ctx0, k_chunk, ggml_exp(ctx0, gk_diff_chunk_i)); + + ggml_tensor * Aqk = ggml_mul_mat(ctx0, k_exp_chunk, q_exp_chunk); + cb(Aqk, "Aqk", il); + Aqk = ggml_mul(ctx0, Aqk, diag_mask); Aqk = ggml_scale(ctx0, Aqk, scale); // scale q - Aqk = ggml_reshape_4d(ctx0, Aqk, chunk_size, chunk_size, 1, HB); + cb(Aqk, "Aqk_masked", il); ggml_tensor * state_t = ggml_cont_4d(ctx0, ggml_permute(ctx0, new_state, 1, 0, 2, 3), S_v, S_v, 1, H_v * n_seqs); From 6150bb7b17fe15a7b8cac2c26f3da9e38dc72b5d Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Fri, 9 Jan 2026 20:11:45 +0800 Subject: [PATCH 29/58] no clamp version --- src/models/kimi-linear.cpp | 77 +++++++++++++++++++++----------------- 1 file changed, 43 insertions(+), 34 deletions(-) diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index b229d31165..93a4983c66 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -574,11 +574,6 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( cb(gk, "gk", il); ggml_tensor * gk_cumsum = ggml_cumsum(ctx0, gk); cb(gk_cumsum, "gk_cumsum", il); - // switch back for downstream - gk_cumsum = ggml_cont_4d(ctx0, ggml_permute(ctx0, gk_cumsum, 1, 0, 2, 3), S_k, chunk_size, n_chunks, HB); - ggml_tensor * gkexp = ggml_exp(ctx0, gk_cumsum); - - cb(gk_cumsum, "gk_cumsum", il); /* for i in range(BT): @@ -586,23 +581,31 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( g_i = g[..., i:i+1, :] # g_i [B,H,NT,1,S] A[..., i] = torch.einsum('... c d, ... d -> ... 
c', k * (g - g_i).exp(), k_i) */ - // gk_ref: [S, 1, C, HB] - first token of i_block - ggml_tensor * gk_ref = ggml_view_4d(ctx0, gk_cumsum, - S_k, 1, n_chunks, HB, - gk_cumsum->nb[1], gk_cumsum->nb[2], gk_cumsum->nb[3], - 0); - cb(gk_ref, "gk_ref", il); + const int64_t CHB = n_chunks * H_k * n_seqs; + ggml_tensor * gkcs_i = ggml_reshape_4d(ctx0, gk_cumsum, chunk_size, 1, S_k, CHB); // [chunk_size, 1, S_k, CHB] + ggml_tensor * gkcs_j = ggml_reshape_4d(ctx0, gkcs_i, 1, chunk_size, S_k, CHB); // [1, chunk_size, S_k, CHB] - // Compute gk_diff - ggml_tensor * gk_diff_j = ggml_sub(ctx0, gk_cumsum, ggml_repeat(ctx0, gk_ref, gk_cumsum)); - ggml_tensor * gk_diff_i = ggml_clamp(ctx0, ggml_neg(ctx0, gk_diff_j), 0.0f, 88.0f); - cb(gk_diff_j, "gk_diff_j", il); - cb(gk_diff_i, "gk_diff_i", il); + ggml_tensor * gkcs_j_bc = ggml_repeat_4d(ctx0, gkcs_j, chunk_size, chunk_size, S_k, CHB); // [1, chunk_size, S_k, CHB] -> [chunk_size, chunk_size, S_k, CHB] + // decay_mask [chunk_size,chunk_size,S_k,CHB] + ggml_tensor * decay_mask = ggml_sub(ctx0, gkcs_j_bc, gkcs_i); + cb(decay_mask, "decay_mask", il); - // Decay k - ggml_tensor * k_exp_j = ggml_mul(ctx0, k, ggml_exp(ctx0, gk_diff_j)); - ggml_tensor * k_exp_i = ggml_mul(ctx0, k, ggml_exp(ctx0, gk_diff_i)); - ggml_tensor * Akk = ggml_mul_mat(ctx0, k_exp_i, k_exp_j); + decay_mask = ggml_mul(ctx0, decay_mask, diag_mask); + cb(decay_mask, "decay_masked", il); + decay_mask = ggml_exp(ctx0, decay_mask); + decay_mask = ggml_mul(ctx0, decay_mask, diag_mask); + + // decay_mask [S_k,BT_j,BT_i,CHB] *Note* second and third chunk_sizes are switched + decay_mask = ggml_cont_4d(ctx0, ggml_permute(ctx0, decay_mask, 2, 1, 0, 3), S_k, chunk_size, chunk_size, CHB); + + ggml_tensor * k_i = ggml_cont(ctx0, ggml_reshape_4d(ctx0, k, S_k, chunk_size, 1, CHB)); + ggml_tensor * k_j = ggml_cont(ctx0, ggml_reshape_4d(ctx0, k, S_k, 1, chunk_size, CHB)); + + ggml_tensor * decay_k_i = ggml_mul(ctx0, decay_mask, k_i); + + // decay_k_i [S.BT,BT,CHB] @ k_j [S,1,BT,CHB] = Akk [BT,1,BT,CHB] + ggml_tensor * Akk = ggml_mul_mat(ctx0, k_j, decay_k_i); + Akk = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_4d(ctx0, Akk, chunk_size, chunk_size, n_chunks, HB))); cb(Akk, "Akk", il); Akk = ggml_mul(ctx0, Akk, beta); @@ -626,6 +629,11 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( cb(Akk, "attn_solved", il); + // switch back for downstream + gk_cumsum = ggml_cont_4d(ctx0, ggml_permute(ctx0, gk_cumsum, 1, 0, 2, 3), S_k, chunk_size, n_chunks, HB); + ggml_tensor * gkexp = ggml_exp(ctx0, gk_cumsum); + cb(gk_cumsum, "gk_cumsum", il); + // u = (A*beta[..., None, :]) @ v aka U_[t] ggml_tensor * vb = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_beta)), Akk); @@ -640,12 +648,19 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( cb(new_state, "new_state", il); + // switch for chunkify_mask + decay_mask = ggml_cont(ctx0, ggml_reshape_4d(ctx0, decay_mask, S_k, chunk_size * chunk_size, n_chunks, HB)); for (int64_t chunk = 0; chunk < n_chunks; chunk++) { // extract one chunk worth of data auto chunkify = [=](ggml_tensor * t) { return ggml_cont(ctx0, ggml_view_4d(ctx0, t, t->ne[0], chunk_size, 1, t->ne[3], t->nb[1], t->nb[2], t->nb[3], t->nb[2] * chunk)); }; + auto chunkify_mask = [=](ggml_tensor * t) { + return ggml_cont(ctx0, ggml_view_4d(ctx0, t, t->ne[0], chunk_size*chunk_size, 1, t->ne[3], + t->nb[1], t->nb[2], t->nb[3], t->nb[2] * chunk)); + }; + // k [S,BT,NT,H*B] => k_chunk [S,BT,1,H*B] ggml_tensor * k_chunk = chunkify(k); @@ -656,6 +671,8 @@ ggml_tensor * 
llm_build_kimi_linear::build_kda_chunking( ggml_tensor * gk_cs_chunk = chunkify(gk_cumsum); ggml_tensor * k_cumdecay_chunk = chunkify(k_cumdecay); ggml_tensor * gkexp_chunk = ggml_exp(ctx0, gk_cs_chunk); + ggml_tensor * decay_mask_chunk = chunkify_mask(decay_mask); + decay_mask_chunk = ggml_cont(ctx0, ggml_reshape_4d(ctx0, decay_mask_chunk, S_k, chunk_size, chunk_size, HB)); /* https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/kda/naive.py @@ -664,22 +681,14 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( g_j = g[:, :, i, j:j+1, :] A[..., j] = torch.einsum('... c d, ... d -> ... c', q_i * (g_i - g_j).exp(), k_j) */ - ggml_tensor * gk_ref_chunk = ggml_view_4d(ctx0, gk_cs_chunk, - S_k, 1, 1, HB, - gk_cs_chunk->nb[1], gk_cs_chunk->nb[2], gk_cs_chunk->nb[3], - 0); - // Compute gk_diff - ggml_tensor * gk_diff_chunk_j = ggml_sub(ctx0, gk_cs_chunk, ggml_repeat(ctx0, gk_ref_chunk, gk_cs_chunk)); - ggml_tensor * gk_diff_chunk_i = ggml_clamp(ctx0, ggml_neg(ctx0, gk_diff_chunk_j), 0.0f, 88.0f); - cb(gk_diff_chunk_j, "gk_diff_chunk_j", il); - cb(gk_diff_chunk_i, "gk_diff_chunk_i", il); + ggml_tensor * k_j_chunk = ggml_cont(ctx0, ggml_reshape_4d(ctx0, k_chunk, S_k, 1, chunk_size, HB)); + ggml_tensor * q_i_chunk = ggml_cont(ctx0, ggml_reshape_4d(ctx0, q_chunk, S_k, chunk_size, 1, HB)); + ggml_tensor * decay_q_i_chunk = ggml_mul(ctx0, decay_mask_chunk, q_i_chunk); - // Decay q and k - ggml_tensor * q_exp_chunk = ggml_mul(ctx0, q_chunk, ggml_exp(ctx0, gk_diff_chunk_j)); - ggml_tensor * k_exp_chunk = ggml_mul(ctx0, k_chunk, ggml_exp(ctx0, gk_diff_chunk_i)); - - ggml_tensor * Aqk = ggml_mul_mat(ctx0, k_exp_chunk, q_exp_chunk); + ggml_tensor * Aqk = ggml_mul_mat(ctx0, decay_q_i_chunk, k_j_chunk); + Aqk = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_4d(ctx0, Aqk, chunk_size, chunk_size, 1, HB))); cb(Aqk, "Aqk", il); + Aqk = ggml_mul(ctx0, Aqk, diag_mask); Aqk = ggml_scale(ctx0, Aqk, scale); // scale q cb(Aqk, "Aqk_masked", il); From d26fe501786ed8e06d2b63f2d59f5daceaecfe87 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Sat, 10 Jan 2026 08:45:57 +0800 Subject: [PATCH 30/58] Moved Aqk computation out of the loop --- src/models/kimi-linear.cpp | 49 ++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 28 deletions(-) diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index 93a4983c66..c55116bc69 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -576,10 +576,17 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( cb(gk_cumsum, "gk_cumsum", il); /* + Compute Akk and Aqk loop together + Akk loop: for i in range(BT): k_i = k[..., i, :] # k_i [B,H,NT,S] g_i = g[..., i:i+1, :] # g_i [B,H,NT,1,S] A[..., i] = torch.einsum('... c d, ... d -> ... c', k * (g - g_i).exp(), k_i) + Aqk loop: + for j in range(BT): + k_j = k[:, :, i, j] + g_j = g[:, :, i, j:j+1, :] + A[..., j] = torch.einsum('... c d, ... d -> ... 
c', q_i * (g_i - g_j).exp(), k_j) */ const int64_t CHB = n_chunks * H_k * n_seqs; ggml_tensor * gkcs_i = ggml_reshape_4d(ctx0, gk_cumsum, chunk_size, 1, S_k, CHB); // [chunk_size, 1, S_k, CHB] @@ -600,19 +607,27 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( ggml_tensor * k_i = ggml_cont(ctx0, ggml_reshape_4d(ctx0, k, S_k, chunk_size, 1, CHB)); ggml_tensor * k_j = ggml_cont(ctx0, ggml_reshape_4d(ctx0, k, S_k, 1, chunk_size, CHB)); + ggml_tensor * q_i = ggml_cont(ctx0, ggml_reshape_4d(ctx0, q, S_k, chunk_size, 1, CHB)); ggml_tensor * decay_k_i = ggml_mul(ctx0, decay_mask, k_i); + ggml_tensor * decay_q_i = ggml_mul(ctx0, decay_mask, q_i); // decay_k_i [S.BT,BT,CHB] @ k_j [S,1,BT,CHB] = Akk [BT,1,BT,CHB] - ggml_tensor * Akk = ggml_mul_mat(ctx0, k_j, decay_k_i); + ggml_tensor * Akk = ggml_mul_mat(ctx0, decay_k_i, k_j); + ggml_tensor * Aqk = ggml_mul_mat(ctx0, decay_q_i, k_j); Akk = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_4d(ctx0, Akk, chunk_size, chunk_size, n_chunks, HB))); + Aqk = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_4d(ctx0, Aqk, chunk_size, chunk_size, n_chunks, HB))); cb(Akk, "Akk", il); + cb(Aqk, "Aqk", il); Akk = ggml_mul(ctx0, Akk, beta); Akk = ggml_neg(ctx0, ggml_mul(ctx0, Akk, causal_mask)); - cb(Akk, "attn_pre_solve", il); + Aqk = ggml_mul(ctx0, Aqk, diag_mask); + Aqk = ggml_scale(ctx0, Aqk, scale); // scale q + cb(Aqk, "Aqk_masked", il); + // for i in range(1, chunk_size): // row = attn[..., i, :i].clone() // sub = attn[..., :i, :i].clone() @@ -648,16 +663,14 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( cb(new_state, "new_state", il); - // switch for chunkify_mask - decay_mask = ggml_cont(ctx0, ggml_reshape_4d(ctx0, decay_mask, S_k, chunk_size * chunk_size, n_chunks, HB)); for (int64_t chunk = 0; chunk < n_chunks; chunk++) { // extract one chunk worth of data auto chunkify = [=](ggml_tensor * t) { return ggml_cont(ctx0, ggml_view_4d(ctx0, t, t->ne[0], chunk_size, 1, t->ne[3], t->nb[1], t->nb[2], t->nb[3], t->nb[2] * chunk)); }; - auto chunkify_mask = [=](ggml_tensor * t) { - return ggml_cont(ctx0, ggml_view_4d(ctx0, t, t->ne[0], chunk_size*chunk_size, 1, t->ne[3], + auto chunkify_A = [=](ggml_tensor * t) { + return ggml_cont(ctx0, ggml_view_4d(ctx0, t, chunk_size, chunk_size, 1, t->ne[3], t->nb[1], t->nb[2], t->nb[3], t->nb[2] * chunk)); }; @@ -671,27 +684,7 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( ggml_tensor * gk_cs_chunk = chunkify(gk_cumsum); ggml_tensor * k_cumdecay_chunk = chunkify(k_cumdecay); ggml_tensor * gkexp_chunk = ggml_exp(ctx0, gk_cs_chunk); - ggml_tensor * decay_mask_chunk = chunkify_mask(decay_mask); - decay_mask_chunk = ggml_cont(ctx0, ggml_reshape_4d(ctx0, decay_mask_chunk, S_k, chunk_size, chunk_size, HB)); -/* - https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/kda/naive.py - - for j in range(BT): - k_j = k[:, :, i, j] - g_j = g[:, :, i, j:j+1, :] - A[..., j] = torch.einsum('... c d, ... d -> ... 
c', q_i * (g_i - g_j).exp(), k_j) -*/ - ggml_tensor * k_j_chunk = ggml_cont(ctx0, ggml_reshape_4d(ctx0, k_chunk, S_k, 1, chunk_size, HB)); - ggml_tensor * q_i_chunk = ggml_cont(ctx0, ggml_reshape_4d(ctx0, q_chunk, S_k, chunk_size, 1, HB)); - ggml_tensor * decay_q_i_chunk = ggml_mul(ctx0, decay_mask_chunk, q_i_chunk); - - ggml_tensor * Aqk = ggml_mul_mat(ctx0, decay_q_i_chunk, k_j_chunk); - Aqk = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_4d(ctx0, Aqk, chunk_size, chunk_size, 1, HB))); - cb(Aqk, "Aqk", il); - - Aqk = ggml_mul(ctx0, Aqk, diag_mask); - Aqk = ggml_scale(ctx0, Aqk, scale); // scale q - cb(Aqk, "Aqk_masked", il); + ggml_tensor * Aqk_chunk = chunkify_A(Aqk); ggml_tensor * state_t = ggml_cont_4d(ctx0, ggml_permute(ctx0, new_state, 1, 0, 2, 3), S_v, S_v, 1, H_v * n_seqs); @@ -712,7 +705,7 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( // v_new_t [S,BT,1,H*B] Aqk [BT,BT,1,H*B] // core_attn_out[:, :, i] = attn_inter + attn @ v_new or A' @ (U_[t] - W_[t]*S_[t]) - ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, Aqk); + ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, Aqk_chunk); // o[:, :, i] = (q_i * g_i.exp()) @ S + A @ v_i ggml_tensor * core_attn_out_chunk = ggml_add(ctx0, attn_inter, v_attn); From dce064c0a3882b20d1b2a78c01cc181afa461e67 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Sat, 10 Jan 2026 22:08:38 +0800 Subject: [PATCH 31/58] fixed typo and split wkv_b into wk_b and wv_b --- convert_hf_to_gguf.py | 25 +++++++++++++++++++++++-- gguf-py/gguf/constants.py | 2 ++ gguf-py/gguf/tensor_mapping.py | 4 ++-- 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 59ee156dd9..321930d7e6 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5275,7 +5275,8 @@ class KimiLinearModel(TextModel): # Kimi specific bias if name.endswith("e_score_correction_bias"): - name = name.replace("e_score_correction_bias", "e_score_correction.bias") + new_name = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_EXP_PROBS_B, bid) + return [(new_name, data_torch)] # process the experts separately if name.find("block_sparse_moe.experts") != -1: @@ -5305,7 +5306,27 @@ class KimiLinearModel(TextModel): tensors.append((new_name, data_torch)) return tensors return [] - + + # note: MLA with the absorption optimization, needs these two split and k_b_proj transposed + if name.endswith("kv_b_proj.weight"): + name_kb = name.replace("kv_b_proj", "k_b_proj") + name_vb = name.replace("kv_b_proj", "v_b_proj") + + n_head_kv = self.hparams["num_key_value_heads"] + v_head_dim = self.hparams["v_head_dim"] + qk_nope_head_dim = self.hparams["qk_nope_head_dim"] + + assert data_torch.shape[0] == n_head_kv * (v_head_dim + qk_nope_head_dim) + + kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1]) + k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1) + k_b = k_b.transpose(1, 2) + + return [ + (self.map_tensor_name(name_kb), k_b), + (self.map_tensor_name(name_vb), v_b) + ] + mapped_name = self.map_tensor_name(name) logger.info(f"Returning {mapped_name}: shape after = {tuple(data_torch.shape)}") return [(mapped_name, data_torch)] diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 372489ca44..8d2b54d7d5 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -3317,6 +3317,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.ATTN_Q_B, MODEL_TENSOR.ATTN_KV_A_MQA, MODEL_TENSOR.ATTN_KV_B, + MODEL_TENSOR.ATTN_K_B, + 
MODEL_TENSOR.ATTN_V_B, MODEL_TENSOR.ATTN_Q_A_NORM, MODEL_TENSOR.ATTN_KV_A_NORM, MODEL_TENSOR.FFN_NORM, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index b0d4fb1cb1..486f6a5b1d 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -403,7 +403,7 @@ class TensorNameMap: "model.layers.{bid}.feed_forward.expert_bias", # lfm2moe "model.layers.{bid}.block_sparse_moe.e_score_correction", # minimax-m2 "backbone.layers.{bid}.mixer.gate.e_score_correction" # nemotron-h-moe - "model.layers.{bid}.block_sparse_moe.gate.e_score_correction", # kimi + "model.layers.{bid}.block_sparse_moe.gate.e_score_correction_bias", # kimi ), # Feed-forward up @@ -812,7 +812,7 @@ class TensorNameMap: ), MODEL_TENSOR.SSM_DT_B: ( "model.layers.{bid}.self_attn.dt_bias", - + ), MODEL_TENSOR.TIME_MIX_W0: ( "model.layers.{bid}.attention.w0", # rwkv7 ), From b9360c7fe194e8190e1ee8b9da258699d7666e17 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Sun, 11 Jan 2026 15:58:46 +0800 Subject: [PATCH 32/58] MLA KV cache support --- convert_hf_to_gguf.py | 14 +++-- src/llama-arch.cpp | 2 + src/llama-model.cpp | 8 ++- src/models/kimi-linear.cpp | 112 ++++++++++++++++++++++++++----------- 4 files changed, 95 insertions(+), 41 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 321930d7e6..3f402a9acb 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5118,6 +5118,9 @@ class KimiLinearModel(TextModel): raise NotImplementedError(f"Deepseek pre-tokenizer {tokpre!r} is not supported yet!") def set_gguf_parameters(self): + # note: To enable MLA KV cache, attention needs to be converted into MQA (ie: GQA with 1 group) + self.hparams["num_key_value_heads"] = 1 + super().set_gguf_parameters() self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) @@ -5141,7 +5144,7 @@ class KimiLinearModel(TextModel): _full_attn_layers = linear_attn_config["full_attn_layers"] for il in range(self.hparams["num_hidden_layers"]): if il+1 in _full_attn_layers: - _num_kv_heads.append(linear_attn_config["num_heads"]) + _num_kv_heads.append(self.hparams["num_key_value_heads"]) else: _num_kv_heads.append(0) assert(len(_num_kv_heads) == self.hparams["num_hidden_layers"]) @@ -5156,8 +5159,6 @@ class KimiLinearModel(TextModel): if kda_head_dim is not None: self.gguf_writer.add_kda_head_dim(kda_head_dim) - # MLA params - use add_* methods that handle arch substitution - # MLA params - use add_* methods that handle arch substitution # Support both HuggingFace naming (q_lora_rank, kv_lora_rank) and internal naming (n_lora_q, n_lora_kv) q_lora_rank = self.hparams.get("q_lora_rank", self.hparams.get("n_lora_q")) @@ -5172,9 +5173,11 @@ class KimiLinearModel(TextModel): # Support HuggingFace naming: qk_nope_head_dim, qk_rope_head_dim, v_head_dim qk_nope_head_dim = self.hparams.get("qk_nope_head_dim") qk_rope_head_dim = self.hparams.get("qk_rope_head_dim") - self.gguf_writer.add_key_length(qk_nope_head_dim + qk_rope_head_dim) v_head_dim = self.hparams.get("v_head_dim") - self.gguf_writer.add_value_length(v_head_dim) + # To enable MLA KV cache, MLA needs to be converted into MQA with larger heads, then decompresses to MHA + self.gguf_writer.add_key_length(self.hparams["kv_lora_rank"] + self.hparams["qk_rope_head_dim"]) + self.gguf_writer.add_value_length(self.hparams["kv_lora_rank"]) + # Calculate n_embd_head_k_mla = qk_nope_head_dim + qk_rope_head_dim if "n_embd_head_k_mla" in self.hparams: @@ 
-5315,6 +5318,7 @@ class KimiLinearModel(TextModel): n_head_kv = self.hparams["num_key_value_heads"] v_head_dim = self.hparams["v_head_dim"] qk_nope_head_dim = self.hparams["qk_nope_head_dim"] + logger.info("Split kv_b n_head_kv %d\n" % n_head_kv) assert data_torch.shape[0] == n_head_kv * (v_head_dim + qk_nope_head_dim) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 076509ed8e..6baf3bd4da 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -2312,6 +2312,8 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_ATTN_Q_A_NORM, LLM_TENSOR_ATTN_KV_A_MQA, LLM_TENSOR_ATTN_KV_B, + LLM_TENSOR_ATTN_K_B, + LLM_TENSOR_ATTN_V_B, LLM_TENSOR_ATTN_KV_A_NORM, }; default: diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 59e8d49f08..712c341fd5 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -6771,8 +6771,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // Note: hparams.n_rot may be 72 (from conversion) but actual is 64 const int64_t qk_rope_head_dim = hparams.n_rot; // From config: qk_rope_head_dim layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + qk_rope_head_dim}, 0); - layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_k_mla - qk_rope_head_dim + n_embd_head_v_mla)}, 0); - + // Support Legacy GGUFs that don't split wkv_b (MLA KV cache disabled) + layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_k_mla - qk_rope_head_dim + n_embd_head_v_mla)}, TENSOR_NOT_REQUIRED); + if (!layer.wkv_b) { // MLA KV cache enabled + layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_k_mla - qk_rope_head_dim, kv_lora_rank, n_head}, 0); + layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, 0); + } layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, 0); } diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index c55116bc69..9d83ca8fa5 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -321,9 +321,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // vLLM Kimi: q = q_proj(hidden_states), then view as [n_tokens, n_head, qk_head_dim] // Note: Kimi MLA does NOT use RoPE (rotary_emb=None in vLLM) ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.wq, cur); - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head_k_mla, n_head, n_tokens); - cb(Qcur, "mla_Q", il); - + // Step 2: KV compression // kv_cmpr_pe = kv_a_proj_with_mqa(hidden_states) -> [kv_lora_rank + qk_rope_head_dim, n_tokens] ggml_tensor * kv_cmpr_pe = ggml_mul_mat(ctx0, layer.wkv_a_mqa, cur); @@ -341,37 +339,83 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // Normalize kv_c kv_cmpr = build_norm(kv_cmpr, layer.attn_kv_a_norm, nullptr, LLM_NORM_RMS, il); - - // KV decompression: kv = kv_b_proj(kv_c_normed) - ggml_tensor * kv = ggml_mul_mat(ctx0, layer.wkv_b, kv_cmpr); - const int64_t kv_per_head = n_embd_head_qk_nope + n_embd_head_v_mla; - - // Split kv into k_nope and v - ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens, - ggml_row_size(kv->type, kv_per_head), - ggml_row_size(kv->type, kv_per_head * n_head), 0); - ggml_tensor * Vcur = ggml_view_3d(ctx0, kv, n_embd_head_v_mla, n_head, n_tokens, - ggml_row_size(kv->type, kv_per_head), - ggml_row_size(kv->type, kv_per_head * n_head), - 
ggml_row_size(kv->type, n_embd_head_qk_nope)); - k_nope = ggml_cont(ctx0, k_nope); - Vcur = ggml_cont(ctx0, Vcur); - cb(Vcur, "mla_V", il); - - // Concatenate k_nope + k_pe (broadcast k_pe to all heads) - // K = [k_nope, k_pe] where k_nope is [qk_nope_head_dim, n_head, n_tokens] - // and k_pe is [qk_rope_head_dim, 1, n_tokens] broadcast to all heads - // Need to broadcast k_pe from [qk_rope, 1, n_tokens] to [qk_rope, n_head, n_tokens] - ggml_tensor * k_pe_target = ggml_new_tensor_3d(ctx0, k_pe->type, n_embd_head_qk_rope, n_head, n_tokens); - ggml_tensor * k_pe_repeated = ggml_repeat(ctx0, k_pe, k_pe_target); - ggml_tensor * Kcur = ggml_concat(ctx0, k_nope, k_pe_repeated, 0); - cb(Kcur, "mla_K", il); - - // Direct softmax attention (with KV cache) - // Use build_attn with inp_attn for proper mask handling - cur = build_attn(inp_attn, layer.wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale_mla, il); - cb(cur, "mla_out", il); - + + if (layer.wk_b && layer.wv_b) { // MLA KV cache enabled + // extract q_nope + ggml_tensor * q_nope = + ggml_view_3d(ctx0, Qcur, n_embd_head_qk_nope, n_head, n_tokens, ggml_row_size(Qcur->type, n_embd_head_k_mla), + ggml_row_size(Qcur->type, n_embd_head_k_mla) * n_head, 0); + cb(q_nope, "q_nope", il); + + // and {n_embd_head_qk_rope, n_head, n_tokens} + ggml_tensor * q_pe = ggml_view_3d( + ctx0, Qcur, n_embd_head_qk_rope, n_head, n_tokens, ggml_row_size(Qcur->type, n_embd_head_k_mla), + ggml_row_size(Qcur->type, n_embd_head_k_mla) * n_head, ggml_row_size(Qcur->type, n_embd_head_qk_nope)); + cb(q_pe, "q_pe", il); + + // {n_embd_head_qk_nope, n_tokens, n_head} + q_nope = ggml_permute(ctx0, q_nope, 0, 2, 1, 3); + cb(q_nope, "q_nope_perm", il); + + // {n_embd_head_qk_nope, kv_lora_rank, n_head} x {n_embd_head_qk_nope, n_tokens, n_head} + ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, layer.wk_b, q_nope); + cb(q_nope_absorbed, "q_nope_absorbed", il); + + // {kv_lora_rank, n_head, n_tokens} + q_nope_absorbed = ggml_permute(ctx0, q_nope_absorbed, 0, 2, 1, 3); + cb(q_nope_absorbed, "q_nope_absorbed_perm", il); + + // {n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens} + // note: rope must go first for in-place context shifting in build_rope_shift() + Qcur = ggml_concat(ctx0, q_pe, q_nope_absorbed, 0); + cb(Qcur, "Qcur", il); + + kv_cmpr = ggml_reshape_3d(ctx0, kv_cmpr, kv_lora_rank, 1, n_tokens); + cb(kv_cmpr, "kv_cmpr_reshape", il); + + // {n_embd_head_qk_rope + kv_lora_rank, 1, n_tokens} + ggml_tensor * Kcur = ggml_concat(ctx0, k_pe, kv_cmpr, 0); + cb(Kcur, "Kcur", il); + + // {kv_lora_rank, 1, n_tokens} + ggml_tensor * Vcur = kv_cmpr; + cb(Vcur, "Vcur", il); + + cur = build_attn(inp_attn, layer.wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, layer.wv_b, kq_scale_mla, il); + cb(cur, "mla_out", il); + } else { // MLA KV cache disabled. Fall back to MHA KV cache. 
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head_k_mla, n_head, n_tokens); + cb(Qcur, "mla_Q", il); + // KV decompression: kv = kv_b_proj(kv_c_normed) + ggml_tensor * kv = ggml_mul_mat(ctx0, layer.wkv_b, kv_cmpr); + const int64_t kv_per_head = n_embd_head_qk_nope + n_embd_head_v_mla; + + // Split kv into k_nope and v + ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens, + ggml_row_size(kv->type, kv_per_head), + ggml_row_size(kv->type, kv_per_head * n_head), 0); + ggml_tensor * Vcur = ggml_view_3d(ctx0, kv, n_embd_head_v_mla, n_head, n_tokens, + ggml_row_size(kv->type, kv_per_head), + ggml_row_size(kv->type, kv_per_head * n_head), + ggml_row_size(kv->type, n_embd_head_qk_nope)); + k_nope = ggml_cont(ctx0, k_nope); + Vcur = ggml_cont(ctx0, Vcur); + cb(Vcur, "mla_V", il); + + // Concatenate k_nope + k_pe (broadcast k_pe to all heads) + // K = [k_nope, k_pe] where k_nope is [qk_nope_head_dim, n_head, n_tokens] + // and k_pe is [qk_rope_head_dim, 1, n_tokens] broadcast to all heads + // Need to broadcast k_pe from [qk_rope, 1, n_tokens] to [qk_rope, n_head, n_tokens] + ggml_tensor * k_pe_target = ggml_new_tensor_3d(ctx0, k_pe->type, n_embd_head_qk_rope, n_head, n_tokens); + ggml_tensor * k_pe_repeated = ggml_repeat(ctx0, k_pe, k_pe_target); + ggml_tensor * Kcur = ggml_concat(ctx0, k_nope, k_pe_repeated, 0); + cb(Kcur, "mla_K", il); + + // Direct softmax attention (with MHA KV cache) + // Use build_attn with inp_attn for proper mask handling + cur = build_attn(inp_attn, layer.wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale_mla, il); + cb(cur, "mla_out", il); + } } else { // Unknown layer type - this should not happen GGML_ABORT("Kimi layer is neither KDA nor MLA - missing required tensors"); From 6ae66fc40dcbd9562ef71ebe5cd3a7bc9686e385 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Sun, 11 Jan 2026 21:31:35 +0800 Subject: [PATCH 33/58] fix trailing spaces --- gguf-py/gguf/tensor_mapping.py | 4 +- src/llama-vocab.cpp | 4 +- src/models/kimi-linear.cpp | 82 ++++++++++++++++------------------ 3 files changed, 42 insertions(+), 48 deletions(-) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 88e2caf541..c4957a7b20 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -819,13 +819,13 @@ class TensorNameMap: # Kimi Linear KDA (using SSM_ prefix for consistency) MODEL_TENSOR.SSM_CONV1D_Q: ( "model.layers.{bid}.self_attn.q_conv1d", - ), + ), MODEL_TENSOR.SSM_CONV1D_K: ( "model.layers.{bid}.self_attn.k_conv1d", ), MODEL_TENSOR.SSM_CONV1D_V: ( "model.layers.{bid}.self_attn.v_conv1d", - ), + ), MODEL_TENSOR.SSM_F_A: ( "model.layers.{bid}.self_attn.f_a_proj", ), diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index eaa574f3b8..f7a264dc60 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -1747,7 +1747,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str()); // Kimi-K2 uses custom tokenization without traditional BPE merges const bool is_kimi_k2 = (tokenizer_pre == "kimi-k2"); - + if (merges_keyidx == -1) { if (!is_kimi_k2) { throw std::runtime_error("cannot find tokenizer merges in model file\n"); @@ -1768,7 +1768,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { if (pos != std::string::npos) { first = word.substr(0, pos); second = word.substr(pos + 1); - } + } bpe_ranks.emplace(std::make_pair(first, second), i); } diff --git 
a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index 9d83ca8fa5..e873024c90 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -12,7 +12,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // Note: Kimi MLA does NOT use RoPE (rotary_emb=None in vLLM) // So we don't need inp_pos - + auto * inp = build_inp_mem_hybrid(); auto * inp_rs = inp->get_recr(); auto * inp_attn = inp->get_attn(); @@ -38,12 +38,12 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll const int64_t d_inner = n_head * head_dim; // 32 * 128 = 4096 const int64_t n_seqs = ubatch.n_seqs; const int64_t n_seq_tokens = ubatch.n_seq_tokens; - + // Verify batch consistency for recurrent layers GGML_ASSERT(n_seqs != 0); GGML_ASSERT(ubatch.equal_seqs()); GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); - + // MLA params const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla; const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla; @@ -67,14 +67,13 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // KDA layers have ssm_a_log tensor, MLA layers have wkv_a_mqa tensor bool is_kda = (layer.ssm_a_log != nullptr); bool is_mla = (layer.wkv_a_mqa != nullptr); - + if (is_kda) { // === KDA Layer (Kimi Delta Attention) with Recurrent State === // Reference: vLLM kda.py - const auto * mctx_cur = inp_rs->mctx; const auto kv_head = mctx_cur->get_head(); - + // Get conv states from r_l tensor (Q, K, V each have separate state) ggml_tensor * conv_states_all = mctx_cur->get_r_l(il); cb(conv_states_all, "conv_states_all", il); @@ -85,7 +84,6 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // Each conv state is [(d_conv-1) * d_inner] per sequence, need to reshape to [d_conv-1, d_inner, n_seqs] // Memory layout: for each seq, Q state is first conv_state_size elements, then K, then V // conv_state_all has stride: nb[0] = element_size, nb[1] = n_embd_r_total * element_size - // View Q conv state: offset 0, size conv_state_size per seq // conv_state_all is [n_embd_r_total, n_seqs] with memory layout: // state[i + seq * n_embd_r_total] where i = conv_step + channel * (d_conv-1) + {0, conv_state_size, 2*conv_state_size} for Q/K/V @@ -104,7 +102,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll (d_conv - 1) * ggml_element_size(conv_state_all), n_embd_r_total * ggml_element_size(conv_state_all), 2 * conv_state_size * ggml_element_size(conv_state_all)); // offset for V - + // Step 1: Q, K, V projections -> [d_inner, n_tokens] ggml_tensor * q_proj = ggml_mul_mat(ctx0, layer.wq, cur); ggml_tensor * k_proj = ggml_mul_mat(ctx0, layer.wk, cur); @@ -112,14 +110,14 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll cb(q_proj, "kda_q_proj", il); cb(k_proj, "kda_k_proj", il); cb(v_proj, "kda_v_proj", il); - + // Step 2: Causal Conv1d for Q // Reshape input: {d_inner, n_tokens} -> {d_inner, n_seq_tokens, n_seqs} ggml_tensor * q_3d = ggml_reshape_3d(ctx0, q_proj, d_inner, n_seq_tokens, n_seqs); - + // Concat Q conv state and current input: {d_conv-1 + n_seq_tokens, d_inner, n_seqs} ggml_tensor * conv_q = ggml_concat(ctx0, conv_state_q, ggml_transpose(ctx0, q_3d), 0); - + // Save last (d_conv-1) columns back to Q conv state ggml_tensor * last_conv_q = ggml_view_3d(ctx0, conv_q, d_conv - 1, d_inner, n_seqs, conv_q->nb[1], conv_q->nb[2], n_seq_tokens * conv_q->nb[0]); @@ -127,7 +125,6 @@ 
llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll ggml_cpy(ctx0, last_conv_q, ggml_view_1d(ctx0, conv_states_all, conv_state_size * n_seqs, kv_head * n_embd_r_total * ggml_element_size(conv_states_all)))); - // Reshape conv weight: GGUF [d_conv, 1, d_inner, 1] -> ggml_ssm_conv expects [d_conv, d_inner] // GGUF stores as [d_conv, 1, d_inner, 1] with memory layout w[conv_step + channel * d_conv] // vLLM stores as [d_inner, d_conv] with memory layout w[channel * d_conv + conv_step] @@ -143,13 +140,13 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll } conv_weight = ggml_reshape_2d(ctx0, q_conv_f32, d_conv, d_inner); } - + // Apply conv1d ggml_tensor * Qcur; if (conv_weight) { // Make conv_q contiguous for ggml_ssm_conv conv_q = ggml_cont(ctx0, conv_q); - + // ggml_ssm_conv output: {d_inner, n_seq_tokens, n_seqs} Qcur = ggml_ssm_conv(ctx0, conv_q, conv_weight); cb(Qcur, "Q conv1d", il); @@ -163,13 +160,13 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll } else { GGML_ABORT("KDA layer missing Q conv weight"); } - + // K conv1d (with separate K conv state) ggml_tensor * Kcur; if (layer.ssm_k_conv) { ggml_tensor * k_3d = ggml_reshape_3d(ctx0, k_proj, d_inner, n_seq_tokens, n_seqs); ggml_tensor * conv_k = ggml_cont(ctx0, ggml_concat(ctx0, conv_state_k, ggml_transpose(ctx0, k_3d), 0)); - + // Save K conv state ggml_tensor * last_conv_k = ggml_view_3d(ctx0, conv_k, d_conv - 1, d_inner, n_seqs, conv_k->nb[1], conv_k->nb[2], n_seq_tokens * conv_k->nb[0]); @@ -177,7 +174,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll ggml_cpy(ctx0, last_conv_k, ggml_view_1d(ctx0, conv_states_all, conv_state_size * n_seqs, (kv_head * n_embd_r_total + conv_state_size) * ggml_element_size(conv_states_all)))); - + ggml_tensor * k_conv_f32 = layer.ssm_k_conv; if (k_conv_f32->type != GGML_TYPE_F32) { k_conv_f32 = ggml_cast(ctx0, k_conv_f32, GGML_TYPE_F32); @@ -194,13 +191,13 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll } else { GGML_ABORT("KDA layer missing K conv weight"); } - + // V conv1d (with separate V conv state) ggml_tensor * Vcur; if (layer.ssm_v_conv) { ggml_tensor * v_3d = ggml_reshape_3d(ctx0, v_proj, d_inner, n_seq_tokens, n_seqs); ggml_tensor * conv_v = ggml_cont(ctx0, ggml_concat(ctx0, conv_state_v, ggml_transpose(ctx0, v_3d), 0)); - + // Save V conv state ggml_tensor * last_conv_v = ggml_view_3d(ctx0, conv_v, d_conv - 1, d_inner, n_seqs, conv_v->nb[1], conv_v->nb[2], n_seq_tokens * conv_v->nb[0]); @@ -208,7 +205,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll ggml_cpy(ctx0, last_conv_v, ggml_view_1d(ctx0, conv_states_all, conv_state_size * n_seqs, (kv_head * n_embd_r_total + 2 * conv_state_size) * ggml_element_size(conv_states_all)))); - + ggml_tensor * v_conv_f32 = layer.ssm_v_conv; if (v_conv_f32->type != GGML_TYPE_F32) { v_conv_f32 = ggml_cast(ctx0, v_conv_f32, GGML_TYPE_F32); @@ -225,7 +222,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll } else { GGML_ABORT("KDA layer missing V conv weight"); } - + // Step 3: Compute g1 (forget gate) // g1 = -exp(A_log) * softplus(f_b(f_a(x)) + dt_bias) ggml_tensor * f_a = ggml_mul_mat(ctx0, layer.ssm_f_a, cur); @@ -234,7 +231,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll g1 = ggml_add(ctx0, g1, layer.ssm_dt_b); g1 = ggml_softplus(ctx0, g1); g1 = ggml_reshape_3d(ctx0, g1, head_dim, 
n_head, n_tokens); - + // A_log shape is [1, n_head] or [1, n_head, 1, 1], need to broadcast to [head_dim, n_head, n_tokens] // First compute -exp(A_log), then reshape for broadcasting ggml_tensor * A_neg_exp = ggml_neg(ctx0, ggml_exp(ctx0, layer.ssm_a_log)); @@ -242,16 +239,16 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll A_neg_exp = ggml_reshape_3d(ctx0, A_neg_exp, 1, n_head, 1); g1 = ggml_mul(ctx0, g1, A_neg_exp); cb(g1, "kda_g1", il); - + // Step 4: Compute beta (mixing coefficient) ggml_tensor * beta = ggml_mul_mat(ctx0, layer.ssm_beta, cur); beta = ggml_cont_4d(ctx0, beta, n_head, 1, n_seq_tokens, n_seqs); cb(beta, "kda_beta", il); - + // Step 5: Reshape for KDA recurrence // {n_embd, n_tokens} -> {n_embd, n_seq_tokens, n_seqs} cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs); - + Qcur = ggml_cont(ctx0, ggml_reshape_4d(ctx0, Qcur, head_dim, n_head, n_seq_tokens, n_seqs)); Kcur = ggml_cont(ctx0, ggml_reshape_4d(ctx0, Kcur, head_dim, n_head, n_seq_tokens, n_seqs)); Vcur = ggml_cont(ctx0, ggml_reshape_4d(ctx0, Vcur, head_dim, n_head, n_seq_tokens, n_seqs)); @@ -274,7 +271,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll const int64_t output_flat_size = head_dim * n_head * n_seq_tokens * n_seqs; ggml_tensor * attn_out_1d = ggml_view_1d(ctx0, attn_out, output_flat_size, 0); cb(attn_out_1d, "attn_out_1d", il); - + ggml_tensor * attn_out_final = ggml_reshape_3d(ctx0, attn_out_1d, head_dim, n_head, n_seq_tokens * n_seqs); cb(attn_out_final, "attn_out_reshaped", il); // Extract the state part (second part of the concatenated tensor) @@ -299,7 +296,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll ggml_tensor * g2 = ggml_mul_mat(ctx0, layer.ssm_g_b, g_a); cb(g2, "g2 g_b(g_a(cur_2d))", il); g2 = ggml_reshape_3d(ctx0, g2, head_dim, n_head, n_seq_tokens * n_seqs); - + // Step 8: Apply o_norm with sigmoid gating // Note: Kimi model uses sigmoid gating, not SiLU (despite FusedRMSNormGated default being swish) // Formula: output = RMSNorm(x) * sigmoid(g) @@ -307,7 +304,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll cb(normed, "kda_normed", il); ggml_tensor * gate = ggml_sigmoid(ctx0, g2); ggml_tensor * gated = ggml_mul(ctx0, normed, gate); - + // Step 9: Output projection gated = ggml_cont_2d(ctx0, gated, d_inner, n_tokens); cur = ggml_mul_mat(ctx0, layer.wo, gated); @@ -316,7 +313,6 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll } else if (is_mla) { // === MLA Layer (Multi-head Latent Attention) without KV Cache === // Reference: vLLM mla.py - // Step 1: Q projection and reshape // vLLM Kimi: q = q_proj(hidden_states), then view as [n_tokens, n_head, qk_head_dim] // Note: Kimi MLA does NOT use RoPE (rotary_emb=None in vLLM) @@ -325,7 +321,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // Step 2: KV compression // kv_cmpr_pe = kv_a_proj_with_mqa(hidden_states) -> [kv_lora_rank + qk_rope_head_dim, n_tokens] ggml_tensor * kv_cmpr_pe = ggml_mul_mat(ctx0, layer.wkv_a_mqa, cur); - + // Split: kv_cmpr = kv_lora[:kv_lora_rank], k_pe = kv_lora[kv_lora_rank:] ggml_tensor * kv_cmpr = ggml_view_2d(ctx0, kv_cmpr_pe, kv_lora_rank, n_tokens, ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope), 0); @@ -333,10 +329,8 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + 
n_embd_head_qk_rope), ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope), ggml_row_size(kv_cmpr_pe->type, kv_lora_rank)); - // Note: Kimi MLA does NOT apply RoPE (rotary_emb=None in vLLM) // k_pe is used directly without RoPE - // Normalize kv_c kv_cmpr = build_norm(kv_cmpr, layer.attn_kv_a_norm, nullptr, LLM_NORM_RMS, il); @@ -346,7 +340,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll ggml_view_3d(ctx0, Qcur, n_embd_head_qk_nope, n_head, n_tokens, ggml_row_size(Qcur->type, n_embd_head_k_mla), ggml_row_size(Qcur->type, n_embd_head_k_mla) * n_head, 0); cb(q_nope, "q_nope", il); - + // and {n_embd_head_qk_rope, n_head, n_tokens} ggml_tensor * q_pe = ggml_view_3d( ctx0, Qcur, n_embd_head_qk_rope, n_head, n_tokens, ggml_row_size(Qcur->type, n_embd_head_k_mla), @@ -389,7 +383,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // KV decompression: kv = kv_b_proj(kv_c_normed) ggml_tensor * kv = ggml_mul_mat(ctx0, layer.wkv_b, kv_cmpr); const int64_t kv_per_head = n_embd_head_qk_nope + n_embd_head_v_mla; - + // Split kv into k_nope and v ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens, ggml_row_size(kv->type, kv_per_head), @@ -401,7 +395,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll k_nope = ggml_cont(ctx0, k_nope); Vcur = ggml_cont(ctx0, Vcur); cb(Vcur, "mla_V", il); - + // Concatenate k_nope + k_pe (broadcast k_pe to all heads) // K = [k_nope, k_pe] where k_nope is [qk_nope_head_dim, n_head, n_tokens] // and k_pe is [qk_rope_head_dim, 1, n_tokens] broadcast to all heads @@ -410,7 +404,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll ggml_tensor * k_pe_repeated = ggml_repeat(ctx0, k_pe, k_pe_target); ggml_tensor * Kcur = ggml_concat(ctx0, k_nope, k_pe_repeated, 0); cb(Kcur, "mla_K", il); - + // Direct softmax attention (with MHA KV cache) // Use build_attn with inp_attn for proper mask handling cur = build_attn(inp_attn, layer.wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale_mla, il); @@ -420,13 +414,13 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // Unknown layer type - this should not happen GGML_ABORT("Kimi layer is neither KDA nor MLA - missing required tensors"); } - + // On last layer, select only the output tokens if (il == n_layer - 1 && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } - + // Residual ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); @@ -459,7 +453,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll (llama_expert_gating_func_type) hparams.expert_gating_func, il); cb(moe_out, "ffn_moe_out", il); - + // Shared expert { ggml_tensor * ffn_shexp = build_ffn(cur, @@ -468,7 +462,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll layer.ffn_down_shexp, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(ffn_shexp, "ffn_shexp", il); - + cur = ggml_add(ctx0, moe_out, ffn_shexp); cb(cur, "ffn_out", il); } @@ -663,7 +657,7 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( Aqk = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_4d(ctx0, Aqk, chunk_size, chunk_size, n_chunks, HB))); cb(Akk, "Akk", il); cb(Aqk, "Aqk", il); - + Akk = ggml_mul(ctx0, Akk, beta); Akk = ggml_neg(ctx0, ggml_mul(ctx0, Akk, causal_mask)); cb(Akk, "attn_pre_solve", il); @@ -798,15 +792,15 @@ 
ggml_tensor * llm_build_kimi_linear::build_kda_autoregressive( ggml_tensor * v, ggml_tensor * gk, ggml_tensor * beta, - ggml_tensor * state, + ggml_tensor * state, int il) { GGML_ASSERT(ggml_is_contiguous(q)); GGML_ASSERT(ggml_is_contiguous(k)); - GGML_ASSERT(ggml_is_contiguous(v)); + GGML_ASSERT(ggml_is_contiguous(v)); GGML_ASSERT(ggml_is_contiguous(gk)); GGML_ASSERT(ggml_is_contiguous(beta)); GGML_ASSERT(ggml_is_contiguous(state)); - + const int64_t S_k = q->ne[0]; const int64_t H_k = q->ne[1]; const int64_t n_tokens = q->ne[2]; From 93afbedc96ae17a5a68eef5c1f7202f0e9c2c949 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Sun, 11 Jan 2026 21:44:54 +0800 Subject: [PATCH 34/58] moved const llama_model & model; around to follow qwen3next format and see if it can pass the -Wunused-private-field error --- src/models/kimi-linear.cpp | 3 ++- src/models/models.h | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index e873024c90..62f83e3ea5 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -3,7 +3,8 @@ #define CHUNK_SIZE 64 -llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params), model(model) { +llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const llm_graph_params & params) : + llm_graph_context_mamba(params), model(model) { ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/models.h b/src/models/models.h index 3ed00aae32..549329e15a 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -287,7 +287,6 @@ struct llm_build_jamba : public llm_graph_context_mamba { struct llm_build_kimi_linear : public llm_graph_context_mamba { llm_build_kimi_linear(const llama_model & model, const llm_graph_params & params); private: - const llama_model & model; ggml_tensor * build_kda_autoregressive( ggml_tensor * q, ggml_tensor * k, @@ -308,6 +307,8 @@ private: ggml_tensor * identity, ggml_tensor * diag_mask, int il); + + const llama_model & model; }; struct llm_build_lfm2 : public llm_graph_context { From 59182f5e06e4a8c394c7b62805679e66ee2de3fc Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Sun, 11 Jan 2026 22:06:48 +0800 Subject: [PATCH 35/58] fix trailing whitespace --- src/models/kimi-linear.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index 62f83e3ea5..b0330e23b3 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -410,7 +410,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // Use build_attn with inp_attn for proper mask handling cur = build_attn(inp_attn, layer.wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale_mla, il); cb(cur, "mla_out", il); - } + } } else { // Unknown layer type - this should not happen GGML_ABORT("Kimi layer is neither KDA nor MLA - missing required tensors"); @@ -628,7 +628,7 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( A[..., j] = torch.einsum('... c d, ... d -> ... 
c', q_i * (g_i - g_j).exp(), k_j) */ const int64_t CHB = n_chunks * H_k * n_seqs; - ggml_tensor * gkcs_i = ggml_reshape_4d(ctx0, gk_cumsum, chunk_size, 1, S_k, CHB); // [chunk_size, 1, S_k, CHB] + ggml_tensor * gkcs_i = ggml_reshape_4d(ctx0, gk_cumsum, chunk_size, 1, S_k, CHB); // [chunk_size, 1, S_k, CHB] ggml_tensor * gkcs_j = ggml_reshape_4d(ctx0, gkcs_i, 1, chunk_size, S_k, CHB); // [1, chunk_size, S_k, CHB] ggml_tensor * gkcs_j_bc = ggml_repeat_4d(ctx0, gkcs_j, chunk_size, chunk_size, S_k, CHB); // [1, chunk_size, S_k, CHB] -> [chunk_size, chunk_size, S_k, CHB] @@ -787,7 +787,7 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( return ggml_concat(ctx0, flat_output, flat_state, 0); } -ggml_tensor * llm_build_kimi_linear::build_kda_autoregressive( +ggml_tensor * llm_build_kimi_linear::build_kda_autoregressive( ggml_tensor * q, ggml_tensor * k, ggml_tensor * v, From 58d1ee52276bd51f9fe3ec0edcce1d5403910f9f Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Sun, 11 Jan 2026 22:19:29 +0800 Subject: [PATCH 36/58] removed trailing whitespaces in empty line + make sure indentation is multiple of 4 --- convert_hf_to_gguf.py | 61 +++++++++++++++++++++---------------------- 1 file changed, 30 insertions(+), 31 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 9272cc28a6..e99deeeb44 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5138,7 +5138,7 @@ class KimiLinearModel(TextModel): # Default to 4096 if not found logger.warning("No context length found in config, defaulting to 4096") self.gguf_writer.add_context_length(4096) - + # KDA & MLA params # Get ssm_d_conv from linear_attn_config.short_conv_kernel_size or ssm_d_conv linear_attn_config = self.hparams.get("linear_attn_config", {}) @@ -5156,23 +5156,23 @@ class KimiLinearModel(TextModel): ssm_d_conv = self.hparams.get("ssm_d_conv") or linear_attn_config.get("short_conv_kernel_size") if ssm_d_conv is not None: - self.gguf_writer.add_ssm_conv_kernel(ssm_d_conv) + self.gguf_writer.add_ssm_conv_kernel(ssm_d_conv) kda_head_dim = self.hparams.get("kda_head_dim") or linear_attn_config.get("head_dim") if kda_head_dim is not None: - self.gguf_writer.add_kda_head_dim(kda_head_dim) - + self.gguf_writer.add_kda_head_dim(kda_head_dim) + # MLA params - use add_* methods that handle arch substitution # Support both HuggingFace naming (q_lora_rank, kv_lora_rank) and internal naming (n_lora_q, n_lora_kv) q_lora_rank = self.hparams.get("q_lora_rank", self.hparams.get("n_lora_q")) kv_lora_rank = self.hparams.get("kv_lora_rank", self.hparams.get("n_lora_kv")) - + if q_lora_rank is not None: - self.gguf_writer.add_q_lora_rank(q_lora_rank) + self.gguf_writer.add_q_lora_rank(q_lora_rank) if kv_lora_rank is not None: - self.gguf_writer.add_kv_lora_rank(kv_lora_rank) - + self.gguf_writer.add_kv_lora_rank(kv_lora_rank) + # MLA head dimensions # Support HuggingFace naming: qk_nope_head_dim, qk_rope_head_dim, v_head_dim qk_nope_head_dim = self.hparams.get("qk_nope_head_dim") @@ -5182,28 +5182,27 @@ class KimiLinearModel(TextModel): self.gguf_writer.add_key_length(self.hparams["kv_lora_rank"] + self.hparams["qk_rope_head_dim"]) self.gguf_writer.add_value_length(self.hparams["kv_lora_rank"]) - # Calculate n_embd_head_k_mla = qk_nope_head_dim + qk_rope_head_dim if "n_embd_head_k_mla" in self.hparams: - self.gguf_writer.add_key_length_mla(self.hparams["n_embd_head_k_mla"]) + self.gguf_writer.add_key_length_mla(self.hparams["n_embd_head_k_mla"]) elif qk_nope_head_dim is not None and qk_rope_head_dim is not None: - 
n_embd_head_k_mla = qk_nope_head_dim + qk_rope_head_dim - self.gguf_writer.add_key_length_mla(n_embd_head_k_mla) - + n_embd_head_k_mla = qk_nope_head_dim + qk_rope_head_dim + self.gguf_writer.add_key_length_mla(n_embd_head_k_mla) + # n_embd_head_v_mla = v_head_dim if "n_embd_head_v_mla" in self.hparams: - self.gguf_writer.add_value_length_mla(self.hparams["n_embd_head_v_mla"]) + self.gguf_writer.add_value_length_mla(self.hparams["n_embd_head_v_mla"]) elif v_head_dim is not None: - self.gguf_writer.add_value_length_mla(v_head_dim) - + self.gguf_writer.add_value_length_mla(v_head_dim) + # Rotation - use qk_rope_head_dim for Kimi rope_dim = self.hparams.get("qk_rope_head_dim") or self.hparams.get("n_rot") if rope_dim is not None: - self.gguf_writer.add_rope_dimension_count(rope_dim) + self.gguf_writer.add_rope_dimension_count(rope_dim) else: - # Default to head_dim - head_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - self.gguf_writer.add_rope_dimension_count(head_dim) + # Default to head_dim + head_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + self.gguf_writer.add_rope_dimension_count(head_dim) # Copied from Qwen2Moe as this model inherits parts of it # YaRN is not enabled by default @@ -5227,17 +5226,17 @@ class KimiLinearModel(TextModel): moe_intermediate_size = self.hparams.get("moe_intermediate_size") if moe_intermediate_size is not None: self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) - + # num_shared_experts (1 for Kimi) num_shared_experts = self.hparams.get("num_shared_experts") if num_shared_experts is not None: self.gguf_writer.add_expert_shared_count(num_shared_experts) - + # first_k_dense_replace (1 for Kimi - first layer uses dense MLP) first_k_dense_replace = self.hparams.get("first_k_dense_replace") if first_k_dense_replace is not None: self.gguf_writer.add_leading_dense_block_count(first_k_dense_replace) - + # Routed scaling factor (expert_weights_scale = 2.446 for Kimi) routed_scaling_factor = self.hparams.get("routed_scaling_factor") if routed_scaling_factor is not None: @@ -5246,13 +5245,13 @@ class KimiLinearModel(TextModel): def prepare_tensors(self): super().prepare_tensors() if self._experts is not None: - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: logger.info(f"Processing {name}: shape before = {tuple(data_torch.shape)}") - + # Handle KDA conv1d weights # HuggingFace/vLLM stores as [d_inner, d_conv] (2D), memory layout: conv_step changes fastest # llama.cpp expects ggml ne = [d_conv, 1, d_inner, 1], memory layout: ne[0]=d_conv changes fastest @@ -5271,7 +5270,7 @@ class KimiLinearModel(TextModel): d_inner, _, d_conv = data_torch.shape data_torch = data_torch.reshape(1, d_inner, 1, d_conv) logger.info(f"Reshaped conv1d weight {name}: [d_inner={d_inner}, 1, d_conv={d_conv}] -> numpy {tuple(data_torch.shape)} -> ggml ne=[{d_conv}, 1, {d_inner}, 1]") - + # Handle A_log: HF stores as [1, 1, num_heads, 1] # llama.cpp expects ggml ne = [1, num_heads, 1, 1] # GGUF reverses numpy shape: numpy (1, 1, num_heads, 1) -> ggml ne = [1, num_heads, 1, 1] @@ -5279,11 +5278,11 @@ class KimiLinearModel(TextModel): if name.endswith(".A_log"): if data_torch.ndim == 4: logger.info(f"A_log 
{name}: numpy {tuple(data_torch.shape)} -> ggml ne={list(reversed(data_torch.shape))}") - + # Kimi specific bias if name.endswith("e_score_correction_bias"): - new_name = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_EXP_PROBS_B, bid) - return [(new_name, data_torch)] + new_name = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_EXP_PROBS_B, bid) + return [(new_name, data_torch)] # process the experts separately if name.find("block_sparse_moe.experts") != -1: From 4f6ef2c0858e545ffc81375a1d857c09ad19fa65 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Sun, 11 Jan 2026 22:33:58 +0800 Subject: [PATCH 37/58] try to make lint happy --- convert_hf_to_gguf.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index e99deeeb44..a6d2810359 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5159,7 +5159,6 @@ class KimiLinearModel(TextModel): self.gguf_writer.add_ssm_conv_kernel(ssm_d_conv) kda_head_dim = self.hparams.get("kda_head_dim") or linear_attn_config.get("head_dim") - if kda_head_dim is not None: self.gguf_writer.add_kda_head_dim(kda_head_dim) @@ -5328,11 +5327,7 @@ class KimiLinearModel(TextModel): kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1]) k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1) k_b = k_b.transpose(1, 2) - - return [ - (self.map_tensor_name(name_kb), k_b), - (self.map_tensor_name(name_vb), v_b) - ] + return [(self.map_tensor_name(name_kb), k_b), (self.map_tensor_name(name_vb), v_b)] mapped_name = self.map_tensor_name(name) logger.info(f"Returning {mapped_name}: shape after = {tuple(data_torch.shape)}") From 719d374bf666e43d70caf75f0a9e88dfa8a270d3 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Sun, 11 Jan 2026 22:58:44 +0800 Subject: [PATCH 38/58] remove blank lines to make lint happy --- convert_hf_to_gguf.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index a6d2810359..cc808aa0d0 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5088,7 +5088,6 @@ class KimiLinearModel(TextModel): merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) if len(merged) == 2: merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) - # Build token list vocab_size = self.hparams["vocab_size"] special_tokens = tokenizer.special_tokens @@ -5316,14 +5315,11 @@ class KimiLinearModel(TextModel): if name.endswith("kv_b_proj.weight"): name_kb = name.replace("kv_b_proj", "k_b_proj") name_vb = name.replace("kv_b_proj", "v_b_proj") - n_head_kv = self.hparams["num_key_value_heads"] v_head_dim = self.hparams["v_head_dim"] qk_nope_head_dim = self.hparams["qk_nope_head_dim"] logger.info("Split kv_b n_head_kv %d\n" % n_head_kv) - assert data_torch.shape[0] == n_head_kv * (v_head_dim + qk_nope_head_dim) - kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1]) k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1) k_b = k_b.transpose(1, 2) From ac85cb137563289ca38718c8a78d59891dc24b5f Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Mon, 12 Jan 2026 08:14:51 +0800 Subject: [PATCH 39/58] removed at least blank line containing white space --- convert_hf_to_gguf.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index cc808aa0d0..312256e756 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5062,7 +5062,7 @@ class CodeShellModel(TextModel): class 
KimiLinearModel(TextModel): """Kimi-Linear model with hybrid MLA+KDA architecture""" model_arch = gguf.MODEL_ARCH.KIMI_LINEAR - + _experts: list[dict[str, Tensor]] | None = None def set_vocab(self): @@ -5127,7 +5127,7 @@ class KimiLinearModel(TextModel): super().set_gguf_parameters() self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) - + # Use find_hparam for context length # Kimi uses model_max_length n_ctx = self.find_hparam(["max_position_embeddings", "model_max_length", "n_ctx", "n_positions"], optional=True) @@ -5156,7 +5156,6 @@ class KimiLinearModel(TextModel): ssm_d_conv = self.hparams.get("ssm_d_conv") or linear_attn_config.get("short_conv_kernel_size") if ssm_d_conv is not None: self.gguf_writer.add_ssm_conv_kernel(ssm_d_conv) - kda_head_dim = self.hparams.get("kda_head_dim") or linear_attn_config.get("head_dim") if kda_head_dim is not None: self.gguf_writer.add_kda_head_dim(kda_head_dim) @@ -5296,8 +5295,8 @@ class KimiLinearModel(TextModel): # merge the experts into a single 3d tensor tensors = [] # w1: gate, w2: down, w3: up - for wid, tname in [("w1", gguf.MODEL_TENSOR.FFN_GATE_EXP), - ("w2", gguf.MODEL_TENSOR.FFN_DOWN_EXP), + for wid, tname in [("w1", gguf.MODEL_TENSOR.FFN_GATE_EXP), + ("w2", gguf.MODEL_TENSOR.FFN_DOWN_EXP), ("w3", gguf.MODEL_TENSOR.FFN_UP_EXP)]: datas: list[Tensor] = [] for xid in range(n_experts): From 4faf26c376b7694f083523edd4f50923881102cf Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Mon, 12 Jan 2026 08:26:47 +0800 Subject: [PATCH 40/58] fixed flake8 complaints locally --- convert_hf_to_gguf.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 312256e756..1be740dba6 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5146,11 +5146,11 @@ class KimiLinearModel(TextModel): _num_kv_heads = list() _full_attn_layers = linear_attn_config["full_attn_layers"] for il in range(self.hparams["num_hidden_layers"]): - if il+1 in _full_attn_layers: + if il + 1 in _full_attn_layers: _num_kv_heads.append(self.hparams["num_key_value_heads"]) else: _num_kv_heads.append(0) - assert(len(_num_kv_heads) == self.hparams["num_hidden_layers"]) + assert len(_num_kv_heads) == self.hparams["num_hidden_layers"] self.gguf_writer.add_head_count_kv(_num_kv_heads) ssm_d_conv = self.hparams.get("ssm_d_conv") or linear_attn_config.get("short_conv_kernel_size") @@ -5328,6 +5328,7 @@ class KimiLinearModel(TextModel): logger.info(f"Returning {mapped_name}: shape after = {tuple(data_torch.shape)}") return [(mapped_name, data_torch)] + @ModelBase.register("InternLM2ForCausalLM") class InternLM2Model(TextModel): model_arch = gguf.MODEL_ARCH.INTERNLM2 From 22bc582a82f2419bf446c2984ee1b12d4681ce4b Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Mon, 12 Jan 2026 20:32:19 +0800 Subject: [PATCH 41/58] return ggml_tensor * pair in kda_autoregressive and kda_chunking as in ngxson's Qwen3Next improvement --- src/models/kimi-linear.cpp | 59 +++++++++++++++----------------------- src/models/models.h | 4 +-- 2 files changed, 25 insertions(+), 38 deletions(-) diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index b0330e23b3..4831b7bbc7 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -263,34 +263,21 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll ggml_tensor * state = build_rs(inp_rs, ssm_states_all, hparams.n_embd_s(), n_seqs); state = 
ggml_reshape_4d(ctx0, state, head_dim, head_dim, n_head, n_seqs); // Choose between build_kda_chunking and build_kda_recurrent based on n_tokens - ggml_tensor * attn_out = n_seq_tokens == 1 ? + std::pair attn_out = n_seq_tokens == 1 ? build_kda_autoregressive(Qcur, Kcur, Vcur, g1, beta, state, il) : build_kda_chunking(Qcur, Kcur, Vcur, g1, beta, state, chunked_causal_mask, chunked_identity, chunked_diag_mask, il); - cb(attn_out, "attn_out", il); - // The tensors were concatenated 1d, so we need to extract them 1d as well - const int64_t output_flat_size = head_dim * n_head * n_seq_tokens * n_seqs; - ggml_tensor * attn_out_1d = ggml_view_1d(ctx0, attn_out, output_flat_size, 0); - cb(attn_out_1d, "attn_out_1d", il); + ggml_tensor * output = attn_out.first; + ggml_tensor * new_state = attn_out.second; + cb(output, "attn_output", il); + cb(new_state, "new_state", il); - ggml_tensor * attn_out_final = ggml_reshape_3d(ctx0, attn_out_1d, head_dim, n_head, n_seq_tokens * n_seqs); - cb(attn_out_final, "attn_out_reshaped", il); - // Extract the state part (second part of the concatenated tensor) - // State starts after n_tokens elements along dimension 1 - const int64_t state_flat_size = head_dim * head_dim * n_head * n_seqs; - - ggml_tensor * state_1d = - ggml_view_1d(ctx0, attn_out, state_flat_size, output_flat_size * ggml_element_size(attn_out)); - cb(state_1d, "state_1d", il); - - // Update the recurrent states - ggml_build_forward_expand(gf, - ggml_cpy(ctx0, state_1d, + // Update the recurrent states + ggml_build_forward_expand(gf, + ggml_cpy(ctx0, new_state, ggml_view_1d(ctx0, ssm_states_all, hparams.n_embd_s() * n_seqs, kv_head * hparams.n_embd_s() * ggml_element_size(ssm_states_all)))); - GGML_ASSERT(ggml_nelements(attn_out_1d) + ggml_nelements(state_1d) == ggml_nelements(attn_out)); - // Step 7: Output gating g2 = g_b(g_a(x)) ggml_tensor * cur_2d = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs); ggml_tensor * g_a = ggml_mul_mat(ctx0, layer.ssm_g_a, cur_2d); @@ -301,6 +288,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // Step 8: Apply o_norm with sigmoid gating // Note: Kimi model uses sigmoid gating, not SiLU (despite FusedRMSNormGated default being swish) // Formula: output = RMSNorm(x) * sigmoid(g) + ggml_tensor * attn_out_final = ggml_reshape_3d(ctx0, output, head_dim, n_head, n_seq_tokens * n_seqs); ggml_tensor * normed = build_norm(attn_out_final, layer.ssm_o_norm, layer.ssm_o_norm_b, LLM_NORM_RMS, il); cb(normed, "kda_normed", il); ggml_tensor * gate = ggml_sigmoid(ctx0, g2); @@ -496,7 +484,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll This is a ggml implementation of the naive_chunk_kda function of https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/kda/naive.py */ -ggml_tensor * llm_build_kimi_linear::build_kda_chunking( +std::pair llm_build_kimi_linear::build_kda_chunking( ggml_tensor * q, ggml_tensor * k, ggml_tensor * v, @@ -774,20 +762,23 @@ ggml_tensor * llm_build_kimi_linear::build_kda_chunking( core_attn_out = ggml_cont_4d(ctx0, core_attn_out, S_v, chunk_size * n_chunks, H_v, n_seqs); - ggml_tensor * output_tokens = ggml_view_4d(ctx0, core_attn_out, S_v, n_tokens, H_v, n_seqs, core_attn_out->nb[1], core_attn_out->nb[2], core_attn_out->nb[3], 0); - cb(output_tokens, "output_tokens", il); + // truncate padded tokens + ggml_tensor * output_tokens = ggml_view_4d(ctx0, core_attn_out, + S_v, n_tokens, H_v, n_seqs, + ggml_row_size(core_attn_out->type, S_v), + 
ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks), + ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks * H_v), 0); + output_tokens = ggml_cont(ctx0, output_tokens); + // permute back to (S_v, H_v, n_tokens, n_seqs) + output_tokens = ggml_permute(ctx0, output_tokens, 0, 2, 1, 3); + output_tokens = ggml_cont(ctx0, output_tokens); - // flatten output - ggml_tensor * flat_output = - ggml_cont_1d(ctx0, ggml_permute(ctx0, output_tokens, 0, 2, 1, 3), S_v * H_v * n_tokens * n_seqs); - - ggml_tensor * flat_state = ggml_cont_1d(ctx0, new_state, S_v * S_v * H_v * n_seqs); cb(new_state, "output_state", il); - return ggml_concat(ctx0, flat_output, flat_state, 0); + return {output_tokens, new_state}; } -ggml_tensor * llm_build_kimi_linear::build_kda_autoregressive( +std::pair llm_build_kimi_linear::build_kda_autoregressive( ggml_tensor * q, ggml_tensor * k, ggml_tensor * v, @@ -876,10 +867,6 @@ ggml_tensor * llm_build_kimi_linear::build_kda_autoregressive( cb(core_attn_out, "output_tokens", il); cb(state, "new_state", il); - // flatten output, no need to permute since n_tokens is 1 so [S_v, 1, H_v, n_seqs] and [S_v, H_v, 1, n_seqs] are equivalent memory-layout wise - ggml_tensor * flat_output = ggml_reshape_1d(ctx0, core_attn_out, S_v * H_v * n_tokens * n_seqs); - ggml_tensor * flat_state = ggml_reshape_1d(ctx0, state, S_v * S_v * H_v * n_seqs); - - return ggml_concat(ctx0, flat_output, flat_state, 0); + return {core_attn_out, state}; } diff --git a/src/models/models.h b/src/models/models.h index 549329e15a..8e8f502e78 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -287,7 +287,7 @@ struct llm_build_jamba : public llm_graph_context_mamba { struct llm_build_kimi_linear : public llm_graph_context_mamba { llm_build_kimi_linear(const llama_model & model, const llm_graph_params & params); private: - ggml_tensor * build_kda_autoregressive( + std::pair build_kda_autoregressive( ggml_tensor * q, ggml_tensor * k, ggml_tensor * v, @@ -296,7 +296,7 @@ private: ggml_tensor * state, int il); - ggml_tensor * build_kda_chunking( + std::pair build_kda_chunking( ggml_tensor * q, ggml_tensor * k, ggml_tensor * v, From 6ba78d1220c0dd8bf8b37574cd0eb00650761188 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Tue, 13 Jan 2026 16:31:29 +0800 Subject: [PATCH 42/58] removed Kimi-Linear specific change that causes failure at server-windows --- src/llama-graph.cpp | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index b0a6ea323f..67f6712744 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1963,15 +1963,11 @@ ggml_tensor * llm_graph_context::build_rs( ggml_tensor * output_states = get_state_rows(ctx0, states, state_copy_main); ggml_build_forward_expand(gf, output_states); - // copy extra states which won't be changed further (between n_seqs and n_rs) - // Skip if there are no extra states to copy (n_rs == n_seqs) - if (arch != LLM_ARCH_KIMI_LINEAR || n_rs > (u_int32_t) n_seqs) { // arch check for backward compat - ggml_tensor * states_extra = ggml_get_rows(ctx0, states, state_copy_extra); - ggml_build_forward_expand(gf, - ggml_cpy(ctx0, - states_extra, - ggml_view_1d(ctx0, s, state_size*(n_rs - n_seqs), (rs_head + n_seqs)*state_size*ggml_element_size(s)))); - } + ggml_tensor * states_extra = ggml_get_rows(ctx0, states, state_copy_extra); + ggml_build_forward_expand(gf, + ggml_cpy(ctx0, + states_extra, + ggml_view_1d(ctx0, s, state_size*(n_rs - n_seqs), (rs_head + 
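A shape-level sketch of the pair-return change above (illustration only: kda_kernel_old/kda_kernel_new are hypothetical names and torch stands in for the ggml tensors). Before, build_kda_chunking/build_kda_autoregressive flattened the attention output and the updated state into one 1-D tensor and the caller had to slice the two parts back out with views; now the two tensors come back as a std::pair and the caller copies the new state straight into the recurrent cache.

    import torch

    def kda_kernel_old(out, state):
        # out: [S_v, n_tokens, H_v, n_seqs], state: [S_v, S_v, H_v, n_seqs]
        return torch.cat([out.flatten(), state.flatten()])  # caller must know both sizes to split

    def kda_kernel_new(out, state):
        return out, state  # caller copies `state` into the recurrent state buffer directly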
n_seqs)*state_size*ggml_element_size(s)))); return output_states; } From fe9d248ae6f845756769504330803bf8a685307f Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Tue, 13 Jan 2026 16:58:59 +0800 Subject: [PATCH 43/58] removed private: from kimi_linear to make build checks happy --- src/models/models.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/models/models.h b/src/models/models.h index beb2d71af3..aec6d3bf48 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -286,7 +286,7 @@ struct llm_build_jamba : public llm_graph_context_mamba { struct llm_build_kimi_linear : public llm_graph_context_mamba { llm_build_kimi_linear(const llama_model & model, const llm_graph_params & params); -private: + std::pair build_kda_autoregressive( ggml_tensor * q, ggml_tensor * k, From 18ae7f4684ccc0ab72eaf35845b84b009da5486d Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Wed, 14 Jan 2026 03:22:53 +0800 Subject: [PATCH 44/58] removed unnecessary ggml_cont before ggml_reshape --- src/models/kimi-linear.cpp | 33 ++++++++++----------------------- 1 file changed, 10 insertions(+), 23 deletions(-) diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index 4831b7bbc7..50cebb9631 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -134,12 +134,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll ggml_tensor * conv_weight = nullptr; if (layer.ssm_q_conv) { // Reshape conv weight from [d_conv, 1, d_inner, 1] to [d_conv, d_inner] for ggml_ssm_conv - // Cast to F32 if quantized (ggml_ssm_conv requires float weights) - ggml_tensor * q_conv_f32 = layer.ssm_q_conv; - if (q_conv_f32->type != GGML_TYPE_F32) { - q_conv_f32 = ggml_cast(ctx0, q_conv_f32, GGML_TYPE_F32); - } - conv_weight = ggml_reshape_2d(ctx0, q_conv_f32, d_conv, d_inner); + conv_weight = ggml_reshape_2d(ctx0, layer.ssm_q_conv, d_conv, d_inner); } // Apply conv1d @@ -166,7 +161,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll ggml_tensor * Kcur; if (layer.ssm_k_conv) { ggml_tensor * k_3d = ggml_reshape_3d(ctx0, k_proj, d_inner, n_seq_tokens, n_seqs); - ggml_tensor * conv_k = ggml_cont(ctx0, ggml_concat(ctx0, conv_state_k, ggml_transpose(ctx0, k_3d), 0)); + ggml_tensor * conv_k = ggml_concat(ctx0, conv_state_k, ggml_transpose(ctx0, k_3d), 0); // Save K conv state ggml_tensor * last_conv_k = ggml_view_3d(ctx0, conv_k, d_conv - 1, d_inner, n_seqs, @@ -176,11 +171,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll ggml_view_1d(ctx0, conv_states_all, conv_state_size * n_seqs, (kv_head * n_embd_r_total + conv_state_size) * ggml_element_size(conv_states_all)))); - ggml_tensor * k_conv_f32 = layer.ssm_k_conv; - if (k_conv_f32->type != GGML_TYPE_F32) { - k_conv_f32 = ggml_cast(ctx0, k_conv_f32, GGML_TYPE_F32); - } - ggml_tensor * k_conv_weight = ggml_reshape_2d(ctx0, k_conv_f32, d_conv, d_inner); + ggml_tensor * k_conv_weight = ggml_reshape_2d(ctx0, layer.ssm_k_conv, d_conv, d_inner); Kcur = ggml_ssm_conv(ctx0, conv_k, k_conv_weight); cb(Kcur, "K conv1d", il); Kcur = ggml_reshape_2d(ctx0, Kcur, d_inner, n_tokens); @@ -197,7 +188,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll ggml_tensor * Vcur; if (layer.ssm_v_conv) { ggml_tensor * v_3d = ggml_reshape_3d(ctx0, v_proj, d_inner, n_seq_tokens, n_seqs); - ggml_tensor * conv_v = ggml_cont(ctx0, ggml_concat(ctx0, conv_state_v, ggml_transpose(ctx0, v_3d), 0)); + ggml_tensor * conv_v = ggml_concat(ctx0, 
conv_state_v, ggml_transpose(ctx0, v_3d), 0); // Save V conv state ggml_tensor * last_conv_v = ggml_view_3d(ctx0, conv_v, d_conv - 1, d_inner, n_seqs, @@ -207,11 +198,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll ggml_view_1d(ctx0, conv_states_all, conv_state_size * n_seqs, (kv_head * n_embd_r_total + 2 * conv_state_size) * ggml_element_size(conv_states_all)))); - ggml_tensor * v_conv_f32 = layer.ssm_v_conv; - if (v_conv_f32->type != GGML_TYPE_F32) { - v_conv_f32 = ggml_cast(ctx0, v_conv_f32, GGML_TYPE_F32); - } - ggml_tensor * v_conv_weight = ggml_reshape_2d(ctx0, v_conv_f32, d_conv, d_inner); + ggml_tensor * v_conv_weight = ggml_reshape_2d(ctx0, layer.ssm_v_conv, d_conv, d_inner); Vcur = ggml_ssm_conv(ctx0, conv_v, v_conv_weight); cb(Vcur, "V conv1d", il); Vcur = ggml_reshape_2d(ctx0, Vcur, d_inner, n_tokens); @@ -243,17 +230,17 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // Step 4: Compute beta (mixing coefficient) ggml_tensor * beta = ggml_mul_mat(ctx0, layer.ssm_beta, cur); - beta = ggml_cont_4d(ctx0, beta, n_head, 1, n_seq_tokens, n_seqs); + beta = ggml_reshape_4d(ctx0, beta, n_head, 1, n_seq_tokens, n_seqs); cb(beta, "kda_beta", il); // Step 5: Reshape for KDA recurrence // {n_embd, n_tokens} -> {n_embd, n_seq_tokens, n_seqs} cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs); - Qcur = ggml_cont(ctx0, ggml_reshape_4d(ctx0, Qcur, head_dim, n_head, n_seq_tokens, n_seqs)); - Kcur = ggml_cont(ctx0, ggml_reshape_4d(ctx0, Kcur, head_dim, n_head, n_seq_tokens, n_seqs)); - Vcur = ggml_cont(ctx0, ggml_reshape_4d(ctx0, Vcur, head_dim, n_head, n_seq_tokens, n_seqs)); - g1 = ggml_cont(ctx0, ggml_reshape_4d(ctx0, g1, head_dim, n_head, n_seq_tokens, n_seqs)); + Qcur = ggml_reshape_4d(ctx0, Qcur, head_dim, n_head, n_seq_tokens, n_seqs); + Kcur = ggml_reshape_4d(ctx0, Kcur, head_dim, n_head, n_seq_tokens, n_seqs); + Vcur = ggml_reshape_4d(ctx0, Vcur, head_dim, n_head, n_seq_tokens, n_seqs); + g1 = ggml_reshape_4d(ctx0, g1, head_dim, n_head, n_seq_tokens, n_seqs); cb(Qcur, "kda_Q", il); cb(Kcur, "kda_K", il); cb(Vcur, "kda_V", il); From 28829152588d35ba8e187f983ecdb1d4cfd232f8 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Wed, 14 Jan 2026 17:26:00 +0800 Subject: [PATCH 45/58] created static function causal_conv1d to abtract similar code for q/k/v --- src/models/kimi-linear.cpp | 201 ++++++++++++------------------------- 1 file changed, 64 insertions(+), 137 deletions(-) diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index 50cebb9631..25eccd2f7d 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -3,6 +3,67 @@ #define CHUNK_SIZE 64 +// Causal Conv1d function for Q,K,V +// When qkv is 0, it is Q, 1 is K, 2 is V +static ggml_tensor * causal_conv1d(ggml_cgraph * gf, ggml_context * ctx0, ggml_tensor * conv_states_all, ggml_tensor * conv_state_all, int64_t qkv, ggml_tensor * x, ggml_tensor * proj_w, ggml_tensor * conv_w, ggml_tensor * conv_b, int64_t d_conv, int64_t head_dim, int64_t n_head, int64_t n_seq_tokens, int64_t n_seqs, int64_t n_tokens, int64_t kv_head) { + const int64_t d_inner = head_dim * n_head; + const int64_t conv_state_size = (d_conv - 1) * d_inner; + const int64_t n_embd_r_total = 3 * conv_state_size; // Q + K + V + + // conv_state_all is [n_embd_r_total, n_seqs], split into Q, K, V + // Each conv state is [(d_conv-1) * d_inner] per sequence, need to reshape to [d_conv-1, d_inner, n_seqs] + // Memory layout: for each seq, Q state is first 
conv_state_size elements, then K, then V + // conv_state_all has stride: nb[0] = element_size, nb[1] = n_embd_r_total * element_size + // View Q conv state: offset 0, size conv_state_size per seq + // conv_state_all is [n_embd_r_total, n_seqs] with memory layout: + // state[i + seq * n_embd_r_total] where i = conv_step + channel * (d_conv-1) + {0, conv_state_size, 2*conv_state_size} for Q/K/V + // We want [d_conv-1, d_inner, n_seqs] view: + // nb1 = (d_conv-1) * element_size (stride between channels) + // nb2 = n_embd_r_total * element_size (stride between seqs) + ggml_tensor * conv_state_x = ggml_view_3d(ctx0, conv_state_all, d_conv - 1, d_inner, n_seqs, + (d_conv - 1) * ggml_element_size(conv_state_all), // nb1: stride between channels + n_embd_r_total * ggml_element_size(conv_state_all), // nb2: stride between seqs + qkv * conv_state_size * ggml_element_size(conv_state_all)); + +// Causal Conv1d function for Q,K,V +// When qkv is 0, it is Q, 1 is K, 2 is V + // Step 1: Q, K, V projections -> [d_inner, n_tokens] + ggml_tensor * x_proj = ggml_mul_mat(ctx0, proj_w, x); + + // Reshape input: {d_inner, n_tokens} -> {d_inner, n_seq_tokens, n_seqs} + ggml_tensor * x_3d = ggml_reshape_3d(ctx0, x_proj, d_inner, n_seq_tokens, n_seqs); + + // Concat Q conv state and current input: {d_conv-1 + n_seq_tokens, d_inner, n_seqs} + ggml_tensor * conv_x = ggml_cont(ctx0, ggml_concat(ctx0, conv_state_x, ggml_transpose(ctx0, x_3d), 0)); + + // Save last (d_conv-1) columns back to Q conv state + ggml_tensor * last_conv_x = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner, n_seqs, + conv_x->nb[1], conv_x->nb[2], n_seq_tokens * conv_x->nb[0]); + ggml_build_forward_expand(gf, + ggml_cpy(ctx0, last_conv_x, + ggml_view_1d(ctx0, conv_states_all, conv_state_size * n_seqs, + (kv_head * n_embd_r_total + qkv * conv_state_size) * ggml_element_size(conv_states_all)))); + // Reshape conv weight: GGUF [d_conv, 1, d_inner, 1] -> ggml_ssm_conv expects [d_conv, d_inner] + // GGUF stores as [d_conv, 1, d_inner, 1] with memory layout w[conv_step + channel * d_conv] + // vLLM stores as [d_inner, d_conv] with memory layout w[channel * d_conv + conv_step] + // ggml_ssm_conv computes: c[conv_step + channel * d_conv] + // GGUF layout: [d_conv, 1, d_inner] or [d_conv, 1, d_inner, 1] -> reshape to [d_conv, d_inner] + // Reshape conv weight from [d_conv, 1, d_inner, 1] to [d_conv, d_inner] for ggml_ssm_conv + ggml_tensor * conv_weight = ggml_reshape_2d(ctx0, conv_w, d_conv, d_inner); + + // Apply conv1d + // ggml_ssm_conv output: {d_inner, n_seq_tokens, n_seqs} + ggml_tensor * Xcur = ggml_ssm_conv(ctx0, conv_x, conv_weight); + // Reshape to 2D for bias add: {d_inner, n_tokens} + Xcur = ggml_reshape_2d(ctx0, Xcur, d_inner, n_tokens); + if (conv_b) { + Xcur = ggml_add(ctx0, Xcur, conv_b); + } + Xcur = ggml_silu(ctx0, Xcur); + + return ggml_reshape_4d(ctx0, Xcur, head_dim, n_head, n_seq_tokens, n_seqs); +} + llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params), model(model) { ggml_tensor * cur; @@ -78,138 +139,10 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // Get conv states from r_l tensor (Q, K, V each have separate state) ggml_tensor * conv_states_all = mctx_cur->get_r_l(il); cb(conv_states_all, "conv_states_all", il); - const int64_t conv_state_size = (d_conv - 1) * d_inner; - const int64_t n_embd_r_total = 3 * conv_state_size; // Q + K + V ggml_tensor * conv_state_all = build_rs(inp_rs, conv_states_all, 
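For reference, a naive PyTorch sketch of what the causal_conv1d helper above computes for each of Q/K/V (assumed shapes; causal_conv1d_ref and its arguments are illustrative names, not llama.cpp API): the last d_conv-1 projected inputs per channel are kept as rolling state, prepended to the new tokens, convolved depthwise with a width-d_conv filter, passed through SiLU, and the tail of the padded sequence becomes the state for the next ubatch.

    import torch
    import torch.nn.functional as F

    def causal_conv1d_ref(x, conv_state, weight):
        # x: [d_inner, n_tokens] projected Q, K or V; conv_state: [d_inner, d_conv-1]; weight: [d_inner, d_conv]
        d_inner, d_conv = weight.shape
        xs = torch.cat([conv_state, x], dim=-1)              # prepend the rolling state
        new_state = xs[:, -(d_conv - 1):]                    # saved back for the next ubatch
        y = F.conv1d(xs.unsqueeze(0), weight.unsqueeze(1), groups=d_inner).squeeze(0)
        return F.silu(y), new_state                          # y: [d_inner, n_tokens]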
hparams.n_embd_r(), n_seqs); - // conv_state_all is [n_embd_r_total, n_seqs], split into Q, K, V - // Each conv state is [(d_conv-1) * d_inner] per sequence, need to reshape to [d_conv-1, d_inner, n_seqs] - // Memory layout: for each seq, Q state is first conv_state_size elements, then K, then V - // conv_state_all has stride: nb[0] = element_size, nb[1] = n_embd_r_total * element_size - // View Q conv state: offset 0, size conv_state_size per seq - // conv_state_all is [n_embd_r_total, n_seqs] with memory layout: - // state[i + seq * n_embd_r_total] where i = conv_step + channel * (d_conv-1) + {0, conv_state_size, 2*conv_state_size} for Q/K/V - // We want [d_conv-1, d_inner, n_seqs] view: - // nb1 = (d_conv-1) * element_size (stride between channels) - // nb2 = n_embd_r_total * element_size (stride between seqs) - ggml_tensor * conv_state_q = ggml_view_3d(ctx0, conv_state_all, d_conv - 1, d_inner, n_seqs, - (d_conv - 1) * ggml_element_size(conv_state_all), // nb1: stride between channels - n_embd_r_total * ggml_element_size(conv_state_all), // nb2: stride between seqs - 0); // offset for Q - ggml_tensor * conv_state_k = ggml_view_3d(ctx0, conv_state_all, d_conv - 1, d_inner, n_seqs, - (d_conv - 1) * ggml_element_size(conv_state_all), - n_embd_r_total * ggml_element_size(conv_state_all), - conv_state_size * ggml_element_size(conv_state_all)); // offset for K - ggml_tensor * conv_state_v = ggml_view_3d(ctx0, conv_state_all, d_conv - 1, d_inner, n_seqs, - (d_conv - 1) * ggml_element_size(conv_state_all), - n_embd_r_total * ggml_element_size(conv_state_all), - 2 * conv_state_size * ggml_element_size(conv_state_all)); // offset for V - - // Step 1: Q, K, V projections -> [d_inner, n_tokens] - ggml_tensor * q_proj = ggml_mul_mat(ctx0, layer.wq, cur); - ggml_tensor * k_proj = ggml_mul_mat(ctx0, layer.wk, cur); - ggml_tensor * v_proj = ggml_mul_mat(ctx0, layer.wv, cur); - cb(q_proj, "kda_q_proj", il); - cb(k_proj, "kda_k_proj", il); - cb(v_proj, "kda_v_proj", il); - - // Step 2: Causal Conv1d for Q - // Reshape input: {d_inner, n_tokens} -> {d_inner, n_seq_tokens, n_seqs} - ggml_tensor * q_3d = ggml_reshape_3d(ctx0, q_proj, d_inner, n_seq_tokens, n_seqs); - - // Concat Q conv state and current input: {d_conv-1 + n_seq_tokens, d_inner, n_seqs} - ggml_tensor * conv_q = ggml_concat(ctx0, conv_state_q, ggml_transpose(ctx0, q_3d), 0); - - // Save last (d_conv-1) columns back to Q conv state - ggml_tensor * last_conv_q = ggml_view_3d(ctx0, conv_q, d_conv - 1, d_inner, n_seqs, - conv_q->nb[1], conv_q->nb[2], n_seq_tokens * conv_q->nb[0]); - ggml_build_forward_expand(gf, - ggml_cpy(ctx0, last_conv_q, - ggml_view_1d(ctx0, conv_states_all, conv_state_size * n_seqs, - kv_head * n_embd_r_total * ggml_element_size(conv_states_all)))); - // Reshape conv weight: GGUF [d_conv, 1, d_inner, 1] -> ggml_ssm_conv expects [d_conv, d_inner] - // GGUF stores as [d_conv, 1, d_inner, 1] with memory layout w[conv_step + channel * d_conv] - // vLLM stores as [d_inner, d_conv] with memory layout w[channel * d_conv + conv_step] - // ggml_ssm_conv computes: c[conv_step + channel * d_conv] - // GGUF layout: [d_conv, 1, d_inner] or [d_conv, 1, d_inner, 1] -> reshape to [d_conv, d_inner] - ggml_tensor * conv_weight = nullptr; - if (layer.ssm_q_conv) { - // Reshape conv weight from [d_conv, 1, d_inner, 1] to [d_conv, d_inner] for ggml_ssm_conv - conv_weight = ggml_reshape_2d(ctx0, layer.ssm_q_conv, d_conv, d_inner); - } - - // Apply conv1d - ggml_tensor * Qcur; - if (conv_weight) { - // Make conv_q contiguous for ggml_ssm_conv - 
conv_q = ggml_cont(ctx0, conv_q); - - // ggml_ssm_conv output: {d_inner, n_seq_tokens, n_seqs} - Qcur = ggml_ssm_conv(ctx0, conv_q, conv_weight); - cb(Qcur, "Q conv1d", il); - // Reshape to 2D for bias add: {d_inner, n_tokens} - Qcur = ggml_reshape_2d(ctx0, Qcur, d_inner, n_tokens); - if (layer.ssm_q_conv_b) { - Qcur = ggml_add(ctx0, Qcur, layer.ssm_q_conv_b); - } - Qcur = ggml_silu(ctx0, Qcur); - cb(Qcur, "Q conv1d b", il); - } else { - GGML_ABORT("KDA layer missing Q conv weight"); - } - - // K conv1d (with separate K conv state) - ggml_tensor * Kcur; - if (layer.ssm_k_conv) { - ggml_tensor * k_3d = ggml_reshape_3d(ctx0, k_proj, d_inner, n_seq_tokens, n_seqs); - ggml_tensor * conv_k = ggml_concat(ctx0, conv_state_k, ggml_transpose(ctx0, k_3d), 0); - - // Save K conv state - ggml_tensor * last_conv_k = ggml_view_3d(ctx0, conv_k, d_conv - 1, d_inner, n_seqs, - conv_k->nb[1], conv_k->nb[2], n_seq_tokens * conv_k->nb[0]); - ggml_build_forward_expand(gf, - ggml_cpy(ctx0, last_conv_k, - ggml_view_1d(ctx0, conv_states_all, conv_state_size * n_seqs, - (kv_head * n_embd_r_total + conv_state_size) * ggml_element_size(conv_states_all)))); - - ggml_tensor * k_conv_weight = ggml_reshape_2d(ctx0, layer.ssm_k_conv, d_conv, d_inner); - Kcur = ggml_ssm_conv(ctx0, conv_k, k_conv_weight); - cb(Kcur, "K conv1d", il); - Kcur = ggml_reshape_2d(ctx0, Kcur, d_inner, n_tokens); - if (layer.ssm_k_conv_b) { - Kcur = ggml_add(ctx0, Kcur, layer.ssm_k_conv_b); - } - Kcur = ggml_silu(ctx0, Kcur); - cb(Kcur, "K conv1d b", il); - } else { - GGML_ABORT("KDA layer missing K conv weight"); - } - - // V conv1d (with separate V conv state) - ggml_tensor * Vcur; - if (layer.ssm_v_conv) { - ggml_tensor * v_3d = ggml_reshape_3d(ctx0, v_proj, d_inner, n_seq_tokens, n_seqs); - ggml_tensor * conv_v = ggml_concat(ctx0, conv_state_v, ggml_transpose(ctx0, v_3d), 0); - - // Save V conv state - ggml_tensor * last_conv_v = ggml_view_3d(ctx0, conv_v, d_conv - 1, d_inner, n_seqs, - conv_v->nb[1], conv_v->nb[2], n_seq_tokens * conv_v->nb[0]); - ggml_build_forward_expand(gf, - ggml_cpy(ctx0, last_conv_v, - ggml_view_1d(ctx0, conv_states_all, conv_state_size * n_seqs, - (kv_head * n_embd_r_total + 2 * conv_state_size) * ggml_element_size(conv_states_all)))); - - ggml_tensor * v_conv_weight = ggml_reshape_2d(ctx0, layer.ssm_v_conv, d_conv, d_inner); - Vcur = ggml_ssm_conv(ctx0, conv_v, v_conv_weight); - cb(Vcur, "V conv1d", il); - Vcur = ggml_reshape_2d(ctx0, Vcur, d_inner, n_tokens); - if (layer.ssm_v_conv_b) { - Vcur = ggml_add(ctx0, Vcur, layer.ssm_v_conv_b); - } - Vcur = ggml_silu(ctx0, Vcur); - cb(Vcur, "V conv1d b", il); - } else { - GGML_ABORT("KDA layer missing V conv weight"); - } + ggml_tensor * Qcur = causal_conv1d(gf, ctx0, conv_states_all, conv_state_all, 0, cur, layer.wq, layer.ssm_q_conv, layer.ssm_q_conv_b, d_conv, head_dim, n_head, n_seq_tokens, n_seqs, n_tokens, kv_head); + ggml_tensor * Kcur = causal_conv1d(gf, ctx0, conv_states_all, conv_state_all, 1, cur, layer.wk, layer.ssm_k_conv, layer.ssm_k_conv_b, d_conv, head_dim, n_head, n_seq_tokens, n_seqs, n_tokens, kv_head); + ggml_tensor * Vcur = causal_conv1d(gf, ctx0, conv_states_all, conv_state_all, 2, cur, layer.wv, layer.ssm_v_conv, layer.ssm_v_conv_b, d_conv, head_dim, n_head, n_seq_tokens, n_seqs, n_tokens, kv_head); // Step 3: Compute g1 (forget gate) // g1 = -exp(A_log) * softplus(f_b(f_a(x)) + dt_bias) @@ -237,13 +170,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // {n_embd, n_tokens} -> {n_embd, n_seq_tokens, n_seqs} cur = 
ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs); - Qcur = ggml_reshape_4d(ctx0, Qcur, head_dim, n_head, n_seq_tokens, n_seqs); - Kcur = ggml_reshape_4d(ctx0, Kcur, head_dim, n_head, n_seq_tokens, n_seqs); - Vcur = ggml_reshape_4d(ctx0, Vcur, head_dim, n_head, n_seq_tokens, n_seqs); g1 = ggml_reshape_4d(ctx0, g1, head_dim, n_head, n_seq_tokens, n_seqs); - cb(Qcur, "kda_Q", il); - cb(Kcur, "kda_K", il); - cb(Vcur, "kda_V", il); // Step 6: Get SSM state and compute KDA recurrence using ggml_kda_scan ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il); From 0aea18e718d38ff9ec27e144e4fc7bee8b875a41 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Fri, 16 Jan 2026 12:02:27 +0800 Subject: [PATCH 46/58] merged dt_bias to SSM_DT. Do -exp(log_A) in convert_hf_to_gguf.py. --- convert_hf_to_gguf.py | 59 +++++++++++++--------------------- gguf-py/gguf/constants.py | 13 +++----- gguf-py/gguf/tensor_mapping.py | 10 ++---- src/llama-arch.cpp | 8 ++--- src/llama-arch.h | 2 -- src/llama-model.cpp | 18 ++++------- src/llama-model.h | 1 - src/models/kimi-linear.cpp | 10 +++--- 8 files changed, 43 insertions(+), 78 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index bd017dfec4..0e1b6aae99 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5159,17 +5159,14 @@ class KimiLinearModel(TextModel): super().set_gguf_parameters() self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) - self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) - # Use find_hparam for context length - # Kimi uses model_max_length - n_ctx = self.find_hparam(["max_position_embeddings", "model_max_length", "n_ctx", "n_positions"], optional=True) - if n_ctx is not None: - self.gguf_writer.add_context_length(n_ctx) - else: - # Default to 4096 if not found - logger.warning("No context length found in config, defaulting to 4096") - self.gguf_writer.add_context_length(4096) + if (score_func := self.find_hparam(["moe_router_activation_func"], optional=True)) is not None: + if score_func == "sigmoid": + self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) + elif score_func == "softmax": + self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX) + else: + raise ValueError(f"Unsupported expert score gating function value: {score_func}") # KDA & MLA params # Get ssm_d_conv from linear_attn_config.short_conv_kernel_size or ssm_d_conv @@ -5226,7 +5223,7 @@ class KimiLinearModel(TextModel): self.gguf_writer.add_value_length_mla(v_head_dim) # Rotation - use qk_rope_head_dim for Kimi - rope_dim = self.hparams.get("qk_rope_head_dim") or self.hparams.get("n_rot") + rope_dim = self.find_hparam(["qk_rope_head_dim", "n_rot"]) if rope_dim is not None: self.gguf_writer.add_rope_dimension_count(rope_dim) else: @@ -5234,41 +5231,30 @@ class KimiLinearModel(TextModel): head_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(head_dim) - # Copied from Qwen2Moe as this model inherits parts of it - # YaRN is not enabled by default - # To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts - rope_scaling = self.hparams.get("rope_scaling") or {} - if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) - 
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) - - # MoE params - n_experts = self.hparams.get("num_local_experts", self.hparams.get("num_experts")) + n_experts = self.find_hparam(["num_experts"]) if n_experts is not None: self.gguf_writer.add_expert_count(n_experts) - # Support both num_experts_per_tok and num_experts_per_token - n_experts_used = self.hparams.get("num_experts_per_tok", self.hparams.get("num_experts_per_token")) + n_experts_used = self.find_hparam(["num_experts_per_token"]) if n_experts_used is not None: self.gguf_writer.add_expert_used_count(n_experts_used) # moe_intermediate_size (1024 for Kimi) - moe_intermediate_size = self.hparams.get("moe_intermediate_size") + moe_intermediate_size = self.find_hparam(["moe_intermediate_size"]) if moe_intermediate_size is not None: self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) # num_shared_experts (1 for Kimi) - num_shared_experts = self.hparams.get("num_shared_experts") + num_shared_experts = self.find_hparam(["num_shared_experts"]) if num_shared_experts is not None: self.gguf_writer.add_expert_shared_count(num_shared_experts) # first_k_dense_replace (1 for Kimi - first layer uses dense MLP) - first_k_dense_replace = self.hparams.get("first_k_dense_replace") + first_k_dense_replace = self.find_hparam(["first_k_dense_replace"]) if first_k_dense_replace is not None: self.gguf_writer.add_leading_dense_block_count(first_k_dense_replace) # Routed scaling factor (expert_weights_scale = 2.446 for Kimi) - routed_scaling_factor = self.hparams.get("routed_scaling_factor") + routed_scaling_factor = self.find_hparam(["routed_scaling_factor"]) if routed_scaling_factor is not None: self.gguf_writer.add_expert_weights_scale(routed_scaling_factor) @@ -5301,19 +5287,20 @@ class KimiLinearModel(TextModel): data_torch = data_torch.reshape(1, d_inner, 1, d_conv) logger.info(f"Reshaped conv1d weight {name}: [d_inner={d_inner}, 1, d_conv={d_conv}] -> numpy {tuple(data_torch.shape)} -> ggml ne=[{d_conv}, 1, {d_inner}, 1]") - # Handle A_log: HF stores as [1, 1, num_heads, 1] - # llama.cpp expects ggml ne = [1, num_heads, 1, 1] - # GGUF reverses numpy shape: numpy (1, 1, num_heads, 1) -> ggml ne = [1, num_heads, 1, 1] - # So no transformation needed! The shapes already match after GGUF reversal. 
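The A_log/dt_bias handling below reduces to the following at runtime (minimal sketch; kda_forget_gate, w_fa, w_fb and the shapes are assumptions based on the comments in kimi-linear.cpp): the converter stores A = -exp(A_log) once, dt_bias is loaded as the bias of SSM_DT, and the graph only evaluates g1 = A * softplus(f_b(f_a(x)) + dt_bias) with a per-head broadcast, with no exp/neg ops left in the graph.

    import torch
    import torch.nn.functional as F

    def kda_forget_gate(x, w_fa, w_fb, dt_bias, A, n_head, head_dim):
        # x: [n_embd]; w_fa: [r, n_embd]; w_fb: [n_head*head_dim, r]
        # dt_bias: [n_head*head_dim]; A: [n_head], already holding -exp(A_log) from conversion
        g = F.softplus(w_fb @ (w_fa @ x) + dt_bias).view(n_head, head_dim)
        return g * A.view(n_head, 1)                         # per-head decay, broadcast over head_dim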
- if name.endswith(".A_log"): - if data_torch.ndim == 4: - logger.info(f"A_log {name}: numpy {tuple(data_torch.shape)} -> ggml ne={list(reversed(data_torch.shape))}") - # Kimi specific bias if name.endswith("e_score_correction_bias"): new_name = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_EXP_PROBS_B, bid) return [(new_name, data_torch)] + # Handle A_log: iHF stores as [1, 1, num_heads, 1] + # llama.cpp expects ggml ne = [1, num_heads, 1, 1] + # GGUF reverses numpy shape: numpy (1, 1, num_heads, 1) -> ggml ne = [1, num_heads, 1, 1] + if name.endswith(".A_log"): + data_torch = -torch.exp(data_torch) + if name.endswith(".dt_bias"): + name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias" + logger.info("Changed dt_bias to dt_proj.bias") + # process the experts separately if name.find("block_sparse_moe.experts") != -1: n_experts = self.hparams.get("num_local_experts", self.hparams.get("num_experts")) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 43ea4eec0c..73e7bae6e1 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -462,7 +462,7 @@ class MODEL_ARCH(IntEnum): MIMO2 = auto() LLAMA_EMBED = auto() MAINCODER = auto() - KIMI_LINEAR = auto() # Kimi-Linear (hybrid MLA+KDA) + KIMI_LINEAR = auto() class VISION_PROJECTOR_TYPE(IntEnum): @@ -559,10 +559,9 @@ class MODEL_TENSOR(IntEnum): SSM_F_A = auto() # Kimi Linear SSM_F_B = auto() # Kimi Linear SSM_BETA = auto() # Kimi Linear - SSM_A_LOG = auto() # Kimi Linear + SSM_DT_B = auto() # Kimi Linear SSM_G_A = auto() # Kimi Linear SSM_G_B = auto() # Kimi Linear - SSM_DT_B = auto() # Kimi Linear TIME_MIX_W0 = auto() TIME_MIX_W1 = auto() TIME_MIX_W2 = auto() @@ -894,7 +893,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.MIMO2: "mimo2", MODEL_ARCH.LLAMA_EMBED: "llama-embed", MODEL_ARCH.MAINCODER: "maincoder", - MODEL_ARCH.KIMI_LINEAR: "kimi-linear", + MODEL_ARCH.KIMI_LINEAR: "kimi-linear", } VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = { @@ -988,10 +987,8 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = { MODEL_TENSOR.SSM_F_A: "blk.{bid}.ssm_f_a", # Kimi Linear MODEL_TENSOR.SSM_F_B: "blk.{bid}.ssm_f_b", # Kimi Linear MODEL_TENSOR.SSM_BETA: "blk.{bid}.ssm_beta", # Kimi Linear - MODEL_TENSOR.SSM_A_LOG: "blk.{bid}.ssm_a", # Kimi Linear MODEL_TENSOR.SSM_G_A: "blk.{bid}.ssm_g_a", # Kimi Linear MODEL_TENSOR.SSM_G_B: "blk.{bid}.ssm_g_b", # Kimi Linear - MODEL_TENSOR.SSM_DT_B: "blk.{bid}.ssm_dt", # Kimi Linear MODEL_TENSOR.TIME_MIX_W0: "blk.{bid}.time_mix_w0", MODEL_TENSOR.TIME_MIX_W1: "blk.{bid}.time_mix_w1", MODEL_TENSOR.TIME_MIX_W2: "blk.{bid}.time_mix_w2", @@ -3433,11 +3430,11 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.SSM_F_A, MODEL_TENSOR.SSM_F_B, MODEL_TENSOR.SSM_BETA, - MODEL_TENSOR.SSM_A_LOG, + MODEL_TENSOR.SSM_A, MODEL_TENSOR.SSM_G_A, MODEL_TENSOR.SSM_G_B, + MODEL_TENSOR.SSM_DT, MODEL_TENSOR.SSM_NORM, - MODEL_TENSOR.SSM_DT_B, MODEL_TENSOR.FFN_EXP_PROBS_B, MODEL_TENSOR.FFN_GATE_SHEXP, MODEL_TENSOR.FFN_DOWN_SHEXP, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 99da6891f8..d96119ebe9 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -438,7 +438,6 @@ class TensorNameMap: "model.layers.{bid}.block_sparse_moe.e_score_correction", # minimax-m2 "backbone.layers.{bid}.mixer.gate.e_score_correction", # nemotron-h-moe "model.layers.{bid}.mlp.e_score_correction", # exaone-moe - "model.layers.{bid}.block_sparse_moe.gate.e_score_correction_bias", # kimi ), # Feed-forward up @@ -556,7 +555,6 @@ 
class TensorNameMap: MODEL_TENSOR.FFN_GATE_CHEXP: ( "model.layers.{bid}.mlp.chunk_experts.gate_proj", # grovemoe - "model.layers.{bid}.block_sparse_moe.shared_experts.gate_proj", # kimi ), # Feed-forward down @@ -764,6 +762,7 @@ class TensorNameMap: "model.layers.layers.{bid}.mixer.dt_proj", # plamo2 "model.layers.{bid}.linear_attn.dt_proj", # qwen3next "backbone.layers.{bid}.mixer.dt", # nemotron-h-moe + "model.layers.{bid}.self_attn.dt_proj", # kimi ), MODEL_TENSOR.SSM_DT_NORM: ( @@ -777,6 +776,7 @@ class TensorNameMap: "model.layers.{bid}.mamba.A_log", # jamba falcon-h1 granite-hybrid "model.layers.layers.{bid}.mixer.A_log", # plamo2 "model.layers.{bid}.linear_attn.A_log", # qwen3next + "model.layers.{bid}.self_attn.A_log", # kimi ), MODEL_TENSOR.SSM_B_NORM: ( @@ -836,18 +836,12 @@ class TensorNameMap: MODEL_TENSOR.SSM_BETA: ( "model.layers.{bid}.self_attn.b_proj", ), - MODEL_TENSOR.SSM_A_LOG: ( - "model.layers.{bid}.self_attn.A_log", - ), MODEL_TENSOR.SSM_G_A: ( "model.layers.{bid}.self_attn.g_a_proj", ), MODEL_TENSOR.SSM_G_B: ( "model.layers.{bid}.self_attn.g_b_proj", ), - MODEL_TENSOR.SSM_DT_B: ( - "model.layers.{bid}.self_attn.dt_bias", - ), MODEL_TENSOR.TIME_MIX_W0: ( "model.layers.{bid}.attention.w0", # rwkv7 ), diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 654276542d..a8bf1c9b80 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -380,8 +380,6 @@ static const std::map LLM_TENSOR_NAMES = { { LLM_TENSOR_SSM_F_A, "blk.%d.ssm_f_a" }, { LLM_TENSOR_SSM_F_B, "blk.%d.ssm_f_b" }, { LLM_TENSOR_SSM_BETA, "blk.%d.ssm_beta" }, - { LLM_TENSOR_SSM_A_LOG, "blk.%d.ssm_a" }, - { LLM_TENSOR_SSM_DT_B, "blk.%d.ssm_dt" }, { LLM_TENSOR_SSM_G_A, "blk.%d.ssm_g_a" }, { LLM_TENSOR_SSM_G_B, "blk.%d.ssm_g_b" }, { LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" }, @@ -2336,10 +2334,10 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_SSM_F_A, LLM_TENSOR_SSM_F_B, LLM_TENSOR_SSM_BETA, - LLM_TENSOR_SSM_A_LOG, - LLM_TENSOR_SSM_DT_B, + LLM_TENSOR_SSM_A, LLM_TENSOR_SSM_G_A, LLM_TENSOR_SSM_G_B, + LLM_TENSOR_SSM_DT, LLM_TENSOR_SSM_NORM, // MLA LLM_TENSOR_ATTN_Q_A, @@ -2461,8 +2459,6 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_SSM_F_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_SSM_F_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_SSM_BETA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_SSM_A_LOG, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, - {LLM_TENSOR_SSM_DT_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, {LLM_TENSOR_SSM_G_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_SSM_G_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_TIME_MIX_LERP_X, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, diff --git a/src/llama-arch.h b/src/llama-arch.h index e5816acee1..f092f72834 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -408,8 +408,6 @@ enum llm_tensor { LLM_TENSOR_SSM_F_A, // kimi: forget gate projection A LLM_TENSOR_SSM_F_B, // kimi: forget gate projection B LLM_TENSOR_SSM_BETA, // kimi: beta mixing coefficient - LLM_TENSOR_SSM_A_LOG, // kimi: A_log (pre-converted in GGUF) - LLM_TENSOR_SSM_DT_B, // kimi: dt bias LLM_TENSOR_SSM_G_A, // kimi: output gate projection A LLM_TENSOR_SSM_G_B, // kimi: output gate projection B LLM_TENSOR_TIME_MIX_W0, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 9b796b3675..53f9f389e4 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2468,7 +2468,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, 
hparams.n_expert_shared, false); ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false); ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false); - ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false); + ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func); switch (hparams.n_layer) { case 27: type = LLM_TYPE_48B_A3B; break; // Kimi-Linear-48B-A3B @@ -6839,14 +6839,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // b_proj (beta mixing coefficient) layer.ssm_beta = create_tensor(tn(LLM_TENSOR_SSM_BETA, "weight", i), {n_embd, n_head}, 0); - // A_log - Shape in GGUF: [1, num_heads, 1, 1] (4D) or [1, num_heads] (2D after quantization) - layer.ssm_a_log = create_tensor(tn(LLM_TENSOR_SSM_A_LOG, i), {1, n_head, 1, 1}, TENSOR_NOT_REQUIRED); - if (!layer.ssm_a_log) { - layer.ssm_a_log = create_tensor(tn(LLM_TENSOR_SSM_A_LOG, i), {1, n_head}, 0); + // A_log - Shape in GGUF: [1, num_heads, 1, 1] (4D) or [1, num_heads] (2D after quantization) Note: -exp(A_log) is applied in convert_hf_to_gguf.py + layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head, 1, 1}, TENSOR_NOT_REQUIRED); + if (!layer.ssm_a) { + layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head}, 0); } // dt_bias - shape [n_embd_head_k_kda * n_head] = [4096] - layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT_B, i), {n_embd_head_k_kda * n_head}, 0); + layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_embd_head_k_kda * n_head}, 0); // g_a_proj, g_b_proj (output gate) layer.ssm_g_a = create_tensor(tn(LLM_TENSOR_SSM_G_A, "weight", i), {n_embd, n_embd_head_k_kda}, 0); @@ -6918,11 +6918,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp_actual}, TENSOR_NOT_REQUIRED); // exp_probs_b (e_score_correction_bias in vLLM) - // Try "bias" first (standard), then "weight" (for compatibility) - layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED); - if (!layer.ffn_exp_probs_b) { - layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "weight", i), {n_expert}, TENSOR_NOT_REQUIRED); - } + layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "weight", i), {n_expert}, 0); } } } break; diff --git a/src/llama-model.h b/src/llama-model.h index 40078dbdbd..a4900b093e 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -422,7 +422,6 @@ struct llama_layer { struct ggml_tensor * ssm_f_a = nullptr; struct ggml_tensor * ssm_f_b = nullptr; struct ggml_tensor * ssm_beta = nullptr; - struct ggml_tensor * ssm_a_log = nullptr; struct ggml_tensor * ssm_g_a = nullptr; struct ggml_tensor * ssm_g_b = nullptr; struct ggml_tensor * ssm_o_norm = nullptr; diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index 6db782641d..6013cd0b77 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -127,7 +127,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // Check layer type by checking which tensors exist // KDA layers have ssm_a_log tensor, MLA layers have wkv_a_mqa tensor - bool is_kda = (layer.ssm_a_log != nullptr); + bool is_kda = (layer.ssm_a != nullptr); bool is_mla = (layer.wkv_a_mqa != nullptr); if (is_kda) { @@ -152,12 +152,10 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll g1 = ggml_softplus(ctx0, g1); g1 = ggml_reshape_3d(ctx0, g1, 
head_dim, n_head, n_tokens); - // A_log shape is [1, n_head] or [1, n_head, 1, 1], need to broadcast to [head_dim, n_head, n_tokens] - // First compute -exp(A_log), then reshape for broadcasting - ggml_tensor * A_neg_exp = ggml_neg(ctx0, ggml_exp(ctx0, layer.ssm_a_log)); + // A_log shape is [1, n_head] or [1, n_head, 1, 1], need to broadcast to [head_dim, n_head, n_tokens]. No need to -exp(a_log) because it was done in convert_hf_to_gguf.py // Reshape to [1, n_head, 1] for broadcasting with g1 [head_dim, n_head, n_tokens] - A_neg_exp = ggml_reshape_3d(ctx0, A_neg_exp, 1, n_head, 1); - g1 = ggml_mul(ctx0, g1, A_neg_exp); + ggml_tensor * A = ggml_reshape_3d(ctx0, layer.ssm_a, 1, n_head, 1); + g1 = ggml_mul(ctx0, g1, A); cb(g1, "kda_g1", il); // Compute beta (mixing coefficient) From f3d118d061e0630b3126df5b1a855d29813177df Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Sat, 17 Jan 2026 07:43:30 +0800 Subject: [PATCH 47/58] reverted to original --- ggml/src/ggml-cpu/ops.cpp | 111 ++++++++++++++++++++++---------------- 1 file changed, 64 insertions(+), 47 deletions(-) diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 42db45ee14..387e2fe42c 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -7,10 +7,9 @@ #include "unary-ops.h" #include "vec.h" -#include #include +#include #include -#include // ggml_compute_forward_dup @@ -7110,12 +7109,13 @@ void ggml_compute_forward_conv_2d_dw( } } -// ggml_compute_forward_pool_1d_sk_p0 - -static void ggml_compute_forward_pool_1d_sk_p0( +// ggml_compute_forward_pool_1d_ksp +static void ggml_compute_forward_pool_1d_ksp( const ggml_compute_params * params, const ggml_op_pool op, const int k, + const int s, + const int p, ggml_tensor * dst) { const ggml_tensor * src = dst->src[0]; @@ -7126,39 +7126,56 @@ static void ggml_compute_forward_pool_1d_sk_p0( return; } - const char * cdata = (const char *)src->data; - const char * const data_end = cdata + ggml_nbytes(src); - float * drow = (float *)dst->data; + const int64_t IW = src->ne[0]; + const int64_t OW = dst->ne[0]; - const int64_t rs = dst->ne[0]; + const int64_t nr = ggml_nrows(src); - while (cdata < data_end) { - const void * srow = (const void *)cdata; - int j = 0; - for (int64_t i = 0; i < rs; ++i) { + for (int64_t ir = 0; ir < nr; ++ir) { + const char * srow_bytes = (const char *) src->data + ir * src->nb[1]; + float * drow = (float *) (( char *) dst->data + ir * dst->nb[1]); + + for (int64_t ow = 0; ow < OW; ++ow) { + float res = 0; switch (op) { - case GGML_OP_POOL_AVG: drow[i] = 0; break; - case GGML_OP_POOL_MAX: drow[i] = -FLT_MAX; break; + case GGML_OP_POOL_AVG: res = 0.0f; break; + case GGML_OP_POOL_MAX: res = -FLT_MAX; break; case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); } + + int count = 0; + const int base = (int) ow * s - p; + for (int ki = 0; ki < k; ++ki) { - const float srow_j = (src->type == GGML_TYPE_F32) ? 
((const float*)srow)[j] : GGML_CPU_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]); - switch (op) { - case GGML_OP_POOL_AVG: drow[i] += srow_j; break; - case GGML_OP_POOL_MAX: if (srow_j > drow[i]) drow[i] = srow_j; break; - case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); + const int j = base + ki; + if (j < 0 || j >= (int) IW) { + continue; } - ++j; + + float v; + if (src->type == GGML_TYPE_F32) { + v = ((const float *) srow_bytes)[j]; + } else { + v = GGML_CPU_FP16_TO_FP32(((const ggml_fp16_t *) srow_bytes)[j]); + } + + switch (op) { + case GGML_OP_POOL_AVG: res += v; break; + case GGML_OP_POOL_MAX: res = std::max(v, res); break; + case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); + } + + ++count; } + switch (op) { - case GGML_OP_POOL_AVG: drow[i] /= k; break; - case GGML_OP_POOL_MAX: break; + case GGML_OP_POOL_AVG: res = (count > 0) ? (res / count) : 0.0f; break; + case GGML_OP_POOL_MAX: break; case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); } - } - cdata += src->nb[1]; - drow += rs; + drow[ow] = res; + } } } @@ -7173,10 +7190,8 @@ void ggml_compute_forward_pool_1d( const int k0 = opts[1]; const int s0 = opts[2]; const int p0 = opts[3]; - GGML_ASSERT(p0 == 0); // padding not supported - GGML_ASSERT(k0 == s0); // only s = k supported - ggml_compute_forward_pool_1d_sk_p0(params, op, k0, dst); + ggml_compute_forward_pool_1d_ksp(params, op, k0, s0, p0, dst); } // ggml_compute_forward_pool_2d @@ -7194,6 +7209,7 @@ void ggml_compute_forward_pool_2d( } const int32_t * opts = (const int32_t *)dst->op_params; + ggml_op_pool op = static_cast(opts[0]); const int k0 = opts[1]; const int k1 = opts[2]; @@ -7217,11 +7233,13 @@ void ggml_compute_forward_pool_2d( while (cdata < data_end) { for (int oy = 0; oy < py; ++oy) { float * const drow = dplane + oy * px; + float * const out = drow; + for (int ox = 0; ox < px; ++ox) { - float * const out = drow + ox; + float res = 0; switch (op) { - case GGML_OP_POOL_AVG: *out = 0; break; - case GGML_OP_POOL_MAX: *out = -FLT_MAX; break; + case GGML_OP_POOL_AVG: res = 0; break; + case GGML_OP_POOL_MAX: res = -FLT_MAX; break; case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); } @@ -7229,24 +7247,32 @@ void ggml_compute_forward_pool_2d( const int iy = offset1 + oy * s1; for (int ky = 0; ky < k1; ++ky) { - if (iy + ky < 0 || iy + ky >= src->ne[1]) continue; + if (iy + ky < 0 || iy + ky >= src->ne[1]) { + continue; + } + const void * srow = (const void *)(cdata + src->nb[1] * (iy + ky)); for (int kx = 0; kx < k0; ++kx) { int j = ix + kx; - if (j < 0 || j >= src->ne[0]) continue; + if (j < 0 || j >= src->ne[0]) { + continue; + } + const float srow_j = (src->type == GGML_TYPE_F32) ? 
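A naive reference for the generalized 1-D pooling restored above (sketch only; pool_1d_ref is an illustrative name, and the output length here uses the standard pooling formula while the C code takes it from dst->ne[0]): each output position ow reduces the in-bounds elements of the window starting at ow*s - p, and the average divides by the count of in-bounds elements, matching the count-guarded loop.

    def pool_1d_ref(row, k, s, p, op="avg"):
        out = []
        for ow in range((len(row) + 2 * p - k) // s + 1):
            window = [row[j] for j in range(ow * s - p, ow * s - p + k) if 0 <= j < len(row)]
            if op == "max":
                out.append(max(window) if window else float("-inf"))
            else:
                out.append(sum(window) / len(window) if window else 0.0)
        return out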
((const float*)srow)[j] : GGML_CPU_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]); switch (op) { - case GGML_OP_POOL_AVG: *out += srow_j; break; - case GGML_OP_POOL_MAX: if (srow_j > *out) *out = srow_j; break; + case GGML_OP_POOL_AVG: res += srow_j; break; + case GGML_OP_POOL_MAX: res = std::max(srow_j, res); break; case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); } } } switch (op) { - case GGML_OP_POOL_AVG: *out /= ka; break; - case GGML_OP_POOL_MAX: break; + case GGML_OP_POOL_AVG: res /= ka; break; + case GGML_OP_POOL_MAX: break; case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); } + + out[ox] = res; } } @@ -8713,8 +8739,6 @@ static void ggml_compute_forward_ssm_conv_f32( const int ir1 = MIN(ir0 + dr, nr); const int ir = ir1 - ir0; - bool do_conv_debug = false; // (ith == 0 && conv_debug_count++ < 3); - for (int i3 = 0; i3 < n_s; ++i3) { for (int i2 = 0; i2 < n_t; ++i2) { // {d_conv - 1 + n_t, d_inner, n_seqs} @@ -8735,13 +8759,6 @@ static void ggml_compute_forward_ssm_conv_f32( sumf += s[i0 + i1*ncs] * c[i0 + i1*nc]; } x[i1] = sumf; - - // Debug output - if (do_conv_debug && i1 == 0 && i2 == 0 && i3 == 0) { - fprintf(stderr, "DEBUG SSM_CONV: nc=%d, nr=%d, n_t=%d, n_s=%d\n", nc, nr, n_t, n_s); - fprintf(stderr, "DEBUG SSM_CONV: s[0..3]=%f,%f,%f,%f, c[0..3]=%f,%f,%f,%f, x[0]=%f\n", - s[0], s[1], s[2], s[3], c[0], c[1], c[2], c[3], x[0]); - } } } } From 560190af9742d2dfb6158e1df2524ace0fd3e282 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Wed, 21 Jan 2026 22:12:21 +0800 Subject: [PATCH 48/58] fixed find_hparam calls. Fixed e_score_correction_bias to use bias instead of weight. Removed all ssm_conv bias terms. --- convert_hf_to_gguf.py | 37 +++++++++++----------------------- gguf-py/gguf/tensor_mapping.py | 1 + src/llama-model.cpp | 10 ++++----- src/llama-model.h | 3 --- src/models/kimi-linear.cpp | 11 ++++------ 5 files changed, 21 insertions(+), 41 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 2e80889215..ed650e1246 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5186,21 +5186,16 @@ class KimiLinearModel(TextModel): assert len(_num_kv_heads) == self.hparams["num_hidden_layers"] self.gguf_writer.add_head_count_kv(_num_kv_heads) - ssm_d_conv = self.hparams.get("ssm_d_conv") or linear_attn_config.get("short_conv_kernel_size") - if ssm_d_conv is not None: + if (ssm_d_conv := linear_attn_config.get("short_conv_kernel_size")) is not None: self.gguf_writer.add_ssm_conv_kernel(ssm_d_conv) - kda_head_dim = self.hparams.get("kda_head_dim") or linear_attn_config.get("head_dim") - if kda_head_dim is not None: + if (kda_head_dim := linear_attn_config.get("head_dim")) is not None: self.gguf_writer.add_kda_head_dim(kda_head_dim) # MLA params - use add_* methods that handle arch substitution # Support both HuggingFace naming (q_lora_rank, kv_lora_rank) and internal naming (n_lora_q, n_lora_kv) - q_lora_rank = self.hparams.get("q_lora_rank", self.hparams.get("n_lora_q")) - kv_lora_rank = self.hparams.get("kv_lora_rank", self.hparams.get("n_lora_kv")) - - if q_lora_rank is not None: + if (q_lora_rank := self.find_hparam(["q_lora_rank", "n_lora_q"], optional=False)) is not None: self.gguf_writer.add_q_lora_rank(q_lora_rank) - if kv_lora_rank is not None: + if (kv_lora_rank := self.find_hparam(["kv_lora_rank", "n_lora_kv"], optional=False)) is not None: self.gguf_writer.add_kv_lora_rank(kv_lora_rank) # MLA head dimensions @@ -5226,39 +5221,32 @@ class KimiLinearModel(TextModel): self.gguf_writer.add_value_length_mla(v_head_dim) # Rotation - use 
qk_rope_head_dim for Kimi - rope_dim = self.find_hparam(["qk_rope_head_dim", "n_rot"]) - if rope_dim is not None: + if (rope_dim := self.find_hparam(["qk_rope_head_dim", "n_rot"], optional=True)) is not None: self.gguf_writer.add_rope_dimension_count(rope_dim) else: # Default to head_dim head_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(head_dim) - n_experts = self.find_hparam(["num_experts"]) - if n_experts is not None: + if (n_experts := self.find_hparam(["num_experts"], optional=False)) is not None: self.gguf_writer.add_expert_count(n_experts) - n_experts_used = self.find_hparam(["num_experts_per_token"]) - if n_experts_used is not None: + if (n_experts_used := self.find_hparam(["num_experts_per_token"], optional=False)) is not None: self.gguf_writer.add_expert_used_count(n_experts_used) # moe_intermediate_size (1024 for Kimi) - moe_intermediate_size = self.find_hparam(["moe_intermediate_size"]) - if moe_intermediate_size is not None: + if (moe_intermediate_size := self.find_hparam(["moe_intermediate_size"], optional=False)) is not None: self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) # num_shared_experts (1 for Kimi) - num_shared_experts = self.find_hparam(["num_shared_experts"]) - if num_shared_experts is not None: + if (num_shared_experts := self.find_hparam(["num_shared_experts"], optional=False)) is not None: self.gguf_writer.add_expert_shared_count(num_shared_experts) # first_k_dense_replace (1 for Kimi - first layer uses dense MLP) - first_k_dense_replace = self.find_hparam(["first_k_dense_replace"]) - if first_k_dense_replace is not None: + if (first_k_dense_replace := self.find_hparam(["first_k_dense_replace"])) is not None: self.gguf_writer.add_leading_dense_block_count(first_k_dense_replace) # Routed scaling factor (expert_weights_scale = 2.446 for Kimi) - routed_scaling_factor = self.find_hparam(["routed_scaling_factor"]) - if routed_scaling_factor is not None: + if (routed_scaling_factor := self.find_hparam(["routed_scaling_factor"], optional=False)) is not None: self.gguf_writer.add_expert_weights_scale(routed_scaling_factor) def prepare_tensors(self): @@ -5292,8 +5280,7 @@ class KimiLinearModel(TextModel): # Kimi specific bias if name.endswith("e_score_correction_bias"): - new_name = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_EXP_PROBS_B, bid) - return [(new_name, data_torch)] + name = name.replace("e_score_correction_bias", "e_score_correction.bias") # Handle A_log: iHF stores as [1, 1, num_heads, 1] # llama.cpp expects ggml ne = [1, num_heads, 1, 1] diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index d96119ebe9..e16c06c2a3 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -438,6 +438,7 @@ class TensorNameMap: "model.layers.{bid}.block_sparse_moe.e_score_correction", # minimax-m2 "backbone.layers.{bid}.mixer.gate.e_score_correction", # nemotron-h-moe "model.layers.{bid}.mlp.e_score_correction", # exaone-moe + "model.layers.{bid}.block_sparse_moe.gate.e_score_correction", # kimi ), # Feed-forward up diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 7195346fd8..4ea23dca53 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -6825,11 +6825,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ssm_v_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_V, "weight", i), {ssm_d_conv, 1, n_embd_head_v_kda * n_head}, 0); } - // Conv bias may not exist in all models - make optional - 
layer.ssm_q_conv_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_Q, "bias", i), {n_embd_head_k_kda * n_head}, TENSOR_NOT_REQUIRED); - layer.ssm_k_conv_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_K, "bias", i), {n_embd_head_k_kda * n_head}, TENSOR_NOT_REQUIRED); - layer.ssm_v_conv_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_V, "bias", i), {n_embd_head_v_kda * n_head}, TENSOR_NOT_REQUIRED); - // q, k, v projections // Python: q_proj, k_proj, v_proj layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k_kda * n_head}, 0); @@ -6923,7 +6918,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp_actual}, TENSOR_NOT_REQUIRED); // exp_probs_b (e_score_correction_bias in vLLM) - layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "weight", i), {n_expert}, 0); + layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED); + if (!layer.ffn_exp_probs_b) { + layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "weight", i), {n_expert}, TENSOR_NOT_REQUIRED); + } } } } break; diff --git a/src/llama-model.h b/src/llama-model.h index 208766bacf..359701589c 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -415,11 +415,8 @@ struct llama_layer { // Kimi Linear KDA (using ssm_ prefix for consistency) // Note: ssm_dt_b already exists above (mamba bias), reused for Kimi dt_bias struct ggml_tensor * ssm_q_conv = nullptr; - struct ggml_tensor * ssm_q_conv_b = nullptr; struct ggml_tensor * ssm_k_conv = nullptr; - struct ggml_tensor * ssm_k_conv_b = nullptr; struct ggml_tensor * ssm_v_conv = nullptr; - struct ggml_tensor * ssm_v_conv_b = nullptr; struct ggml_tensor * ssm_f_a = nullptr; struct ggml_tensor * ssm_f_b = nullptr; struct ggml_tensor * ssm_beta = nullptr; diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index 6013cd0b77..721bef9e7f 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -5,7 +5,7 @@ // Causal Conv1d function for Q,K,V // When qkv is 0, it is Q, 1 is K, 2 is V -static ggml_tensor * causal_conv1d(ggml_cgraph * gf, ggml_context * ctx0, ggml_tensor * conv_states_all, ggml_tensor * conv_state_all, int64_t qkv, ggml_tensor * x, ggml_tensor * proj_w, ggml_tensor * conv_w, ggml_tensor * conv_b, int64_t d_conv, int64_t head_dim, int64_t n_head, int64_t n_seq_tokens, int64_t n_seqs, int64_t n_tokens, int64_t kv_head) { +static ggml_tensor * causal_conv1d(ggml_cgraph * gf, ggml_context * ctx0, ggml_tensor * conv_states_all, ggml_tensor * conv_state_all, int64_t qkv, ggml_tensor * x, ggml_tensor * proj_w, ggml_tensor * conv_w, int64_t d_conv, int64_t head_dim, int64_t n_head, int64_t n_seq_tokens, int64_t n_seqs, int64_t n_tokens, int64_t kv_head) { const int64_t d_inner = head_dim * n_head; const int64_t conv_state_size = (d_conv - 1) * d_inner; const int64_t n_embd_r_total = 3 * conv_state_size; // Q + K + V @@ -56,9 +56,6 @@ static ggml_tensor * causal_conv1d(ggml_cgraph * gf, ggml_context * ctx0, ggml_t ggml_tensor * Xcur = ggml_ssm_conv(ctx0, conv_x, conv_weight); // Reshape to 2D for bias add: {d_inner, n_tokens} Xcur = ggml_reshape_2d(ctx0, Xcur, d_inner, n_tokens); - if (conv_b) { - Xcur = ggml_add(ctx0, Xcur, conv_b); - } Xcur = ggml_silu(ctx0, Xcur); return ggml_reshape_4d(ctx0, Xcur, head_dim, n_head, n_seq_tokens, n_seqs); @@ -140,9 +137,9 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll 
ggml_tensor * conv_states_all = mctx_cur->get_r_l(il); cb(conv_states_all, "conv_states_all", il); ggml_tensor * conv_state_all = build_rs(inp_rs, conv_states_all, hparams.n_embd_r(), n_seqs); - ggml_tensor * Qcur = causal_conv1d(gf, ctx0, conv_states_all, conv_state_all, 0, cur, layer.wq, layer.ssm_q_conv, layer.ssm_q_conv_b, d_conv, head_dim, n_head, n_seq_tokens, n_seqs, n_tokens, kv_head); - ggml_tensor * Kcur = causal_conv1d(gf, ctx0, conv_states_all, conv_state_all, 1, cur, layer.wk, layer.ssm_k_conv, layer.ssm_k_conv_b, d_conv, head_dim, n_head, n_seq_tokens, n_seqs, n_tokens, kv_head); - ggml_tensor * Vcur = causal_conv1d(gf, ctx0, conv_states_all, conv_state_all, 2, cur, layer.wv, layer.ssm_v_conv, layer.ssm_v_conv_b, d_conv, head_dim, n_head, n_seq_tokens, n_seqs, n_tokens, kv_head); + ggml_tensor * Qcur = causal_conv1d(gf, ctx0, conv_states_all, conv_state_all, 0, cur, layer.wq, layer.ssm_q_conv, d_conv, head_dim, n_head, n_seq_tokens, n_seqs, n_tokens, kv_head); + ggml_tensor * Kcur = causal_conv1d(gf, ctx0, conv_states_all, conv_state_all, 1, cur, layer.wk, layer.ssm_k_conv, d_conv, head_dim, n_head, n_seq_tokens, n_seqs, n_tokens, kv_head); + ggml_tensor * Vcur = causal_conv1d(gf, ctx0, conv_states_all, conv_state_all, 2, cur, layer.wv, layer.ssm_v_conv, d_conv, head_dim, n_head, n_seq_tokens, n_seqs, n_tokens, kv_head); // g1 = -exp(A_log) * softplus(f_b(f_a(x)) + dt_bias) ggml_tensor * f_a = ggml_mul_mat(ctx0, layer.ssm_f_a, cur); From ae8d710c39886e12422fd564ecd0a278057cbdcd Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Thu, 22 Jan 2026 07:06:17 +0800 Subject: [PATCH 49/58] remove DT_B from constants.py. remove one comment line in llama-model.cpp --- gguf-py/gguf/constants.py | 1 - src/llama-model.cpp | 1 - 2 files changed, 2 deletions(-) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 73e7bae6e1..a51b3d87d1 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -559,7 +559,6 @@ class MODEL_TENSOR(IntEnum): SSM_F_A = auto() # Kimi Linear SSM_F_B = auto() # Kimi Linear SSM_BETA = auto() # Kimi Linear - SSM_DT_B = auto() # Kimi Linear SSM_G_A = auto() # Kimi Linear SSM_G_B = auto() # Kimi Linear TIME_MIX_W0 = auto() diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 4ea23dca53..edf7108fd7 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -6917,7 +6917,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp_actual, n_embd}, TENSOR_NOT_REQUIRED); layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp_actual}, TENSOR_NOT_REQUIRED); - // exp_probs_b (e_score_correction_bias in vLLM) layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED); if (!layer.ffn_exp_probs_b) { layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "weight", i), {n_expert}, TENSOR_NOT_REQUIRED); From f1525b36959e24776c57031f00fce0212cc3eff8 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Tue, 27 Jan 2026 11:25:13 +0800 Subject: [PATCH 50/58] new class llm_graph_input_mem_hybrid_k to get around the new MLA change. switch the concat order of ggml_concat calls in kimi-linear.cpp to accommodate MLA changes. 
Removed support for exp_probs_b.weight --- src/llama-graph.cpp | 52 ++++++++++++++++++++++++++++++++++++++ src/llama-graph.h | 29 +++++++++++++++++++++ src/llama-model.cpp | 30 ++++++++++------------ src/models/kimi-linear.cpp | 12 ++++----- 4 files changed, 100 insertions(+), 23 deletions(-) diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 06d0d4c558..1aebc012a1 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -533,6 +533,47 @@ bool llm_graph_input_mem_hybrid::can_reuse(const llm_graph_params & params) { return res; } +void llm_graph_input_mem_hybrid_k::set_input(const llama_ubatch * ubatch) { + mctx->get_attn()->set_input_k_idxs(inp_attn->self_k_idxs, ubatch); + + mctx->get_attn()->set_input_kq_mask(inp_attn->self_kq_mask, ubatch, cparams.causal_attn); + + const int64_t n_rs = mctx->get_recr()->get_n_rs(); + + if (inp_rs->s_copy) { + GGML_ASSERT(ggml_backend_buffer_is_host(inp_rs->s_copy->buffer)); + int32_t * data = (int32_t *) inp_rs->s_copy->data; + + // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n + for (uint32_t i = 0; i < n_rs; ++i) { + data[i] = mctx->get_recr()->s_copy(i); + } + } +} + +bool llm_graph_input_mem_hybrid_k::can_reuse(const llm_graph_params & params) { + const auto * mctx = static_cast(params.mctx); + + this->mctx = mctx; + + bool res = true; + + res &= inp_attn->self_k_idxs->ne[0] == params.ubatch.n_tokens; + + res &= inp_attn->self_kq_mask->ne[0] == mctx->get_attn()->get_n_kv(); + res &= inp_attn->self_kq_mask->ne[1] == params.ubatch.n_tokens; + + res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs(); + + res &= inp_rs->s_copy_main->ne[0] == params.ubatch.n_seqs; + res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs; + + res &= inp_rs->head == mctx->get_recr()->get_head(); + res &= inp_rs->rs_z == mctx->get_recr()->get_rs_z(); + + return res; +} + void llm_graph_input_mem_hybrid_iswa::set_input(const llama_ubatch * ubatch) { const auto * attn_ctx = mctx->get_attn(); @@ -2272,6 +2313,17 @@ llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const { return (llm_graph_input_mem_hybrid *) res->add_input(std::move(inp)); } +llm_graph_input_mem_hybrid_k * llm_graph_context::build_inp_mem_hybrid_k() const { + const auto * mctx_cur = static_cast(mctx); + + auto inp_rs = build_rs_inp_impl (ctx0, ubatch, mctx_cur->get_recr()); + auto inp_attn = build_attn_inp_k_impl(ctx0, ubatch, hparams, cparams, mctx_cur->get_attn()); + + auto inp = std::make_unique(cparams, std::move(inp_attn), std::move(inp_rs), mctx_cur); + + return (llm_graph_input_mem_hybrid_k *) res->add_input(std::move(inp)); +} + llm_graph_input_mem_hybrid_iswa * llm_graph_context::build_inp_mem_hybrid_iswa() const { const auto * mctx_cur = static_cast(mctx); diff --git a/src/llama-graph.h b/src/llama-graph.h index 4090d8116c..1d69ff1a6f 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -433,6 +433,34 @@ public: const llama_memory_hybrid_context * mctx; }; +class llm_graph_input_mem_hybrid_k : public llm_graph_input_i { +public: + llm_graph_input_mem_hybrid_k( + const llama_cparams & cparams, + std::unique_ptr inp_attn, + std::unique_ptr inp_rs, + const llama_memory_hybrid_context * mctx) : + inp_attn(std::move(inp_attn)), + inp_rs(std::move(inp_rs)), + cparams(cparams), + mctx(mctx) { } + virtual ~llm_graph_input_mem_hybrid_k() = default; + + void set_input(const llama_ubatch * ubatch) override; + + bool can_reuse(const llm_graph_params & params) override; + + std::unique_ptr 
inp_attn; + std::unique_ptr inp_rs; + + llm_graph_input_attn_k * get_attn() const { return inp_attn.get(); } + llm_graph_input_rs * get_recr() const { return inp_rs.get(); } + + const llama_cparams cparams; + + const llama_memory_hybrid_context * mctx; +}; + class llm_graph_input_mem_hybrid_iswa : public llm_graph_input_i { public: llm_graph_input_mem_hybrid_iswa( @@ -960,6 +988,7 @@ struct llm_graph_context { // llm_graph_input_mem_hybrid * build_inp_mem_hybrid() const; + llm_graph_input_mem_hybrid_k * build_inp_mem_hybrid_k() const; llm_graph_input_mem_hybrid_iswa * build_inp_mem_hybrid_iswa() const; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 67b0314de9..84ac4d3a9e 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2454,12 +2454,12 @@ void llama_model::load_hparams(llama_model_loader & ml) { case LLM_ARCH_KIMI_LINEAR: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla, false); - ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false); - ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv, false); - ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false); - ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv, false); - ml.get_key(LLM_KV_KDA_HEAD_DIM, hparams.kda_head_dim, false); + ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla_impl); + ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla_impl); + ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv); + ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot); + ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv); + ml.get_key(LLM_KV_KDA_HEAD_DIM, hparams.kda_head_dim); // MLA qk_rope_head_dim (for reference) // qk_rope_head_dim = 64, qk_nope_head_dim = 128, qk_head_dim = 192 @@ -2471,11 +2471,10 @@ void llama_model::load_hparams(llama_model_loader & ml) { } // MoE parameters - Kimi uses moe_intermediate_size = 1024 - ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); - ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false); - ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared, false); - ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false); - ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false); + ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); + ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared); + ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead); + ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale); ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func); switch (hparams.n_layer) { @@ -6863,8 +6862,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // MLA Layer - use MLA-specific head dimensions const int64_t q_lora_rank = hparams.n_lora_q; const int64_t kv_lora_rank = hparams.n_lora_kv; - const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla; - const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla; + const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla(); + const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla(); layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, TENSOR_NOT_REQUIRED); layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0); @@ -6917,10 +6916,7 @@ bool llama_model::load_tensors(llama_model_loader & 
ml) { layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp_actual, n_embd}, TENSOR_NOT_REQUIRED); layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp_actual}, TENSOR_NOT_REQUIRED); - layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED); - if (!layer.ffn_exp_probs_b) { - layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "weight", i), {n_expert}, TENSOR_NOT_REQUIRED); - } + layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0); } } } break; diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index 721bef9e7f..3ea404dd0b 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -72,7 +72,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // Note: Kimi MLA does NOT use RoPE (rotary_emb=None in vLLM) // So we don't need inp_pos - auto * inp = build_inp_mem_hybrid(); + auto * inp = build_inp_mem_hybrid_k(); auto * inp_rs = inp->get_recr(); auto * inp_attn = inp->get_attn(); @@ -104,8 +104,8 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); // MLA params - const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla; - const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla; + const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla(); + const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla(); const int64_t kv_lora_rank = hparams.n_lora_kv; // qk_rope_head_dim = 64 (from Kimi config) which is hparams.n_rot // Confirmed from tensor shape: wkv_a_mqa [2304, 576] = [n_embd, kv_lora_rank + qk_rope_head_dim] @@ -258,14 +258,14 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // {n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens} // note: rope must go first for in-place context shifting in build_rope_shift() - Qcur = ggml_concat(ctx0, q_pe, q_nope_absorbed, 0); + Qcur = ggml_concat(ctx0, q_nope_absorbed, q_pe, 0); cb(Qcur, "Qcur", il); kv_cmpr = ggml_reshape_3d(ctx0, kv_cmpr, kv_lora_rank, 1, n_tokens); cb(kv_cmpr, "kv_cmpr_reshape", il); // {n_embd_head_qk_rope + kv_lora_rank, 1, n_tokens} - ggml_tensor * Kcur = ggml_concat(ctx0, k_pe, kv_cmpr, 0); + ggml_tensor * Kcur = ggml_concat(ctx0, kv_cmpr, k_pe, 0); cb(Kcur, "Kcur", il); // {kv_lora_rank, 1, n_tokens} @@ -299,7 +299,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // Need to broadcast k_pe from [qk_rope, 1, n_tokens] to [qk_rope, n_head, n_tokens] ggml_tensor * k_pe_target = ggml_new_tensor_3d(ctx0, k_pe->type, n_embd_head_qk_rope, n_head, n_tokens); ggml_tensor * k_pe_repeated = ggml_repeat(ctx0, k_pe, k_pe_target); - ggml_tensor * Kcur = ggml_concat(ctx0, k_nope, k_pe_repeated, 0); + ggml_tensor * Kcur = ggml_concat(ctx0, k_pe_repeated, k_nope, 0); cb(Kcur, "mla_K", il); // Direct softmax attention (with MHA KV cache) From 0de4680bdfdb3603b78b23d69b789d0a14547155 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Tue, 27 Jan 2026 13:19:06 +0800 Subject: [PATCH 51/58] remove ssm_o_norm_b --- src/llama-model.cpp | 1 - src/llama-model.h | 1 - 2 files changed, 2 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 84ac4d3a9e..50900feb2c 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -6853,7 +6853,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // o_norm 
(reusing SSM_NORM) layer.ssm_o_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {n_embd_head_k_kda}, 0); // FusedRMSNormGated - layer.ssm_o_norm_b = create_tensor(tn(LLM_TENSOR_SSM_NORM, "bias", i), {n_embd_head_k_kda}, TENSOR_NOT_REQUIRED); // o_proj layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_v_kda * n_head, n_embd}, 0); diff --git a/src/llama-model.h b/src/llama-model.h index 359701589c..5b408bcea2 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -423,7 +423,6 @@ struct llama_layer { struct ggml_tensor * ssm_g_a = nullptr; struct ggml_tensor * ssm_g_b = nullptr; struct ggml_tensor * ssm_o_norm = nullptr; - struct ggml_tensor * ssm_o_norm_b = nullptr; struct llama_layer_posnet posnet; From 0444a4faa0660636a3246982b951104bfdb1df1e Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Tue, 27 Jan 2026 13:19:55 +0800 Subject: [PATCH 52/58] remove ssm_o_norm_b --- src/models/kimi-linear.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index 3ea404dd0b..40007a6fa3 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -197,7 +197,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // Note: Kimi model uses sigmoid gating, not SiLU (despite FusedRMSNormGated default being swish) // Formula: output = RMSNorm(x) * sigmoid(g) ggml_tensor * attn_out_final = ggml_reshape_3d(ctx0, output, head_dim, n_head, n_seq_tokens * n_seqs); - ggml_tensor * normed = build_norm(attn_out_final, layer.ssm_o_norm, layer.ssm_o_norm_b, LLM_NORM_RMS, il); + ggml_tensor * normed = build_norm(attn_out_final, layer.ssm_o_norm, nullptr, LLM_NORM_RMS, il); cb(normed, "kda_normed", il); ggml_tensor * gate = ggml_sigmoid(ctx0, g2); ggml_tensor * gated = ggml_mul(ctx0, normed, gate); From a6b2c450c8ea9bdf9995a3e5443512dd5d3c96c4 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Thu, 29 Jan 2026 08:35:35 +0800 Subject: [PATCH 53/58] changed hparams.kda_head_dim to hparams.n_embd_head_kda. added TODO comment for class llama_graph_mem_hybrid_k --- src/llama-graph.cpp | 3 +++ src/llama-hparams.cpp | 8 ++++---- src/llama-hparams.h | 2 +- src/llama-model.cpp | 6 +++--- src/models/kimi-linear.cpp | 2 +- 5 files changed, 12 insertions(+), 9 deletions(-) diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 1aebc012a1..ac143bf031 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -533,6 +533,9 @@ bool llm_graph_input_mem_hybrid::can_reuse(const llm_graph_params & params) { return res; } +// TODO: Hybrid input classes are a bit redundant. +// Instead of creating a hybrid input, the graph can simply create 2 separate inputs. +// Refactoring is required in the future. void llm_graph_input_mem_hybrid_k::set_input(const llama_ubatch * ubatch) { mctx->get_attn()->set_input_k_idxs(inp_attn->self_k_idxs, ubatch); diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index 873c65cea8..756dda1a7a 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -139,10 +139,10 @@ uint32_t llama_hparams::n_embd_r() const { return n_embd * (n_shortconv_l_cache - 1); } - if (kda_head_dim != 0) { + if (n_embd_head_kda != 0) { // for Kimi KDA layers // Conv state for Q, K, V: 3 * (d_conv - 1) * n_head * head_dim - const uint32_t d_inner = n_head() * kda_head_dim; // 32 * 128 = 4096 + const uint32_t d_inner = n_head() * n_embd_head_kda; // 32 * 128 = 4096 return 3 * (ssm_d_conv > 0 ? 
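// Worked size check (editorial sketch; assumes the config implied by the comments here:
// n_head = 32, n_embd_head_kda = 128, and a short-conv kernel of 4):
//   d_inner            = 32 * 128           = 4096
//   n_embd_r (per seq) = 3 * (4 - 1) * 4096 = 36864   conv-state elements (Q, K and V)
//   n_embd_s (per seq) = 128 * 128 * 32     = 524288  recurrent delta-rule state elements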
ssm_d_conv - 1 : 3) * d_inner; } @@ -158,11 +158,11 @@ uint32_t llama_hparams::n_embd_s() const { return n_embd * wkv_head_size; } - if (kda_head_dim != 0) { + if (n_embd_head_kda != 0) { // for Kimi KDA layers // Full recurrent state: head_dim * head_dim * n_head // h tensor shape for delta attention: [head_dim, head_dim, n_head] - return kda_head_dim * kda_head_dim * n_head(); // 128 * 128 * 32 = 524288 + return n_embd_head_kda * n_embd_head_kda * n_head(); // 128 * 128 * 32 = 524288 } // corresponds to Mamba's ssm_states size diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 943161747c..a736ccc3d0 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -138,7 +138,7 @@ struct llama_hparams { uint32_t ssm_n_group = 0; // for Kimi Linear KDA - uint32_t kda_head_dim = 0; + uint32_t n_embd_head_kda = 0; // for hybrid state space models std::array recurrent_layer_arr; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 50900feb2c..40f3ff6e49 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2459,7 +2459,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv); ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot); ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv); - ml.get_key(LLM_KV_KDA_HEAD_DIM, hparams.kda_head_dim); + ml.get_key(LLM_KV_KDA_HEAD_DIM, hparams.n_embd_head_kda); // MLA qk_rope_head_dim (for reference) // qk_rope_head_dim = 64, qk_nope_head_dim = 128, qk_head_dim = 192 @@ -6801,8 +6801,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // Assuming KDA layer if KDA tensors are present // KDA uses head_dim = 128 (from linear_attn_config.head_dim) - const int64_t n_embd_head_k_kda = hparams.kda_head_dim; - const int64_t n_embd_head_v_kda = hparams.kda_head_dim; + const int64_t n_embd_head_k_kda = hparams.n_embd_head_kda; + const int64_t n_embd_head_v_kda = hparams.n_embd_head_kda; const int64_t ssm_d_conv = hparams.ssm_d_conv; // Try loading KDA specific tensors (using SSM_ prefix) diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index 40007a6fa3..5f497722d0 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -92,7 +92,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // Kimi dimension constants const int64_t n_head = hparams.n_head(); - const int64_t head_dim = hparams.kda_head_dim; + const int64_t head_dim = hparams.n_embd_head_kda; const int64_t d_conv = hparams.ssm_d_conv; const int64_t d_inner = n_head * head_dim; // 32 * 128 = 4096 const int64_t n_seqs = ubatch.n_seqs; From 6216273edefbdd393ad8f986d4d86613e9886c7f Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Thu, 29 Jan 2026 08:46:33 +0800 Subject: [PATCH 54/58] removed all ggml_cont b4 ggml_reshape_4d --- src/models/kimi-linear.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index 5f497722d0..a7e5482008 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -540,9 +540,9 @@ std::pair llm_build_kimi_linear::build_kda_chunkin // decay_mask [S_k,BT_j,BT_i,CHB] *Note* second and third chunk_sizes are switched decay_mask = ggml_cont_4d(ctx0, ggml_permute(ctx0, decay_mask, 2, 1, 0, 3), S_k, chunk_size, chunk_size, CHB); - ggml_tensor * k_i = ggml_cont(ctx0, ggml_reshape_4d(ctx0, k, S_k, chunk_size, 1, CHB)); - ggml_tensor * k_j = ggml_cont(ctx0, ggml_reshape_4d(ctx0, k, S_k, 1, chunk_size, CHB)); - ggml_tensor * q_i = 
ggml_cont(ctx0, ggml_reshape_4d(ctx0, q, S_k, chunk_size, 1, CHB)); + ggml_tensor * k_i = ggml_reshape_4d(ctx0, k, S_k, chunk_size, 1, CHB); + ggml_tensor * k_j = ggml_reshape_4d(ctx0, k, S_k, 1, chunk_size, CHB); + ggml_tensor * q_i = ggml_reshape_4d(ctx0, q, S_k, chunk_size, 1, CHB); ggml_tensor * decay_k_i = ggml_mul(ctx0, decay_mask, k_i); ggml_tensor * decay_q_i = ggml_mul(ctx0, decay_mask, q_i); From 005c34067ae7437e9a8e536281afb42206c9795f Mon Sep 17 00:00:00 2001 From: "Piotr Wilkin (ilintar)" Date: Fri, 30 Jan 2026 13:38:45 +0100 Subject: [PATCH 55/58] Whitespace --- src/llama-graph.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index ac143bf031..bd16ffd113 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -533,7 +533,7 @@ bool llm_graph_input_mem_hybrid::can_reuse(const llm_graph_params & params) { return res; } -// TODO: Hybrid input classes are a bit redundant. +// TODO: Hybrid input classes are a bit redundant. // Instead of creating a hybrid input, the graph can simply create 2 separate inputs. // Refactoring is required in the future. void llm_graph_input_mem_hybrid_k::set_input(const llama_ubatch * ubatch) { From aaf05bddc7522cdb08eae655117cb78ebf63d35a Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Sat, 31 Jan 2026 08:46:19 +0800 Subject: [PATCH 56/58] replaced all hparams.get with find_hparams --- convert_hf_to_gguf.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index a0681d4889..a1b4401198 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5096,7 +5096,7 @@ class KimiLinearModel(TextModel): # KDA & MLA params # Get ssm_d_conv from linear_attn_config.short_conv_kernel_size or ssm_d_conv - linear_attn_config = self.hparams.get("linear_attn_config", {}) + linear_attn_config = self.find_hparam(["linear_attn_config"], optional=False) # n_head == 0 for KDA layers, n_head > 0 for MLA layers # full_attention_layers list will be used to distingush layer type _num_kv_heads = list() @@ -5123,23 +5123,24 @@ class KimiLinearModel(TextModel): # MLA head dimensions # Support HuggingFace naming: qk_nope_head_dim, qk_rope_head_dim, v_head_dim - qk_nope_head_dim = self.hparams.get("qk_nope_head_dim") - qk_rope_head_dim = self.hparams.get("qk_rope_head_dim") - v_head_dim = self.hparams.get("v_head_dim") + qk_nope_head_dim = self.find_hparam(["qk_nope_head_dim"], optional=False) + qk_rope_head_dim = self.find_hparam(["qk_rope_head_dim"], optional=False) + v_head_dim = self.find_hparam(["v_head_dim"], optional=False) + kv_lora_rank = self.find_hparam(["kv_lora_rank"], optional=False) # To enable MLA KV cache, MLA needs to be converted into MQA with larger heads, then decompresses to MHA - self.gguf_writer.add_key_length(self.hparams["kv_lora_rank"] + self.hparams["qk_rope_head_dim"]) - self.gguf_writer.add_value_length(self.hparams["kv_lora_rank"]) + self.gguf_writer.add_key_length(kv_lora_rank + qk_rope_head_dim) + self.gguf_writer.add_value_length(kv_lora_rank) # Calculate n_embd_head_k_mla = qk_nope_head_dim + qk_rope_head_dim - if "n_embd_head_k_mla" in self.hparams: - self.gguf_writer.add_key_length_mla(self.hparams["n_embd_head_k_mla"]) + if (n_embd_head_k_mla := self.find_hparam(["n_embd_head_k_mla"], optional=True)) is not None: + self.gguf_writer.add_key_length_mla(n_embd_head_k_mla) elif qk_nope_head_dim is not None and qk_rope_head_dim is not None: n_embd_head_k_mla = qk_nope_head_dim + 
qk_rope_head_dim self.gguf_writer.add_key_length_mla(n_embd_head_k_mla) # n_embd_head_v_mla = v_head_dim - if "n_embd_head_v_mla" in self.hparams: - self.gguf_writer.add_value_length_mla(self.hparams["n_embd_head_v_mla"]) + if (n_embd_head_v_mla := self.find_hparam(["n_embd_head_v_mla"], optional=True)) is not None: + self.gguf_writer.add_value_length_mla(n_embd_head_v_mla) elif v_head_dim is not None: self.gguf_writer.add_value_length_mla(v_head_dim) @@ -5216,7 +5217,7 @@ class KimiLinearModel(TextModel): # process the experts separately if name.find("block_sparse_moe.experts") != -1: - n_experts = self.hparams.get("num_local_experts", self.hparams.get("num_experts")) + n_experts = self.find_hparam(["num_experts"], optional=False) assert bid is not None if self._experts is None: From 2c8cd844d0c4d8a1a64403dab4f0017acd23ba06 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Sun, 1 Feb 2026 08:42:01 +0800 Subject: [PATCH 57/58] added new names for n_experts, n_experts_used and score_func in TextModel and removed their code in KimiLinear in convert_hf_to_gguf.py. Removed unnecessary ggml_cont and GGML_ASSERT in kimi-linear.cpp --- convert_hf_to_gguf.py | 34 +++++++++------------------------- src/models/kimi-linear.cpp | 13 +------------ 2 files changed, 10 insertions(+), 37 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index a1b4401198..08e4a12e45 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -907,10 +907,10 @@ class TextModel(ModelBase): if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None: self.gguf_writer.add_layer_norm_eps(f_norm_eps) logger.info(f"gguf: layer norm epsilon = {f_norm_eps}") - if (n_experts := self.hparams.get("num_local_experts")) is not None: + if (n_experts := self.find_hparam(["num_local_experts", "num_experts"], optional=True)) is not None: self.gguf_writer.add_expert_count(n_experts) logger.info(f"gguf: expert count = {n_experts}") - if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None: + if (n_experts_used := self.find_hparam(["num_experts_per_tok", "num_experts_per_token"], optional=True)) is not None: self.gguf_writer.add_expert_used_count(n_experts_used) logger.info(f"gguf: experts used count = {n_experts_used}") if (n_expert_groups := self.hparams.get("n_group")) is not None: @@ -920,7 +920,7 @@ class TextModel(ModelBase): self.gguf_writer.add_expert_group_used_count(n_group_used) logger.info(f"gguf: expert groups used count = {n_group_used}") - if (score_func := self.find_hparam(["score_function", "scoring_func", "score_func"], optional=True)) is not None: + if (score_func := self.find_hparam(["score_function", "scoring_func", "score_func", "moe_router_activation_func"], optional=True)) is not None: if score_func == "sigmoid": self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) elif score_func == "softmax": @@ -5086,14 +5086,6 @@ class KimiLinearModel(TextModel): super().set_gguf_parameters() self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) - if (score_func := self.find_hparam(["moe_router_activation_func"], optional=True)) is not None: - if score_func == "sigmoid": - self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) - elif score_func == "softmax": - self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX) - else: - raise ValueError(f"Unsupported expert score gating function value: {score_func}") - # KDA & MLA params # Get ssm_d_conv from 
linear_attn_config.short_conv_kernel_size or ssm_d_conv linear_attn_config = self.find_hparam(["linear_attn_config"], optional=False) @@ -5152,11 +5144,6 @@ class KimiLinearModel(TextModel): head_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(head_dim) - if (n_experts := self.find_hparam(["num_experts"], optional=False)) is not None: - self.gguf_writer.add_expert_count(n_experts) - if (n_experts_used := self.find_hparam(["num_experts_per_token"], optional=False)) is not None: - self.gguf_writer.add_expert_used_count(n_experts_used) - # moe_intermediate_size (1024 for Kimi) if (moe_intermediate_size := self.find_hparam(["moe_intermediate_size"], optional=False)) is not None: self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) @@ -5227,7 +5214,6 @@ class KimiLinearModel(TextModel): if len(self._experts[bid]) >= n_experts * 3: # merge the experts into a single 3d tensor - tensors = [] # w1: gate, w2: down, w3: up for wid, tname in [("w1", gguf.MODEL_TENSOR.FFN_GATE_EXP), ("w2", gguf.MODEL_TENSOR.FFN_DOWN_EXP), @@ -5237,12 +5223,10 @@ class KimiLinearModel(TextModel): ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight" datas.append(self._experts[bid][ename]) del self._experts[bid][ename] - data_torch = torch.stack(datas, dim=0) new_name = self.format_tensor_name(tname, bid) - tensors.append((new_name, data_torch)) - return tensors - return [] + yield from super().modify_tensors(data_torch, new_name, bid) + return # note: MLA with the absorption optimization, needs these two split and k_b_proj transposed if name.endswith("kv_b_proj.weight"): @@ -5256,11 +5240,11 @@ class KimiLinearModel(TextModel): kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1]) k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1) k_b = k_b.transpose(1, 2) - return [(self.map_tensor_name(name_kb), k_b), (self.map_tensor_name(name_vb), v_b)] + yield from super().modify_tensors(k_b, name_kb, bid) + yield from super().modify_tensors(v_b, name_vb, bid) + return - mapped_name = self.map_tensor_name(name) - logger.info(f"Returning {mapped_name}: shape after = {tuple(data_torch.shape)}") - return [(mapped_name, data_torch)] + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("InternLM2ForCausalLM") diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index a7e5482008..83349cc9ec 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -34,7 +34,7 @@ static ggml_tensor * causal_conv1d(ggml_cgraph * gf, ggml_context * ctx0, ggml_t ggml_tensor * x_3d = ggml_reshape_3d(ctx0, x_proj, d_inner, n_seq_tokens, n_seqs); // Concat Q conv state and current input: {d_conv-1 + n_seq_tokens, d_inner, n_seqs} - ggml_tensor * conv_x = ggml_cont(ctx0, ggml_concat(ctx0, conv_state_x, ggml_transpose(ctx0, x_3d), 0)); + ggml_tensor * conv_x = ggml_concat(ctx0, conv_state_x, ggml_transpose(ctx0, x_3d), 0); // Save last (d_conv-1) columns back to Q conv state ggml_tensor * last_conv_x = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner, n_seqs, @@ -289,8 +289,6 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll ggml_row_size(kv->type, kv_per_head), ggml_row_size(kv->type, kv_per_head * n_head), ggml_row_size(kv->type, n_embd_head_qk_nope)); - k_nope = ggml_cont(ctx0, k_nope); - Vcur = ggml_cont(ctx0, Vcur); cb(Vcur, "mla_V", il); // Concatenate k_nope + k_pe (broadcast k_pe to all heads) @@ -403,11 +401,6 
@@ std::pair llm_build_kimi_linear::build_kda_chunkin ggml_tensor * identity, ggml_tensor * diag_mask, int il) { - GGML_ASSERT(ggml_is_contiguous(q)); - GGML_ASSERT(ggml_is_contiguous(k)); - GGML_ASSERT(ggml_is_contiguous(v)); - GGML_ASSERT(ggml_is_contiguous(gk)); - GGML_ASSERT(ggml_is_contiguous(beta)); GGML_ASSERT(ggml_is_contiguous(state)); const int64_t S_k = q->ne[0]; @@ -694,12 +687,8 @@ std::pair llm_build_kimi_linear::build_kda_autoreg ggml_tensor * beta, ggml_tensor * state, int il) { - GGML_ASSERT(ggml_is_contiguous(q)); - GGML_ASSERT(ggml_is_contiguous(k)); GGML_ASSERT(ggml_is_contiguous(v)); GGML_ASSERT(ggml_is_contiguous(gk)); - GGML_ASSERT(ggml_is_contiguous(beta)); - GGML_ASSERT(ggml_is_contiguous(state)); const int64_t S_k = q->ne[0]; const int64_t H_k = q->ne[1]; From 11282a0f6069e0e986d3faa84de285ccf5d73ce9 Mon Sep 17 00:00:00 2001 From: Yee Man Chan Date: Sun, 1 Feb 2026 20:12:20 +0800 Subject: [PATCH 58/58] use is_mla to switch between different mem_hybrid types --- src/models/kimi-linear.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index 83349cc9ec..1e533fa51b 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -72,9 +72,11 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // Note: Kimi MLA does NOT use RoPE (rotary_emb=None in vLLM) // So we don't need inp_pos - auto * inp = build_inp_mem_hybrid_k(); - auto * inp_rs = inp->get_recr(); - auto * inp_attn = inp->get_attn(); + auto * inp_kv = !hparams.is_mla() ? build_inp_mem_hybrid() : nullptr; + auto * inp_k = hparams.is_mla() ? build_inp_mem_hybrid_k() : nullptr; + auto * inp_rs = hparams.is_mla() ? inp_k->get_recr() : inp_kv->get_recr(); + auto * inp_attn_kv = !hparams.is_mla() ? inp_kv->get_attn() : nullptr; + auto * inp_attn_k = hparams.is_mla() ? inp_k->get_attn() : nullptr; // Output ids for selecting which tokens to output ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -272,7 +274,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll ggml_tensor * Vcur = kv_cmpr; cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, layer.wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, layer.wv_b, kq_scale_mla, il); + cur = build_attn(inp_attn_k, layer.wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, layer.wv_b, kq_scale_mla, il); cb(cur, "mla_out", il); } else { // MLA KV cache disabled. Fall back to MHA KV cache. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head_k_mla, n_head, n_tokens); @@ -302,7 +304,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll // Direct softmax attention (with MHA KV cache) // Use build_attn with inp_attn for proper mask handling - cur = build_attn(inp_attn, layer.wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale_mla, il); + cur = build_attn(inp_attn_kv, layer.wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale_mla, il); cb(cur, "mla_out", il); } } else {