From 9f1265fec16598cc9c24ae31ae38c3ae7aaa3bde Mon Sep 17 00:00:00 2001
From: Yee Man Chan
Date: Fri, 5 Dec 2025 19:51:02 +0800
Subject: [PATCH] remove hard-coded Kimi-Linear hyperparameters

---
 convert_hf_to_gguf.py      | 25 +++++++++++++++--
 src/llama-model.cpp        | 21 +++-----------
 src/models/kimi-linear.cpp | 57 +++++++++++++++++++++++---------------
 3 files changed, 61 insertions(+), 42 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 2808b72d76..9c36c84189 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -4990,7 +4990,9 @@ class KimiLinearModel(TextModel):
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
-
+        self.gguf_writer.add_leading_dense_block_count(self.hparams["first_k_dense_replace"])
+        self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
+
         # Use find_hparam for context length
         # Kimi uses model_max_length
         n_ctx = self.find_hparam(["max_position_embeddings", "model_max_length", "n_ctx", "n_positions"], optional=True)
@@ -5004,6 +5006,18 @@ class KimiLinearModel(TextModel):
         # KDA & MLA params
         # Get ssm_d_conv from linear_attn_config.short_conv_kernel_size or ssm_d_conv
         linear_attn_config = self.hparams.get("linear_attn_config", {})
+        # n_head_kv == 0 for KDA layers, n_head_kv > 0 for MLA layers
+        # the full_attn_layers list is used to distinguish the two layer types
+        _num_kv_heads = list()
+        _full_attn_layers = linear_attn_config["full_attn_layers"]
+        for il in range(self.hparams["num_hidden_layers"]):
+            if il + 1 in _full_attn_layers:
+                _num_kv_heads.append(linear_attn_config["num_heads"])
+            else:
+                _num_kv_heads.append(0)
+        assert len(_num_kv_heads) == self.hparams["num_hidden_layers"]
+        self.gguf_writer.add_head_count_kv(_num_kv_heads)
+
         ssm_d_conv = self.hparams.get("ssm_d_conv") or linear_attn_config.get("short_conv_kernel_size")
         if ssm_d_conv is not None:
             self.gguf_writer.add_ssm_conv_kernel(ssm_d_conv)
@@ -5046,7 +5060,14 @@ class KimiLinearModel(TextModel):
 
         head_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
         self.gguf_writer.add_rope_dimension_count(head_dim)
-        self.gguf_writer.add_rope_freq_base(self.hparams.get("rope_theta", 10000.0))
+        # Copied from Qwen2Moe as this model inherits parts of it
+        # YaRN is not enabled by default
+        # To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts
+        rope_scaling = self.hparams.get("rope_scaling") or {}
+        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
 
         # MoE params
         n_experts = self.hparams.get("num_local_experts", self.hparams.get("num_experts"))
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 763f0dfecb..0f162cdd7a 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -120,6 +120,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_16B_A1B: return "16B.A1B";
         case LLM_TYPE_21B_A3B: return "21B.A3B";
         case LLM_TYPE_30B_A3B: return "30B.A3B";
+        case LLM_TYPE_48B_A3B: return "48B.A3B";
         case LLM_TYPE_100B_A6B: return "100B.A6B";
         case LLM_TYPE_106B_A12B: return "106B.A12B";
         case LLM_TYPE_230B_A10B: return "230B.A10B";
@@ -2299,13 +2300,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 // qk_rope_head_dim = 64, qk_nope_head_dim = 128, qk_head_dim = 192
 
                 // Mark KDA layers as recurrent using n_head_kv pattern (like Jamba)
-                // MLA layers are at: 3, 7, 11, 15, 19, 23, 26 (7 MLA layers total)
-                // KDA layers are all others: 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20, 21, 22, 24, 25 (20 KDA layers)
                 // Set n_head_kv = 0 for KDA layers (recurrent), n_head_kv = n_head for MLA layers (attention)
                 for (uint32_t i = 0; i < hparams.n_layer; ++i) {
-                    bool is_mla = (i == 3 || i == 7 || i == 11 || i == 15 || i == 19 || i == 23 || i == 26);
-                    hparams.n_head_kv_arr[i] = is_mla ? hparams.n_head() : 0;
-                    hparams.recurrent_layer_arr[i] = !is_mla; // KDA layers are recurrent
+                    hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0; // KDA layers are recurrent
                 }
 
                 // MoE parameters - Kimi uses moe_intermediate_size = 1024
@@ -2316,18 +2313,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
                 ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
 
-                // Default values if not in GGUF
-                if (hparams.n_ff_exp == 0) hparams.n_ff_exp = 1024; // moe_intermediate_size
-                if (hparams.n_ff_shexp == 0) hparams.n_ff_shexp = 9216; // shared_expert_intermediate_size = intermediate_size
-                if (hparams.n_expert_shared == 0) hparams.n_expert_shared = 1; // num_shared_experts
-                if (hparams.n_layer_dense_lead == 0) hparams.n_layer_dense_lead = 1; // first_k_dense_replace
-                if (hparams.expert_weights_scale == 0.0f) hparams.expert_weights_scale = 2.446f; // routed_scaling_factor
-
-                // MoE gating function - Kimi uses sigmoid (moe_router_activation_func: sigmoid)
-                if (hparams.expert_gating_func == 0) hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
-
                 switch (hparams.n_layer) {
-                    case 27: type = LLM_TYPE_48B; break; // Kimi-Linear-48B-A3B
+                    case 27: type = LLM_TYPE_48B_A3B; break; // Kimi-Linear-48B-A3B
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
@@ -7894,6 +7881,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_ARWKV7:
         case LLM_ARCH_WAVTOKENIZER_DEC:
         case LLM_ARCH_NEMOTRON_H:
+        case LLM_ARCH_KIMI_LINEAR:
            return LLAMA_ROPE_TYPE_NONE;
 
        // use what we call a normal RoPE, operating on pairs of consecutive head values
@@ -7912,7 +7900,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_ARCTIC:
         case LLM_ARCH_DEEPSEEK:
         case LLM_ARCH_DEEPSEEK2:
-        case LLM_ARCH_KIMI_LINEAR:
         case LLM_ARCH_PLM:
         case LLM_ARCH_CHATGLM:
         case LLM_ARCH_GLM4:
diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp
index 660cd06f0e..40fbe469b3 100644
--- a/src/models/kimi-linear.cpp
+++ b/src/models/kimi-linear.cpp
@@ -339,6 +339,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
                 ggml_row_size(kv->type, n_embd_head_qk_nope));
             k_nope = ggml_cont(ctx0, k_nope);
             Vcur = ggml_cont(ctx0, Vcur);
+            cb(Vcur, "mla_V", il);
 
             // Concatenate k_nope + k_pe (broadcast k_pe to all heads)
             // K = [k_nope, k_pe] where k_nope is [qk_nope_head_dim, n_head, n_tokens]
@@ -349,12 +350,11 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
             ggml_tensor * k_pe_repeated = ggml_repeat(ctx0, k_pe, k_pe_target);
             ggml_tensor * Kcur = ggml_concat(ctx0, k_nope, k_pe_repeated, 0);
             cb(Kcur, "mla_K", il);
-            cb(Vcur, "mla_V", il);
 
             // Direct softmax attention (without KV cache)
             // Use build_attn with inp_no_cache for proper mask handling
-            cur = build_attn(inp_no_cache, layer.wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale_mla, il);
-            cb(cur, "mla_out", il);
cb(cur, "mla_out", il); + cur = build_attn(inp_no_cache, layer.wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale_mla, il); +// cb(cur, "mla_out", il); } else { // Unknown layer type - this should not happen @@ -375,18 +375,33 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll cur = build_norm(ffn_inp, layer.ffn_norm, NULL, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - // FFN / MoE - if (layer.ffn_gate_inp) { + if ((uint32_t) il < hparams.n_layer_dense_lead) { + // Dense FFN layer + cur = build_ffn(cur, + layer.ffn_up, NULL, NULL, + layer.ffn_gate, NULL, NULL, + layer.ffn_down, NULL, NULL, + NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } else { // MoE layer // Kimi uses moe_renormalize=True and routed_scaling_factor (stored as expert_weights_scale) = 2.446 - ggml_tensor * moe_out = build_moe_ffn(cur, layer.ffn_gate_inp, layer.ffn_up_exps, layer.ffn_gate_exps, layer.ffn_down_exps, - layer.ffn_exp_probs_b, hparams.n_expert, hparams.n_expert_used, - LLM_FFN_SILU, true, true, hparams.expert_weights_scale, - (llama_expert_gating_func_type) hparams.expert_gating_func, il); + ggml_tensor * moe_out = build_moe_ffn(cur, + layer.ffn_gate_inp, + layer.ffn_up_exps, + layer.ffn_gate_exps, + layer.ffn_down_exps, + layer.ffn_exp_probs_b, + hparams.n_expert, + hparams.n_expert_used, + LLM_FFN_SILU, true, + true, hparams.expert_weights_scale, + (llama_expert_gating_func_type) hparams.expert_gating_func, + il); cb(moe_out, "ffn_moe_out", il); - // Shared expert (if present) - if (layer.ffn_gate_shexp) { + // Shared expert + { ggml_tensor * ffn_shexp = build_ffn(cur, layer.ffn_up_shexp, NULL, NULL, layer.ffn_gate_shexp, NULL, NULL, @@ -396,27 +411,23 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll cur = ggml_add(ctx0, moe_out, ffn_shexp); cb(cur, "ffn_out", il); - } else { - cur = moe_out; } - } else if (layer.ffn_gate) { - // Dense FFN layer - cur = build_ffn(cur, layer.ffn_up, NULL, NULL, layer.ffn_gate, NULL, NULL, - layer.ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); - cb(cur, "ffn_out", il); - } else { - // No FFN - this should not happen in Kimi - GGML_ABORT("Kimi layer missing FFN tensors"); } - // Residual cur = ggml_add(ctx0, cur, ffn_inp); + + cur = build_cvec(cur, il); + cb(cur, "l_out", il); + inpL = cur; } + cur = inpL; // Final Norm - cur = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1); + cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res->t_embd = cur; // Output cur = ggml_mul_mat(ctx0, model.output, cur);