removed some hard coded code
parent 772ca88070
commit 9f1265fec1
@@ -4990,7 +4990,9 @@ class KimiLinearModel(TextModel):
    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])

        self.gguf_writer.add_leading_dense_block_count(self.hparams["first_k_dense_replace"])
        self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)

        # Use find_hparam for context length
        # Kimi uses model_max_length
        n_ctx = self.find_hparam(["max_position_embeddings", "model_max_length", "n_ctx", "n_positions"], optional=True)
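The context-length lookup above relies on the converter's find_hparam fallback chain. A minimal sketch of that fallback, with the key order taken from the call above (the helper name find_context_length and the sample value are illustrative only):

def find_context_length(hparams: dict):
    # Try the same keys, in the same order, as the find_hparam call above.
    for key in ("max_position_embeddings", "model_max_length", "n_ctx", "n_positions"):
        if key in hparams:
            return hparams[key]
    return None  # optional=True: a missing context length is tolerated

print(find_context_length({"model_max_length": 131072}))  # 131072 (hypothetical value)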
@@ -5004,6 +5006,18 @@ class KimiLinearModel(TextModel):
        # KDA & MLA params
        # Get ssm_d_conv from linear_attn_config.short_conv_kernel_size or ssm_d_conv
        linear_attn_config = self.hparams.get("linear_attn_config", {})
        # n_head == 0 for KDA layers, n_head > 0 for MLA layers
        # full_attention_layers list will be used to distinguish layer type
        _num_kv_heads = list()
        _full_attn_layers = linear_attn_config["full_attn_layers"]
        for il in range(self.hparams["num_hidden_layers"]):
            if il+1 in _full_attn_layers:
                _num_kv_heads.append(linear_attn_config["num_heads"])
            else:
                _num_kv_heads.append(0)
        assert(len(_num_kv_heads) == self.hparams["num_hidden_layers"])
        self.gguf_writer.add_head_count_kv(_num_kv_heads)

        ssm_d_conv = self.hparams.get("ssm_d_conv") or linear_attn_config.get("short_conv_kernel_size")
        if ssm_d_conv is not None:
            self.gguf_writer.add_ssm_conv_kernel(ssm_d_conv)
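As a standalone illustration of the head-count mapping above (hypothetical full_attn_layers and num_heads values; the 1-based indexing implied by the il+1 test is the assumption being shown), the per-layer KV head list ends up with 0 for KDA layers and num_heads for MLA layers:

linear_attn_config = {"full_attn_layers": [4, 8, 12], "num_heads": 16}  # hypothetical
num_hidden_layers = 12

num_kv_heads = []
for il in range(num_hidden_layers):
    if il + 1 in linear_attn_config["full_attn_layers"]:
        num_kv_heads.append(linear_attn_config["num_heads"])  # MLA (full attention) layer
    else:
        num_kv_heads.append(0)                                # KDA (recurrent) layer

print(num_kv_heads)  # [0, 0, 0, 16, 0, 0, 0, 16, 0, 0, 0, 16]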
@@ -5046,7 +5060,14 @@ class KimiLinearModel(TextModel):
        head_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
        self.gguf_writer.add_rope_dimension_count(head_dim)

        self.gguf_writer.add_rope_freq_base(self.hparams.get("rope_theta", 10000.0))
        # Copied from Qwen2Moe as this model inherits parts of it
        # YaRN is not enabled by default
        # To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts
        rope_scaling = self.hparams.get("rope_scaling") or {}
        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])

        # MoE params
        n_experts = self.hparams.get("num_local_experts", self.hparams.get("num_experts"))
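A small self-contained sketch of the YaRN detection above, showing which rope_scaling shapes pass the check (the example configs are hypothetical):

def yarn_params(hparams: dict):
    rope_scaling = hparams.get("rope_scaling") or {}
    if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
        return rope_scaling["factor"], rope_scaling["original_max_position_embeddings"]
    return None  # YaRN not enabled; no scaling keys are written

print(yarn_params({"rope_scaling": None}))  # None
print(yarn_params({"rope_scaling": {"type": "yarn", "factor": 4.0,
                                    "original_max_position_embeddings": 32768}}))  # (4.0, 32768)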
@@ -120,6 +120,7 @@ const char * llm_type_name(llm_type type) {
        case LLM_TYPE_16B_A1B: return "16B.A1B";
        case LLM_TYPE_21B_A3B: return "21B.A3B";
        case LLM_TYPE_30B_A3B: return "30B.A3B";
        case LLM_TYPE_48B_A3B: return "48B.A3B";
        case LLM_TYPE_100B_A6B: return "100B.A6B";
        case LLM_TYPE_106B_A12B: return "106B.A12B";
        case LLM_TYPE_230B_A10B: return "230B.A10B";
@@ -2299,13 +2300,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                // qk_rope_head_dim = 64, qk_nope_head_dim = 128, qk_head_dim = 192

                // Mark KDA layers as recurrent using n_head_kv pattern (like Jamba)
                // MLA layers are at: 3, 7, 11, 15, 19, 23, 26 (7 MLA layers total)
                // KDA layers are all others: 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20, 21, 22, 24, 25 (20 KDA layers)
                // Set n_head_kv = 0 for KDA layers (recurrent), n_head_kv = n_head for MLA layers (attention)
                for (uint32_t i = 0; i < hparams.n_layer; ++i) {
                    bool is_mla = (i == 3 || i == 7 || i == 11 || i == 15 || i == 19 || i == 23 || i == 26);
                    hparams.n_head_kv_arr[i] = is_mla ? hparams.n_head() : 0;
                    hparams.recurrent_layer_arr[i] = !is_mla; // KDA layers are recurrent
                    hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0; // KDA layers are recurrent
                }

                // MoE parameters - Kimi uses moe_intermediate_size = 1024
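A Python mirror of the new per-layer logic above, for illustration only: with the hardcoded MLA layer indices gone, a layer is marked recurrent purely because its KV head count from the GGUF is zero (the array below is a hypothetical example):

n_head_kv_arr = [0, 0, 0, 16, 0, 0, 0, 16]  # hypothetical per-layer values read from GGUF
recurrent_layer_arr = [n_kv == 0 for n_kv in n_head_kv_arr]  # KDA layers are recurrent
print(recurrent_layer_arr)  # [True, True, True, False, True, True, True, False]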
@@ -2316,18 +2313,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
                ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);

                // Default values if not in GGUF
                if (hparams.n_ff_exp == 0) hparams.n_ff_exp = 1024; // moe_intermediate_size
                if (hparams.n_ff_shexp == 0) hparams.n_ff_shexp = 9216; // shared_expert_intermediate_size = intermediate_size
                if (hparams.n_expert_shared == 0) hparams.n_expert_shared = 1; // num_shared_experts
                if (hparams.n_layer_dense_lead == 0) hparams.n_layer_dense_lead = 1; // first_k_dense_replace
                if (hparams.expert_weights_scale == 0.0f) hparams.expert_weights_scale = 2.446f; // routed_scaling_factor

                // MoE gating function - Kimi uses sigmoid (moe_router_activation_func: sigmoid)
                if (hparams.expert_gating_func == 0) hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;

                switch (hparams.n_layer) {
                    case 27: type = LLM_TYPE_48B; break; // Kimi-Linear-48B-A3B
                    case 27: type = LLM_TYPE_48B_A3B; break; // Kimi-Linear-48B-A3B
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
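Since the hardcoded fallbacks above are removed, those values now have to arrive through the GGUF metadata written by the converter. A sketch of the mapping, using the config.json names from the comments above (the dict literals are illustrative, not the actual GGUF key names):

config = {  # hypothetical excerpt of Kimi-Linear-48B-A3B config.json
    "moe_intermediate_size": 1024,
    "num_shared_experts": 1,
    "first_k_dense_replace": 1,
    "routed_scaling_factor": 2.446,
    "moe_router_activation_func": "sigmoid",
}
hparams = {
    "n_ff_exp":             config["moe_intermediate_size"],
    "n_expert_shared":      config["num_shared_experts"],
    "n_layer_dense_lead":   config["first_k_dense_replace"],
    "expert_weights_scale": config["routed_scaling_factor"],
    "expert_gating_func":   config["moe_router_activation_func"],
}
print(hparams)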
@@ -7894,6 +7881,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_ARWKV7:
        case LLM_ARCH_WAVTOKENIZER_DEC:
        case LLM_ARCH_NEMOTRON_H:
        case LLM_ARCH_KIMI_LINEAR:
            return LLAMA_ROPE_TYPE_NONE;

        // use what we call a normal RoPE, operating on pairs of consecutive head values
@@ -7912,7 +7900,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_ARCTIC:
        case LLM_ARCH_DEEPSEEK:
        case LLM_ARCH_DEEPSEEK2:
        case LLM_ARCH_KIMI_LINEAR:
        case LLM_ARCH_PLM:
        case LLM_ARCH_CHATGLM:
        case LLM_ARCH_GLM4:
@@ -339,6 +339,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
                    ggml_row_size(kv->type, n_embd_head_qk_nope));
                k_nope = ggml_cont(ctx0, k_nope);
                Vcur = ggml_cont(ctx0, Vcur);
                cb(Vcur, "mla_V", il);

                // Concatenate k_nope + k_pe (broadcast k_pe to all heads)
                // K = [k_nope, k_pe] where k_nope is [qk_nope_head_dim, n_head, n_tokens]
@@ -349,12 +350,11 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
                ggml_tensor * k_pe_repeated = ggml_repeat(ctx0, k_pe, k_pe_target);
                ggml_tensor * Kcur = ggml_concat(ctx0, k_nope, k_pe_repeated, 0);
                cb(Kcur, "mla_K", il);
                cb(Vcur, "mla_V", il);

                // Direct softmax attention (without KV cache)
                // Use build_attn with inp_no_cache for proper mask handling
                cur = build_attn(inp_no_cache, layer.wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale_mla, il);
                cb(cur, "mla_out", il);
                cur = build_attn(inp_no_cache, layer.wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale_mla, il);
                // cb(cur, "mla_out", il);

            } else {
                // Unknown layer type - this should not happen
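A shape-level NumPy sketch of the concatenation above (n_head and n_tokens are made up; the head dims come from the hparams comment earlier in the diff; note that NumPy's axis order here is the reverse of ggml's [qk_nope_head_dim, n_head, n_tokens] layout):

import numpy as np

qk_nope_head_dim, qk_rope_head_dim = 128, 64  # from the comment in load_hparams above
n_head, n_tokens = 4, 3                       # hypothetical
k_nope = np.zeros((n_tokens, n_head, qk_nope_head_dim))
k_pe   = np.zeros((n_tokens, 1,      qk_rope_head_dim))  # shared RoPE part, one copy

k_pe_repeated = np.repeat(k_pe, n_head, axis=1)       # broadcast k_pe to all heads
K = np.concatenate([k_nope, k_pe_repeated], axis=-1)  # K = [k_nope, k_pe] per head
print(K.shape)  # (3, 4, 192) -> qk_head_dim = 128 + 64 = 192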
@@ -375,18 +375,33 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
        cur = build_norm(ffn_inp, layer.ffn_norm, NULL, LLM_NORM_RMS, il);
        cb(cur, "ffn_norm", il);

        // FFN / MoE
        if (layer.ffn_gate_inp) {
        if ((uint32_t) il < hparams.n_layer_dense_lead) {
            // Dense FFN layer
            cur = build_ffn(cur,
                    layer.ffn_up, NULL, NULL,
                    layer.ffn_gate, NULL, NULL,
                    layer.ffn_down, NULL, NULL,
                    NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
            cb(cur, "ffn_out", il);
        } else {
            // MoE layer
            // Kimi uses moe_renormalize=True and routed_scaling_factor (stored as expert_weights_scale) = 2.446
            ggml_tensor * moe_out = build_moe_ffn(cur, layer.ffn_gate_inp, layer.ffn_up_exps, layer.ffn_gate_exps, layer.ffn_down_exps,
                    layer.ffn_exp_probs_b, hparams.n_expert, hparams.n_expert_used,
                    LLM_FFN_SILU, true, true, hparams.expert_weights_scale,
                    (llama_expert_gating_func_type) hparams.expert_gating_func, il);
            ggml_tensor * moe_out = build_moe_ffn(cur,
                    layer.ffn_gate_inp,
                    layer.ffn_up_exps,
                    layer.ffn_gate_exps,
                    layer.ffn_down_exps,
                    layer.ffn_exp_probs_b,
                    hparams.n_expert,
                    hparams.n_expert_used,
                    LLM_FFN_SILU, true,
                    true, hparams.expert_weights_scale,
                    (llama_expert_gating_func_type) hparams.expert_gating_func,
                    il);
            cb(moe_out, "ffn_moe_out", il);

            // Shared expert (if present)
            if (layer.ffn_gate_shexp) {
            // Shared expert
            {
                ggml_tensor * ffn_shexp = build_ffn(cur,
                        layer.ffn_up_shexp, NULL, NULL,
                        layer.ffn_gate_shexp, NULL, NULL,
@@ -396,27 +411,23 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll

                cur = ggml_add(ctx0, moe_out, ffn_shexp);
                cb(cur, "ffn_out", il);
            } else {
                cur = moe_out;
            }
        } else if (layer.ffn_gate) {
            // Dense FFN layer
            cur = build_ffn(cur, layer.ffn_up, NULL, NULL, layer.ffn_gate, NULL, NULL,
                    layer.ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
            cb(cur, "ffn_out", il);
        } else {
            // No FFN - this should not happen in Kimi
            GGML_ABORT("Kimi layer missing FFN tensors");
        }

        // Residual
        cur = ggml_add(ctx0, cur, ffn_inp);

        cur = build_cvec(cur, il);
        cb(cur, "l_out", il);

        inpL = cur;
    }
    cur = inpL;

    // Final Norm
    cur = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1);
    cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);

    cb(cur, "result_norm", -1);
    res->t_embd = cur;

    // Output
    cur = ggml_mul_mat(ctx0, model.output, cur);
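For reference, a small NumPy sketch (not ggml) of how the MoE path wired above combines its pieces: sigmoid router scores, top-k selection, renormalized weights, the routed output scaled by expert_weights_scale (2.446 for Kimi), and the shared expert added on top. Everything here, including the toy experts, is illustrative:

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def moe_layer(x, router_logits, experts, shared_expert, n_used, weights_scale):
    scores = sigmoid(router_logits)              # sigmoid gating
    top = np.argsort(scores)[-n_used:]           # pick the top-k experts
    w = scores[top] / scores[top].sum()          # renormalize the selected weights
    routed = sum(wi * experts[i](x) for wi, i in zip(w, top))
    return weights_scale * routed + shared_expert(x)  # scale routed output, add shared expert

# Tiny hypothetical example: "experts" are just scalar multipliers.
x = np.ones(4)
experts = [lambda v, k=k: k * v for k in (1.0, 2.0, 3.0, 4.0)]
out = moe_layer(x, np.array([0.1, 2.0, -1.0, 1.5]), experts, lambda v: 0.5 * v,
                n_used=2, weights_scale=2.446)
print(out)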