removed all hard code
This commit is contained in:
parent 9f1265fec1
commit a0269af292
@@ -5021,6 +5021,13 @@ class KimiLinearModel(TextModel):
+        ssm_d_conv = self.hparams.get("ssm_d_conv") or linear_attn_config.get("short_conv_kernel_size")
+        if ssm_d_conv is not None:
+            self.gguf_writer.add_ssm_conv_kernel(ssm_d_conv)
+
+        kda_head_dim = self.hparams.get("kda_head_dim") or linear_attn_config.get("head_dim")
+        if kda_head_dim is not None:
+            self.gguf_writer.add_kda_head_dim(kda_head_dim)

         # MLA params - use add_* methods that handle arch substitution
         # Support both HuggingFace naming (q_lora_rank, kv_lora_rank) and internal naming (n_lora_q, n_lora_kv)
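The new conversion code above prefers a top-level hyperparameter and falls back to the nested linear_attn_config block. A minimal, self-contained sketch of that lookup, using a made-up dict in place of self.hparams (the values are only illustrative):

    # Hypothetical stand-in for self.hparams from a Kimi Linear HF config.
    hparams = {"linear_attn_config": {"short_conv_kernel_size": 4, "head_dim": 128}}
    linear_attn_config = hparams.get("linear_attn_config") or {}

    # Same fallback chain as above: top-level key first, then the nested block.
    ssm_d_conv = hparams.get("ssm_d_conv") or linear_attn_config.get("short_conv_kernel_size")
    kda_head_dim = hparams.get("kda_head_dim") or linear_attn_config.get("head_dim")
    print(ssm_d_conv, kda_head_dim)  # 4 128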
@@ -5035,8 +5042,9 @@ class KimiLinearModel(TextModel):
         # MLA head dimensions
         # Support HuggingFace naming: qk_nope_head_dim, qk_rope_head_dim, v_head_dim
         qk_nope_head_dim = self.hparams.get("qk_nope_head_dim")
-        qk_rope_head_dim = self.hparams.get("qk_rope_head_dim", self.hparams.get("n_rot"))
+        qk_rope_head_dim = self.hparams.get("qk_rope_head_dim")
         v_head_dim = self.hparams.get("v_head_dim")
+        self.gguf_writer.add_rope_dimension_count(self.hparams["qk_rope_head_dim"])

         # Calculate n_embd_head_k_mla = qk_nope_head_dim + qk_rope_head_dim
         if "n_embd_head_k_mla" in self.hparams:
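The n_embd_head_k_mla calculation mentioned in the comment works out as follows with the Kimi dimensions quoted elsewhere in this diff (qk_nope_head_dim = 128, qk_rope_head_dim = 64, v_head_dim = 128); a quick sketch:

    # Values taken from the comments in this diff, for illustration.
    qk_nope_head_dim = 128
    qk_rope_head_dim = 64
    v_head_dim = 128

    n_embd_head_k_mla = qk_nope_head_dim + qk_rope_head_dim  # 192
    n_embd_head_v_mla = v_head_dim                           # 128
    print(n_embd_head_k_mla, n_embd_head_v_mla)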
@@ -205,6 +205,9 @@ class Keys:
         GROUP_COUNT = "{arch}.ssm.group_count"
         DT_B_C_RMS = "{arch}.ssm.dt_b_c_rms"

+    class KDA:
+        HEAD_DIM = "{arch}.kda.head_dim"
+
     class WKV:
         HEAD_SIZE = "{arch}.wkv.head_size"
@@ -3475,6 +3478,9 @@ KEY_SSM_TIME_STEP_RANK = Keys.SSM.TIME_STEP_RANK
 KEY_SSM_GROUP_COUNT = Keys.SSM.GROUP_COUNT
 KEY_SSM_DT_B_C_RMS = Keys.SSM.DT_B_C_RMS

+# KDA
+KEY_KDA_HEAD_DIM = Keys.KDA.HEAD_DIM
+
 # tokenization
 KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL
 KEY_TOKENIZER_PRE = Keys.Tokenizer.PRE
@@ -970,6 +970,9 @@ class GGUFWriter:
     def add_ssm_dt_b_c_rms(self, value: bool) -> None:
         self.add_bool(Keys.SSM.DT_B_C_RMS.format(arch=self.arch), value)

+    def add_kda_head_dim(self, value: int) -> None:
+        self.add_uint32(Keys.KDA.HEAD_DIM.format(arch=self.arch), value)
+
     def add_tokenizer_model(self, model: str) -> None:
         self.add_string(Keys.Tokenizer.MODEL, model)
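add_kda_head_dim follows the existing writer pattern: the key template is expanded with the architecture string before the value is written. A small sketch of just that key expansion (the arch string below is a placeholder, not taken from this diff):

    # Key template matching the Keys.KDA.HEAD_DIM constant added above.
    KDA_HEAD_DIM_KEY = "{arch}.kda.head_dim"

    # Placeholder arch string, for illustration only.
    arch = "kimi-linear"
    print(KDA_HEAD_DIM_KEY.format(arch=arch))  # kimi-linear.kda.head_dim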
@@ -236,6 +236,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_SSM_GROUP_COUNT, "%s.ssm.group_count" },
     { LLM_KV_SSM_DT_B_C_RMS, "%s.ssm.dt_b_c_rms" },

+    { LLM_KV_KDA_HEAD_DIM, "%s.kda.head_dim" },
+
     { LLM_KV_WKV_HEAD_SIZE, "%s.wkv.head_size" },

     { LLM_KV_POSNET_EMBEDDING_LENGTH, "%s.posnet.embedding_length" },
@@ -240,6 +240,8 @@ enum llm_kv {
     LLM_KV_SSM_GROUP_COUNT,
     LLM_KV_SSM_DT_B_C_RMS,

+    LLM_KV_KDA_HEAD_DIM,
+
     LLM_KV_WKV_HEAD_SIZE,

     LLM_KV_TOKENIZER_MODEL,
@@ -137,7 +137,7 @@ uint32_t llama_hparams::n_embd_r() const {
         // for Kimi KDA layers
         // Conv state for Q, K, V: 3 * (d_conv - 1) * n_head * head_dim
         const uint32_t d_inner = n_head() * kda_head_dim; // 32 * 128 = 4096
-        return 3 * (kda_d_conv > 0 ? kda_d_conv - 1 : 3) * d_inner;
+        return 3 * (ssm_d_conv > 0 ? ssm_d_conv - 1 : 3) * d_inner;
     }

     // TODO: maybe support other convolution strides than 1
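With the values quoted in the comments (n_head = 32, head_dim = 128, d_conv = 4), the conv state size returned here works out as below; a quick check:

    # Values from the comments in this hunk, for illustration.
    n_head, head_dim, d_conv = 32, 128, 4
    d_inner = n_head * head_dim              # 4096
    conv_state = 3 * (d_conv - 1) * d_inner  # Q, K and V each keep d_conv - 1 columns
    print(conv_state)                        # 36864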
@@ -133,9 +133,8 @@ struct llama_hparams {
     uint32_t ssm_dt_rank = 0;
     uint32_t ssm_n_group = 0;

-    // for Kimi Delta Attention (KDA)
-    uint32_t kda_head_dim = 0; // head_dim for KDA layers (128 for Kimi)
-    uint32_t kda_d_conv = 0; // conv kernel size for KDA (4 for Kimi)
+    // for Kimi Linear KDA
+    uint32_t kda_head_dim = 0;

     // for hybrid state space models
     std::array<bool, LLAMA_MAX_LAYERS> recurrent_layer_arr;
@@ -2291,10 +2291,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false);
                 ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv, false);
                 ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);

                 // KDA (Delta Attention) parameters
-                hparams.kda_head_dim = 128; // linear_attn_config.head_dim
-                hparams.kda_d_conv = 4; // linear_attn_config.short_conv_kernel_size
+                ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv, false);
+                ml.get_key(LLM_KV_KDA_HEAD_DIM, hparams.kda_head_dim, false);

                 // MLA qk_rope_head_dim (for reference)
                 // qk_rope_head_dim = 64, qk_nope_head_dim = 128, qk_head_dim = 192
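Since the KDA parameters are now read from GGUF metadata rather than hardcoded, a converted file should carry the new keys. A rough way to list them with gguf-py, assuming it is installed; the file name is a placeholder:

    from gguf import GGUFReader

    reader = GGUFReader("kimi-linear.gguf")  # placeholder path to a converted model
    for name in reader.fields:
        if ".kda.head_dim" in name or ".ssm.conv_kernel" in name:
            print(name)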
@@ -6447,9 +6445,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     // Assuming KDA layer if KDA tensors are present

                     // KDA uses head_dim = 128 (from linear_attn_config.head_dim)
-                    const int64_t n_embd_head_k_kda = 128;
-                    const int64_t n_embd_head_v_kda = 128;
-                    const int64_t ssm_d_conv = hparams.ssm_d_conv > 0 ? hparams.ssm_d_conv : 4;
+                    const int64_t n_embd_head_k_kda = hparams.kda_head_dim;
+                    const int64_t n_embd_head_v_kda = hparams.kda_head_dim;
+                    const int64_t ssm_d_conv = hparams.ssm_d_conv;

                     // Try loading KDA specific tensors (using SSM_ prefix)
                     // Conv1d weights: try 4D first, then 3D (quantization may remove trailing 1)
@@ -6513,8 +6511,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     // MLA Layer - use MLA-specific head dimensions
                     const int64_t q_lora_rank = hparams.n_lora_q;
                     const int64_t kv_lora_rank = hparams.n_lora_kv;
-                    const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla > 0 ? hparams.n_embd_head_k_mla : 192;
-                    const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla > 0 ? hparams.n_embd_head_v_mla : 128;
+                    const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla;
+                    const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla;

                     layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, TENSOR_NOT_REQUIRED);
                     layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
@@ -6529,7 +6527,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     // Kimi: qk_rope_head_dim = 64 (actual RoPE dimension for MLA)
                     // Note: hparams.n_rot may be 72 (from conversion) but actual is 64
-                    const int64_t qk_rope_head_dim = 64; // From config: qk_rope_head_dim
+                    const int64_t qk_rope_head_dim = hparams.n_rot; // From config: qk_rope_head_dim
                     layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + qk_rope_head_dim}, 0);
                     layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_k_mla - qk_rope_head_dim + n_embd_head_v_mla)}, 0);
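The tensor shapes in this hunk can be sanity-checked from the dimensions quoted in the diff's comments (n_embd = 2304, n_head = 32, kv_lora_rank = 512, qk_rope_head_dim = 64, n_embd_head_k_mla = 192, n_embd_head_v_mla = 128); a short arithmetic sketch:

    # Dimensions quoted in the surrounding comments, for illustration.
    n_embd, n_head = 2304, 32
    kv_lora_rank, qk_rope = 512, 64
    k_mla, v_mla = 192, 128

    wkv_a_mqa_shape = (n_embd, kv_lora_rank + qk_rope)                # (2304, 576)
    wkv_b_shape = (kv_lora_rank, n_head * (k_mla - qk_rope + v_mla))  # (512, 8192)
    print(wkv_a_mqa_shape, wkv_b_shape)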
@@ -6539,7 +6537,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

                     // MoE intermediate size (different from dense FFN)
-                    const int64_t n_ff_exp = hparams.n_ff_exp > 0 ? hparams.n_ff_exp : 1024;
+                    const int64_t n_ff_exp = hparams.n_ff_exp;

                     // Kimi uses n_layer_dense_lead to determine which layers use dense FFN vs MoE
                     // first_k_dense_replace = 1 means layer 0 uses dense FFN, layers 1+ use MoE
@@ -84,7 +84,6 @@ enum llm_type {
     LLM_TYPE_35B,
     LLM_TYPE_36B,
     LLM_TYPE_40B,
-    LLM_TYPE_48B,
     LLM_TYPE_65B,
     LLM_TYPE_70B,
     LLM_TYPE_120B,
@@ -114,6 +113,7 @@ enum llm_type {
     LLM_TYPE_16B_A1B,
     LLM_TYPE_21B_A3B, // Ernie MoE small
     LLM_TYPE_30B_A3B,
+    LLM_TYPE_48B_A3B, // Kimi Linear
     LLM_TYPE_80B_A3B, // Qwen3 Next
     LLM_TYPE_100B_A6B,
     LLM_TYPE_106B_A12B, // GLM-4.5-Air
@@ -21,8 +21,8 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll

     // Kimi dimension constants
     const int64_t n_head = hparams.n_head();
-    const int64_t head_dim = hparams.kda_head_dim > 0 ? hparams.kda_head_dim : 128;
-    const int64_t d_conv = hparams.kda_d_conv > 0 ? hparams.kda_d_conv : 4;
+    const int64_t head_dim = hparams.kda_head_dim;
+    const int64_t d_conv = hparams.ssm_d_conv;
     const int64_t d_inner = n_head * head_dim; // 32 * 128 = 4096
     const int64_t n_seqs = ubatch.n_seqs;
     const int64_t n_seq_tokens = ubatch.n_seq_tokens;
@@ -33,12 +33,12 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
     GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);

     // MLA params
-    const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla > 0 ? hparams.n_embd_head_k_mla : 192;
-    const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla > 0 ? hparams.n_embd_head_v_mla : 128;
-    const int64_t kv_lora_rank = hparams.n_lora_kv > 0 ? hparams.n_lora_kv : 512;
-    // qk_rope_head_dim = 64 (from Kimi config), NOT hparams.n_rot (which is 72)
+    const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla;
+    const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla;
+    const int64_t kv_lora_rank = hparams.n_lora_kv;
+    // qk_rope_head_dim = 64 (from Kimi config) which is hparams.n_rot
     // Confirmed from tensor shape: wkv_a_mqa [2304, 576] = [n_embd, kv_lora_rank + qk_rope_head_dim]
-    const int64_t n_embd_head_qk_rope = 64; // config.qk_rope_head_dim
+    const int64_t n_embd_head_qk_rope = hparams.n_rot; // config.qk_rope_head_dim
     const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope; // 192 - 64 = 128

     // Attention scale for KDA (1/sqrt(head_dim))
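As a final check of the numbers the comments rely on: with n_embd_head_k_mla = 192 and n_embd_head_qk_rope = 64 the nope part is 128, and the KDA attention scale mentioned in the last comment is 1/sqrt(head_dim). A short sketch (variable names here are illustrative):

    import math

    n_embd_head_k_mla, n_embd_head_qk_rope = 192, 64
    n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope  # 128

    head_dim = 128                        # hparams.kda_head_dim for Kimi
    kq_scale = 1.0 / math.sqrt(head_dim)  # ~0.0884
    print(n_embd_head_qk_nope, kq_scale)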