diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 9c36c84189..45538fcabb 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -5021,6 +5021,13 @@ class KimiLinearModel(TextModel):
         ssm_d_conv = self.hparams.get("ssm_d_conv") or linear_attn_config.get("short_conv_kernel_size")
         if ssm_d_conv is not None:
             self.gguf_writer.add_ssm_conv_kernel(ssm_d_conv)
+
+        kda_head_dim = self.hparams.get("kda_head_dim") or linear_attn_config.get("head_dim")
+
+        if kda_head_dim is not None:
+            self.gguf_writer.add_kda_head_dim(kda_head_dim)
+
+        # MLA params - use add_* methods that handle arch substitution
 
         # MLA params - use add_* methods that handle arch substitution
         # Support both HuggingFace naming (q_lora_rank, kv_lora_rank) and internal naming (n_lora_q, n_lora_kv)
@@ -5035,8 +5042,9 @@ class KimiLinearModel(TextModel):
         # MLA head dimensions
         # Support HuggingFace naming: qk_nope_head_dim, qk_rope_head_dim, v_head_dim
         qk_nope_head_dim = self.hparams.get("qk_nope_head_dim")
-        qk_rope_head_dim = self.hparams.get("qk_rope_head_dim", self.hparams.get("n_rot"))
+        qk_rope_head_dim = self.hparams.get("qk_rope_head_dim")
         v_head_dim = self.hparams.get("v_head_dim")
+        self.gguf_writer.add_rope_dimension_count(self.hparams["qk_rope_head_dim"])
 
         # Calculate n_embd_head_k_mla = qk_nope_head_dim + qk_rope_head_dim
         if "n_embd_head_k_mla" in self.hparams:
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 485c41abfb..fe9785918b 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -205,6 +205,9 @@ class Keys:
         GROUP_COUNT    = "{arch}.ssm.group_count"
         DT_B_C_RMS     = "{arch}.ssm.dt_b_c_rms"
 
+    class KDA:
+        HEAD_DIM = "{arch}.kda.head_dim"
+
     class WKV:
         HEAD_SIZE = "{arch}.wkv.head_size"
 
@@ -3475,6 +3478,9 @@ KEY_SSM_TIME_STEP_RANK = Keys.SSM.TIME_STEP_RANK
 KEY_SSM_GROUP_COUNT    = Keys.SSM.GROUP_COUNT
 KEY_SSM_DT_B_C_RMS     = Keys.SSM.DT_B_C_RMS
 
+# KDA
+KEY_KDA_HEAD_DIM = Keys.KDA.HEAD_DIM
+
 # tokenization
 KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL
 KEY_TOKENIZER_PRE   = Keys.Tokenizer.PRE
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index 9e6ff3ac77..3b2dfef479 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -970,6 +970,9 @@ class GGUFWriter:
     def add_ssm_dt_b_c_rms(self, value: bool) -> None:
         self.add_bool(Keys.SSM.DT_B_C_RMS.format(arch=self.arch), value)
 
+    def add_kda_head_dim(self, value: int) -> None:
+        self.add_uint32(Keys.KDA.HEAD_DIM.format(arch=self.arch), value)
+
     def add_tokenizer_model(self, model: str) -> None:
         self.add_string(Keys.Tokenizer.MODEL, model)
 
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index ab09bb7eb7..6aabdb7f7d 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -236,6 +236,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_SSM_GROUP_COUNT,           "%s.ssm.group_count" },
     { LLM_KV_SSM_DT_B_C_RMS,            "%s.ssm.dt_b_c_rms" },
 
+    { LLM_KV_KDA_HEAD_DIM,              "%s.kda.head_dim" },
+
     { LLM_KV_WKV_HEAD_SIZE,             "%s.wkv.head_size" },
 
     { LLM_KV_POSNET_EMBEDDING_LENGTH,   "%s.posnet.embedding_length" },
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 2b965850c5..d68af214a7 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -240,6 +240,8 @@ enum llm_kv {
     LLM_KV_SSM_GROUP_COUNT,
     LLM_KV_SSM_DT_B_C_RMS,
 
+    LLM_KV_KDA_HEAD_DIM,
+
     LLM_KV_WKV_HEAD_SIZE,
 
     LLM_KV_TOKENIZER_MODEL,
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
index 88d266b8da..75ddeeba09 100644
--- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp
@@ -137,7 +137,7 @@ uint32_t llama_hparams::n_embd_r() const {
         // for Kimi KDA layers
         // Conv state for Q, K, V: 3 * (d_conv - 1) * n_head * head_dim
         const uint32_t d_inner = n_head() * kda_head_dim; // 32 * 128 = 4096
-        return 3 * (kda_d_conv > 0 ? kda_d_conv - 1 : 3) * d_inner;
+        return 3 * (ssm_d_conv > 0 ? ssm_d_conv - 1 : 3) * d_inner;
     }
 
     // TODO: maybe support other convolution strides than 1
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index 80170650eb..c90ed12b90 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -133,9 +133,8 @@ struct llama_hparams {
     uint32_t ssm_dt_rank = 0;
    uint32_t ssm_n_group = 0;
 
-    // for Kimi Delta Attention (KDA)
-    uint32_t kda_head_dim = 0; // head_dim for KDA layers (128 for Kimi)
-    uint32_t kda_d_conv = 0;   // conv kernel size for KDA (4 for Kimi)
+    // for Kimi Linear KDA
+    uint32_t kda_head_dim = 0;
 
     // for hybrid state space models
     std::array<bool, LLAMA_MAX_LAYERS> recurrent_layer_arr;
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 0f162cdd7a..2e3cb9d78c 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -2291,10 +2291,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false);
                 ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv, false);
                 ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
-
-                // KDA (Delta Attention) parameters
-                hparams.kda_head_dim = 128; // linear_attn_config.head_dim
-                hparams.kda_d_conv = 4; // linear_attn_config.short_conv_kernel_size
+                ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv, false);
+                ml.get_key(LLM_KV_KDA_HEAD_DIM, hparams.kda_head_dim, false);
 
                 // MLA qk_rope_head_dim (for reference)
                 // qk_rope_head_dim = 64, qk_nope_head_dim = 128, qk_head_dim = 192
@@ -6447,9 +6445,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
                     // Assuming KDA layer if KDA tensors are present
                     // KDA uses head_dim = 128 (from linear_attn_config.head_dim)
-                    const int64_t n_embd_head_k_kda = 128;
-                    const int64_t n_embd_head_v_kda = 128;
-                    const int64_t ssm_d_conv = hparams.ssm_d_conv > 0 ? hparams.ssm_d_conv : 4;
+                    const int64_t n_embd_head_k_kda = hparams.kda_head_dim;
+                    const int64_t n_embd_head_v_kda = hparams.kda_head_dim;
+                    const int64_t ssm_d_conv = hparams.ssm_d_conv;
 
                     // Try loading KDA specific tensors (using SSM_ prefix)
                     // Conv1d weights: try 4D first, then 3D (quantization may remove trailing 1)
@@ -6513,8 +6511,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     // MLA Layer - use MLA-specific head dimensions
                     const int64_t q_lora_rank = hparams.n_lora_q;
                     const int64_t kv_lora_rank = hparams.n_lora_kv;
-                    const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla > 0 ? hparams.n_embd_head_k_mla : 192;
-                    const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla > 0 ? hparams.n_embd_head_v_mla : 128;
+                    const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla;
+                    const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla;
 
                     layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, TENSOR_NOT_REQUIRED);
                     layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
@@ -6529,7 +6527,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
                     // Kimi: qk_rope_head_dim = 64 (actual RoPE dimension for MLA)
                     // Note: hparams.n_rot may be 72 (from conversion) but actual is 64
-                    const int64_t qk_rope_head_dim = 64; // From config: qk_rope_head_dim
+                    const int64_t qk_rope_head_dim = hparams.n_rot; // From config: qk_rope_head_dim
                     layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + qk_rope_head_dim}, 0);
                     layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_k_mla - qk_rope_head_dim + n_embd_head_v_mla)}, 0);
 
@@ -6539,7 +6537,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
 
                     // MoE intermediate size (different from dense FFN)
-                    const int64_t n_ff_exp = hparams.n_ff_exp > 0 ? hparams.n_ff_exp : 1024;
+                    const int64_t n_ff_exp = hparams.n_ff_exp;
 
                     // Kimi uses n_layer_dense_lead to determine which layers use dense FFN vs MoE
                     // first_k_dense_replace = 1 means layer 0 uses dense FFN, layers 1+ use MoE
diff --git a/src/llama-model.h b/src/llama-model.h
index b067b686d2..7081423588 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -84,7 +84,6 @@ enum llm_type {
     LLM_TYPE_35B,
     LLM_TYPE_36B,
     LLM_TYPE_40B,
-    LLM_TYPE_48B,
     LLM_TYPE_65B,
     LLM_TYPE_70B,
     LLM_TYPE_120B,
@@ -114,6 +113,7 @@ enum llm_type {
     LLM_TYPE_16B_A1B,
     LLM_TYPE_21B_A3B, // Ernie MoE small
     LLM_TYPE_30B_A3B,
+    LLM_TYPE_48B_A3B, // Kimi Linear
     LLM_TYPE_80B_A3B, // Qwen3 Next
     LLM_TYPE_100B_A6B,
     LLM_TYPE_106B_A12B, // GLM-4.5-Air
diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp
index 40fbe469b3..d025eab5f3 100644
--- a/src/models/kimi-linear.cpp
+++ b/src/models/kimi-linear.cpp
@@ -21,8 +21,8 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
 
     // Kimi dimension constants
     const int64_t n_head = hparams.n_head();
-    const int64_t head_dim = hparams.kda_head_dim > 0 ? hparams.kda_head_dim : 128;
-    const int64_t d_conv = hparams.kda_d_conv > 0 ? hparams.kda_d_conv : 4;
+    const int64_t head_dim = hparams.kda_head_dim;
+    const int64_t d_conv = hparams.ssm_d_conv;
     const int64_t d_inner = n_head * head_dim; // 32 * 128 = 4096
     const int64_t n_seqs = ubatch.n_seqs;
     const int64_t n_seq_tokens = ubatch.n_seq_tokens;
@@ -33,12 +33,12 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
     GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
 
     // MLA params
-    const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla > 0 ? hparams.n_embd_head_k_mla : 192;
-    const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla > 0 ? hparams.n_embd_head_v_mla : 128;
-    const int64_t kv_lora_rank = hparams.n_lora_kv > 0 ? hparams.n_lora_kv : 512;
-    // qk_rope_head_dim = 64 (from Kimi config), NOT hparams.n_rot (which is 72)
+    const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla;
+    const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla;
+    const int64_t kv_lora_rank = hparams.n_lora_kv;
+    // qk_rope_head_dim = 64 (from Kimi config) which is hparams.n_rot
     // Confirmed from tensor shape: wkv_a_mqa [2304, 576] = [n_embd, kv_lora_rank + qk_rope_head_dim]
-    const int64_t n_embd_head_qk_rope = 64; // config.qk_rope_head_dim
+    const int64_t n_embd_head_qk_rope = hparams.n_rot; // config.qk_rope_head_dim
     const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope; // 192 - 64 = 128
 
     // Attention scale for KDA (1/sqrt(head_dim))