removed all hard-coded values

Yee Man Chan 2025-12-06 11:51:16 +08:00
parent 9f1265fec1
commit a0269af292
10 changed files with 42 additions and 24 deletions


@@ -5021,6 +5021,13 @@ class KimiLinearModel(TextModel):
ssm_d_conv = self.hparams.get("ssm_d_conv") or linear_attn_config.get("short_conv_kernel_size")
if ssm_d_conv is not None:
self.gguf_writer.add_ssm_conv_kernel(ssm_d_conv)
kda_head_dim = self.hparams.get("kda_head_dim") or linear_attn_config.get("head_dim")
if kda_head_dim is not None:
self.gguf_writer.add_kda_head_dim(kda_head_dim)
# MLA params - use add_* methods that handle arch substitution
# Support both HuggingFace naming (q_lora_rank, kv_lora_rank) and internal naming (n_lora_q, n_lora_kv)
@@ -5035,8 +5042,9 @@ class KimiLinearModel(TextModel):
# MLA head dimensions
# Support HuggingFace naming: qk_nope_head_dim, qk_rope_head_dim, v_head_dim
qk_nope_head_dim = self.hparams.get("qk_nope_head_dim")
qk_rope_head_dim = self.hparams.get("qk_rope_head_dim", self.hparams.get("n_rot"))
qk_rope_head_dim = self.hparams.get("qk_rope_head_dim")
v_head_dim = self.hparams.get("v_head_dim")
self.gguf_writer.add_rope_dimension_count(self.hparams["qk_rope_head_dim"])
# Calculate n_embd_head_k_mla = qk_nope_head_dim + qk_rope_head_dim
if "n_embd_head_k_mla" in self.hparams:


@@ -205,6 +205,9 @@ class Keys:
GROUP_COUNT = "{arch}.ssm.group_count"
DT_B_C_RMS = "{arch}.ssm.dt_b_c_rms"
class KDA:
HEAD_DIM = "{arch}.kda.head_dim"
class WKV:
HEAD_SIZE = "{arch}.wkv.head_size"
@@ -3475,6 +3478,9 @@ KEY_SSM_TIME_STEP_RANK = Keys.SSM.TIME_STEP_RANK
KEY_SSM_GROUP_COUNT = Keys.SSM.GROUP_COUNT
KEY_SSM_DT_B_C_RMS = Keys.SSM.DT_B_C_RMS
# KDA
KEY_KDA_HEAD_DIM = Keys.KDA.HEAD_DIM
# tokenization
KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL
KEY_TOKENIZER_PRE = Keys.Tokenizer.PRE
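A quick illustration of how the new template key expands once an architecture name is substituted (the "kimi-linear" string below is only an example; the real arch name comes from the model's registration):

    KEY_KDA_HEAD_DIM = "{arch}.kda.head_dim"
    print(KEY_KDA_HEAD_DIM.format(arch="kimi-linear"))  # kimi-linear.kda.head_dim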


@@ -970,6 +970,9 @@ class GGUFWriter:
def add_ssm_dt_b_c_rms(self, value: bool) -> None:
self.add_bool(Keys.SSM.DT_B_C_RMS.format(arch=self.arch), value)
def add_kda_head_dim(self, value: int) -> None:
self.add_uint32(Keys.KDA.HEAD_DIM.format(arch=self.arch), value)
def add_tokenizer_model(self, model: str) -> None:
self.add_string(Keys.Tokenizer.MODEL, model)
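With this commit applied, the new helper is a thin typed wrapper around add_uint32 with the arch substituted in. Roughly, the two calls below stage the same key/value pair; the output path and the "kimi-linear" arch string are illustrative, and the usual write/close calls would still follow in a real conversion:

    import gguf

    writer = gguf.GGUFWriter("kimi-linear-test.gguf", "kimi-linear")
    writer.add_kda_head_dim(128)  # helper added in this commit
    # roughly equivalent to:
    # writer.add_uint32("kimi-linear.kda.head_dim", 128)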


@@ -236,6 +236,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_SSM_GROUP_COUNT, "%s.ssm.group_count" },
{ LLM_KV_SSM_DT_B_C_RMS, "%s.ssm.dt_b_c_rms" },
{ LLM_KV_KDA_HEAD_DIM, "%s.kda.head_dim" },
{ LLM_KV_WKV_HEAD_SIZE, "%s.wkv.head_size" },
{ LLM_KV_POSNET_EMBEDDING_LENGTH, "%s.posnet.embedding_length" },
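The Python constants (str.format) and this C++ table (printf-style "%s") must yield the identical key string, otherwise the loader cannot find what the converter wrote. A one-line check with an illustrative arch name:

    arch = "kimi-linear"  # illustrative
    assert "%s.kda.head_dim" % arch == "{arch}.kda.head_dim".format(arch=arch)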


@@ -240,6 +240,8 @@ enum llm_kv {
LLM_KV_SSM_GROUP_COUNT,
LLM_KV_SSM_DT_B_C_RMS,
LLM_KV_KDA_HEAD_DIM,
LLM_KV_WKV_HEAD_SIZE,
LLM_KV_TOKENIZER_MODEL,


@@ -137,7 +137,7 @@ uint32_t llama_hparams::n_embd_r() const {
// for Kimi KDA layers
// Conv state for Q, K, V: 3 * (d_conv - 1) * n_head * head_dim
const uint32_t d_inner = n_head() * kda_head_dim; // 32 * 128 = 4096
return 3 * (kda_d_conv > 0 ? kda_d_conv - 1 : 3) * d_inner;
return 3 * (ssm_d_conv > 0 ? ssm_d_conv - 1 : 3) * d_inner;
}
// TODO: maybe support other convolution strides than 1
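Plugging in the Kimi Linear numbers quoted in the comments above (n_head = 32, head_dim = 128, conv kernel = 4), a small sketch of the per-layer conv-state size this function now computes from the loaded hparams:

    n_head, kda_head_dim, ssm_d_conv = 32, 128, 4   # values from the comments above
    d_inner = n_head * kda_head_dim                 # 32 * 128 = 4096
    n_embd_r = 3 * (ssm_d_conv - 1 if ssm_d_conv > 0 else 3) * d_inner
    print(n_embd_r)  # 36864 conv-state elements for Q, K and V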


@@ -133,9 +133,8 @@ struct llama_hparams {
uint32_t ssm_dt_rank = 0;
uint32_t ssm_n_group = 0;
// for Kimi Delta Attention (KDA)
uint32_t kda_head_dim = 0; // head_dim for KDA layers (128 for Kimi)
uint32_t kda_d_conv = 0; // conv kernel size for KDA (4 for Kimi)
// for Kimi Linear KDA
uint32_t kda_head_dim = 0;
// for hybrid state space models
std::array<bool, LLAMA_MAX_LAYERS> recurrent_layer_arr;


@@ -2291,10 +2291,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false);
ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv, false);
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
// KDA (Delta Attention) parameters
hparams.kda_head_dim = 128; // linear_attn_config.head_dim
hparams.kda_d_conv = 4; // linear_attn_config.short_conv_kernel_size
ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv, false);
ml.get_key(LLM_KV_KDA_HEAD_DIM, hparams.kda_head_dim, false);
// MLA qk_rope_head_dim (for reference)
// qk_rope_head_dim = 64, qk_nope_head_dim = 128, qk_head_dim = 192
@@ -6447,9 +6445,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
// Assuming KDA layer if KDA tensors are present
// KDA uses head_dim = 128 (from linear_attn_config.head_dim)
const int64_t n_embd_head_k_kda = 128;
const int64_t n_embd_head_v_kda = 128;
const int64_t ssm_d_conv = hparams.ssm_d_conv > 0 ? hparams.ssm_d_conv : 4;
const int64_t n_embd_head_k_kda = hparams.kda_head_dim;
const int64_t n_embd_head_v_kda = hparams.kda_head_dim;
const int64_t ssm_d_conv = hparams.ssm_d_conv;
// Try loading KDA specific tensors (using SSM_ prefix)
// Conv1d weights: try 4D first, then 3D (quantization may remove trailing 1)
@@ -6513,8 +6511,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
// MLA Layer - use MLA-specific head dimensions
const int64_t q_lora_rank = hparams.n_lora_q;
const int64_t kv_lora_rank = hparams.n_lora_kv;
const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla > 0 ? hparams.n_embd_head_k_mla : 192;
const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla > 0 ? hparams.n_embd_head_v_mla : 128;
const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla;
const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla;
layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, TENSOR_NOT_REQUIRED);
layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
@@ -6529,7 +6527,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
// Kimi: qk_rope_head_dim = 64 (actual RoPE dimension for MLA)
// Note: hparams.n_rot may be 72 (from conversion) but actual is 64
const int64_t qk_rope_head_dim = 64; // From config: qk_rope_head_dim
const int64_t qk_rope_head_dim = hparams.n_rot; // From config: qk_rope_head_dim
layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + qk_rope_head_dim}, 0);
layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_k_mla - qk_rope_head_dim + n_embd_head_v_mla)}, 0);
@@ -6539,7 +6537,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
// MoE intermediate size (different from dense FFN)
const int64_t n_ff_exp = hparams.n_ff_exp > 0 ? hparams.n_ff_exp : 1024;
const int64_t n_ff_exp = hparams.n_ff_exp;
// Kimi uses n_layer_dense_lead to determine which layers use dense FFN vs MoE
// first_k_dense_replace = 1 means layer 0 uses dense FFN, layers 1+ use MoE
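Using the dimensions cited in this commit (n_embd = 2304, kv_lora_rank = 512, qk_rope_head_dim = 64, MLA head sizes 192/128) and assuming n_head = 32 as in the KDA comment, a sketch of the MLA tensor shapes the create_tensor calls above expect:

    n_embd, kv_lora_rank, n_rot = 2304, 512, 64          # n_rot == qk_rope_head_dim
    n_embd_head_k_mla, n_embd_head_v_mla = 192, 128
    n_head = 32                                          # assumed, per the KDA comment (32 * 128 = 4096)

    wkv_a_mqa_shape = (n_embd, kv_lora_rank + n_rot)     # (2304, 576), matching the shape noted below
    wkv_b_shape = (kv_lora_rank, n_head * (n_embd_head_k_mla - n_rot + n_embd_head_v_mla))  # (512, 8192)
    print(wkv_a_mqa_shape, wkv_b_shape)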


@@ -84,7 +84,6 @@ enum llm_type {
LLM_TYPE_35B,
LLM_TYPE_36B,
LLM_TYPE_40B,
LLM_TYPE_48B,
LLM_TYPE_65B,
LLM_TYPE_70B,
LLM_TYPE_120B,
@@ -114,6 +113,7 @@ enum llm_type {
LLM_TYPE_16B_A1B,
LLM_TYPE_21B_A3B, // Ernie MoE small
LLM_TYPE_30B_A3B,
LLM_TYPE_48B_A3B, // Kimi Linear
LLM_TYPE_80B_A3B, // Qwen3 Next
LLM_TYPE_100B_A6B,
LLM_TYPE_106B_A12B, // GLM-4.5-Air


@@ -21,8 +21,8 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
// Kimi dimension constants
const int64_t n_head = hparams.n_head();
const int64_t head_dim = hparams.kda_head_dim > 0 ? hparams.kda_head_dim : 128;
const int64_t d_conv = hparams.kda_d_conv > 0 ? hparams.kda_d_conv : 4;
const int64_t head_dim = hparams.kda_head_dim;
const int64_t d_conv = hparams.ssm_d_conv;
const int64_t d_inner = n_head * head_dim; // 32 * 128 = 4096
const int64_t n_seqs = ubatch.n_seqs;
const int64_t n_seq_tokens = ubatch.n_seq_tokens;
@@ -33,12 +33,12 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
// MLA params
const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla > 0 ? hparams.n_embd_head_k_mla : 192;
const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla > 0 ? hparams.n_embd_head_v_mla : 128;
const int64_t kv_lora_rank = hparams.n_lora_kv > 0 ? hparams.n_lora_kv : 512;
// qk_rope_head_dim = 64 (from Kimi config), NOT hparams.n_rot (which is 72)
const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla;
const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla;
const int64_t kv_lora_rank = hparams.n_lora_kv;
// qk_rope_head_dim = 64 (from Kimi config) which is hparams.n_rot
// Confirmed from tensor shape: wkv_a_mqa [2304, 576] = [n_embd, kv_lora_rank + qk_rope_head_dim]
const int64_t n_embd_head_qk_rope = 64; // config.qk_rope_head_dim
const int64_t n_embd_head_qk_rope = hparams.n_rot; // config.qk_rope_head_dim
const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope; // 192 - 64 = 128
// Attention scale for KDA (1/sqrt(head_dim))
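With head_dim now read from kda_head_dim and the rope dimension from n_rot, the derived quantities in this builder work out as follows for the Kimi Linear values quoted above (a sketch, assuming head_dim = 128, n_head = 32, n_embd_head_k_mla = 192, rope dim = 64):

    import math

    n_head, head_dim = 32, 128
    n_embd_head_k_mla, n_embd_head_qk_rope = 192, 64

    d_inner = n_head * head_dim                                    # 32 * 128 = 4096
    n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope  # 192 - 64 = 128
    kq_scale = 1.0 / math.sqrt(head_dim)                           # KDA attention scale, 1/sqrt(head_dim)
    print(d_inner, n_embd_head_qk_nope, round(kq_scale, 6))        # 4096 128 0.088388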