removed all hard code
This commit is contained in:
parent 9f1265fec1
commit a0269af292
@@ -5021,6 +5021,13 @@ class KimiLinearModel(TextModel):
+        ssm_d_conv = self.hparams.get("ssm_d_conv") or linear_attn_config.get("short_conv_kernel_size")
+        if ssm_d_conv is not None:
+            self.gguf_writer.add_ssm_conv_kernel(ssm_d_conv)
+
+        kda_head_dim = self.hparams.get("kda_head_dim") or linear_attn_config.get("head_dim")
+        if kda_head_dim is not None:
+            self.gguf_writer.add_kda_head_dim(kda_head_dim)

         # MLA params - use add_* methods that handle arch substitution
         # Support both HuggingFace naming (q_lora_rank, kv_lora_rank) and internal naming (n_lora_q, n_lora_kv)
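The new conversion code above prefers a top-level hyperparameter and falls back to the nested linear_attn_config block. A minimal, self-contained sketch of that lookup, using a made-up dict in place of self.hparams (the values are only illustrative):

    # Hypothetical stand-in for self.hparams from a Kimi Linear HF config.
    hparams = {"linear_attn_config": {"short_conv_kernel_size": 4, "head_dim": 128}}
    linear_attn_config = hparams.get("linear_attn_config") or {}

    # Same fallback chain as above: top-level key first, then the nested block.
    ssm_d_conv = hparams.get("ssm_d_conv") or linear_attn_config.get("short_conv_kernel_size")
    kda_head_dim = hparams.get("kda_head_dim") or linear_attn_config.get("head_dim")
    print(ssm_d_conv, kda_head_dim)  # 4 128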
@@ -5035,8 +5042,9 @@ class KimiLinearModel(TextModel):
         # MLA head dimensions
         # Support HuggingFace naming: qk_nope_head_dim, qk_rope_head_dim, v_head_dim
         qk_nope_head_dim = self.hparams.get("qk_nope_head_dim")
-        qk_rope_head_dim = self.hparams.get("qk_rope_head_dim", self.hparams.get("n_rot"))
+        qk_rope_head_dim = self.hparams.get("qk_rope_head_dim")
         v_head_dim = self.hparams.get("v_head_dim")
+        self.gguf_writer.add_rope_dimension_count(self.hparams["qk_rope_head_dim"])

         # Calculate n_embd_head_k_mla = qk_nope_head_dim + qk_rope_head_dim
         if "n_embd_head_k_mla" in self.hparams:
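The n_embd_head_k_mla calculation mentioned in the comment works out as follows with the Kimi dimensions quoted elsewhere in this diff (qk_nope_head_dim = 128, qk_rope_head_dim = 64, v_head_dim = 128); a quick sketch:

    # Values taken from the comments in this diff, for illustration.
    qk_nope_head_dim = 128
    qk_rope_head_dim = 64
    v_head_dim = 128

    n_embd_head_k_mla = qk_nope_head_dim + qk_rope_head_dim  # 192
    n_embd_head_v_mla = v_head_dim                           # 128
    print(n_embd_head_k_mla, n_embd_head_v_mla)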
@@ -205,6 +205,9 @@ class Keys:
         GROUP_COUNT = "{arch}.ssm.group_count"
         DT_B_C_RMS = "{arch}.ssm.dt_b_c_rms"

+    class KDA:
+        HEAD_DIM = "{arch}.kda.head_dim"
+
     class WKV:
         HEAD_SIZE = "{arch}.wkv.head_size"
@@ -3475,6 +3478,9 @@ KEY_SSM_TIME_STEP_RANK = Keys.SSM.TIME_STEP_RANK
 KEY_SSM_GROUP_COUNT = Keys.SSM.GROUP_COUNT
 KEY_SSM_DT_B_C_RMS = Keys.SSM.DT_B_C_RMS

+# KDA
+KEY_KDA_HEAD_DIM = Keys.KDA.HEAD_DIM
+
 # tokenization
 KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL
 KEY_TOKENIZER_PRE = Keys.Tokenizer.PRE
@@ -970,6 +970,9 @@ class GGUFWriter:
     def add_ssm_dt_b_c_rms(self, value: bool) -> None:
         self.add_bool(Keys.SSM.DT_B_C_RMS.format(arch=self.arch), value)

+    def add_kda_head_dim(self, value: int) -> None:
+        self.add_uint32(Keys.KDA.HEAD_DIM.format(arch=self.arch), value)
+
     def add_tokenizer_model(self, model: str) -> None:
         self.add_string(Keys.Tokenizer.MODEL, model)
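add_kda_head_dim follows the existing writer pattern: the key template is expanded with the architecture string before the value is written. A small sketch of just that key expansion (the arch string below is a placeholder, not taken from this diff):

    # Key template matching the Keys.KDA.HEAD_DIM constant added above.
    KDA_HEAD_DIM_KEY = "{arch}.kda.head_dim"

    # Placeholder arch string, for illustration only.
    arch = "kimi-linear"
    print(KDA_HEAD_DIM_KEY.format(arch=arch))  # kimi-linear.kda.head_dim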
@@ -236,6 +236,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_SSM_GROUP_COUNT, "%s.ssm.group_count" },
     { LLM_KV_SSM_DT_B_C_RMS, "%s.ssm.dt_b_c_rms" },

+    { LLM_KV_KDA_HEAD_DIM, "%s.kda.head_dim" },
+
     { LLM_KV_WKV_HEAD_SIZE, "%s.wkv.head_size" },

     { LLM_KV_POSNET_EMBEDDING_LENGTH, "%s.posnet.embedding_length" },
@@ -240,6 +240,8 @@ enum llm_kv {
     LLM_KV_SSM_GROUP_COUNT,
     LLM_KV_SSM_DT_B_C_RMS,

+    LLM_KV_KDA_HEAD_DIM,
+
     LLM_KV_WKV_HEAD_SIZE,

     LLM_KV_TOKENIZER_MODEL,
@@ -137,7 +137,7 @@ uint32_t llama_hparams::n_embd_r() const {
         // for Kimi KDA layers
         // Conv state for Q, K, V: 3 * (d_conv - 1) * n_head * head_dim
         const uint32_t d_inner = n_head() * kda_head_dim; // 32 * 128 = 4096
-        return 3 * (kda_d_conv > 0 ? kda_d_conv - 1 : 3) * d_inner;
+        return 3 * (ssm_d_conv > 0 ? ssm_d_conv - 1 : 3) * d_inner;
     }

     // TODO: maybe support other convolution strides than 1
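With the values quoted in the comments (n_head = 32, head_dim = 128, d_conv = 4), the conv state size returned here works out as below; a quick check:

    # Values from the comments in this hunk, for illustration.
    n_head, head_dim, d_conv = 32, 128, 4
    d_inner = n_head * head_dim              # 4096
    conv_state = 3 * (d_conv - 1) * d_inner  # Q, K and V each keep d_conv - 1 columns
    print(conv_state)                        # 36864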
@@ -133,9 +133,8 @@ struct llama_hparams {
     uint32_t ssm_dt_rank = 0;
     uint32_t ssm_n_group = 0;

-    // for Kimi Delta Attention (KDA)
-    uint32_t kda_head_dim = 0; // head_dim for KDA layers (128 for Kimi)
-    uint32_t kda_d_conv = 0; // conv kernel size for KDA (4 for Kimi)
+    // for Kimi Linear KDA
+    uint32_t kda_head_dim = 0;

     // for hybrid state space models
     std::array<bool, LLAMA_MAX_LAYERS> recurrent_layer_arr;
@@ -2291,10 +2291,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false);
                 ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv, false);
                 ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);

                 // KDA (Delta Attention) parameters
-                hparams.kda_head_dim = 128; // linear_attn_config.head_dim
-                hparams.kda_d_conv = 4; // linear_attn_config.short_conv_kernel_size
+                ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv, false);
+                ml.get_key(LLM_KV_KDA_HEAD_DIM, hparams.kda_head_dim, false);

                 // MLA qk_rope_head_dim (for reference)
                 // qk_rope_head_dim = 64, qk_nope_head_dim = 128, qk_head_dim = 192
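Since the KDA parameters are now read from GGUF metadata rather than hardcoded, a converted file should carry the new keys. A rough way to list them with gguf-py, assuming it is installed; the file name is a placeholder:

    from gguf import GGUFReader

    reader = GGUFReader("kimi-linear.gguf")  # placeholder path to a converted model
    for name in reader.fields:
        if ".kda.head_dim" in name or ".ssm.conv_kernel" in name:
            print(name)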
@@ -6447,9 +6445,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     // Assuming KDA layer if KDA tensors are present

                     // KDA uses head_dim = 128 (from linear_attn_config.head_dim)
-                    const int64_t n_embd_head_k_kda = 128;
-                    const int64_t n_embd_head_v_kda = 128;
-                    const int64_t ssm_d_conv = hparams.ssm_d_conv > 0 ? hparams.ssm_d_conv : 4;
+                    const int64_t n_embd_head_k_kda = hparams.kda_head_dim;
+                    const int64_t n_embd_head_v_kda = hparams.kda_head_dim;
+                    const int64_t ssm_d_conv = hparams.ssm_d_conv;

                     // Try loading KDA specific tensors (using SSM_ prefix)
                     // Conv1d weights: try 4D first, then 3D (quantization may remove trailing 1)
@@ -6513,8 +6511,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     // MLA Layer - use MLA-specific head dimensions
                     const int64_t q_lora_rank = hparams.n_lora_q;
                     const int64_t kv_lora_rank = hparams.n_lora_kv;
-                    const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla > 0 ? hparams.n_embd_head_k_mla : 192;
-                    const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla > 0 ? hparams.n_embd_head_v_mla : 128;
+                    const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla;
+                    const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla;

                     layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, TENSOR_NOT_REQUIRED);
                     layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
@@ -6529,7 +6527,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     // Kimi: qk_rope_head_dim = 64 (actual RoPE dimension for MLA)
                     // Note: hparams.n_rot may be 72 (from conversion) but actual is 64
-                    const int64_t qk_rope_head_dim = 64; // From config: qk_rope_head_dim
+                    const int64_t qk_rope_head_dim = hparams.n_rot; // From config: qk_rope_head_dim
                     layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + qk_rope_head_dim}, 0);
                     layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_k_mla - qk_rope_head_dim + n_embd_head_v_mla)}, 0);
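The tensor shapes in this hunk can be sanity-checked from the dimensions quoted in the diff's comments (n_embd = 2304, n_head = 32, kv_lora_rank = 512, qk_rope_head_dim = 64, n_embd_head_k_mla = 192, n_embd_head_v_mla = 128); a short arithmetic sketch:

    # Dimensions quoted in the surrounding comments, for illustration.
    n_embd, n_head = 2304, 32
    kv_lora_rank, qk_rope = 512, 64
    k_mla, v_mla = 192, 128

    wkv_a_mqa_shape = (n_embd, kv_lora_rank + qk_rope)                # (2304, 576)
    wkv_b_shape = (kv_lora_rank, n_head * (k_mla - qk_rope + v_mla))  # (512, 8192)
    print(wkv_a_mqa_shape, wkv_b_shape)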
@@ -6539,7 +6537,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

                     // MoE intermediate size (different from dense FFN)
-                    const int64_t n_ff_exp = hparams.n_ff_exp > 0 ? hparams.n_ff_exp : 1024;
+                    const int64_t n_ff_exp = hparams.n_ff_exp;

                     // Kimi uses n_layer_dense_lead to determine which layers use dense FFN vs MoE
                     // first_k_dense_replace = 1 means layer 0 uses dense FFN, layers 1+ use MoE
@@ -84,7 +84,6 @@ enum llm_type {
     LLM_TYPE_35B,
     LLM_TYPE_36B,
     LLM_TYPE_40B,
-    LLM_TYPE_48B,
     LLM_TYPE_65B,
     LLM_TYPE_70B,
     LLM_TYPE_120B,
@@ -114,6 +113,7 @@ enum llm_type {
     LLM_TYPE_16B_A1B,
     LLM_TYPE_21B_A3B, // Ernie MoE small
     LLM_TYPE_30B_A3B,
+    LLM_TYPE_48B_A3B, // Kimi Linear
     LLM_TYPE_80B_A3B, // Qwen3 Next
     LLM_TYPE_100B_A6B,
     LLM_TYPE_106B_A12B, // GLM-4.5-Air
@@ -21,8 +21,8 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll

     // Kimi dimension constants
     const int64_t n_head = hparams.n_head();
-    const int64_t head_dim = hparams.kda_head_dim > 0 ? hparams.kda_head_dim : 128;
-    const int64_t d_conv = hparams.kda_d_conv > 0 ? hparams.kda_d_conv : 4;
+    const int64_t head_dim = hparams.kda_head_dim;
+    const int64_t d_conv = hparams.ssm_d_conv;
     const int64_t d_inner = n_head * head_dim; // 32 * 128 = 4096
     const int64_t n_seqs = ubatch.n_seqs;
     const int64_t n_seq_tokens = ubatch.n_seq_tokens;
@@ -33,12 +33,12 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
     GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);

     // MLA params
-    const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla > 0 ? hparams.n_embd_head_k_mla : 192;
-    const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla > 0 ? hparams.n_embd_head_v_mla : 128;
-    const int64_t kv_lora_rank = hparams.n_lora_kv > 0 ? hparams.n_lora_kv : 512;
-    // qk_rope_head_dim = 64 (from Kimi config), NOT hparams.n_rot (which is 72)
+    const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla;
+    const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla;
+    const int64_t kv_lora_rank = hparams.n_lora_kv;
+    // qk_rope_head_dim = 64 (from Kimi config) which is hparams.n_rot
     // Confirmed from tensor shape: wkv_a_mqa [2304, 576] = [n_embd, kv_lora_rank + qk_rope_head_dim]
-    const int64_t n_embd_head_qk_rope = 64; // config.qk_rope_head_dim
+    const int64_t n_embd_head_qk_rope = hparams.n_rot; // config.qk_rope_head_dim
     const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope; // 192 - 64 = 128

     // Attention scale for KDA (1/sqrt(head_dim))
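As a final check of the numbers the comments rely on: with n_embd_head_k_mla = 192 and n_embd_head_qk_rope = 64 the nope part is 128, and the KDA attention scale mentioned in the last comment is 1/sqrt(head_dim). A short sketch (variable names here are illustrative):

    import math

    n_embd_head_k_mla, n_embd_head_qk_rope = 192, 64
    n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope  # 128

    head_dim = 128                        # hparams.kda_head_dim for Kimi
    kq_scale = 1.0 / math.sqrt(head_dim)  # ~0.0884
    print(n_embd_head_qk_nope, kq_scale)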