removed some hard-coded values

Yee Man Chan 2025-12-05 19:51:02 +08:00
parent 772ca88070
commit 9f1265fec1
3 changed files with 61 additions and 42 deletions

View File

@@ -4990,7 +4990,9 @@ class KimiLinearModel(TextModel):
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
self.gguf_writer.add_leading_dense_block_count(self.hparams["first_k_dense_replace"])
self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
# Use find_hparam for context length
# Kimi uses model_max_length
n_ctx = self.find_hparam(["max_position_embeddings", "model_max_length", "n_ctx", "n_positions"], optional=True)
@@ -5004,6 +5006,18 @@ class KimiLinearModel(TextModel):
# KDA & MLA params
# Get ssm_d_conv from linear_attn_config.short_conv_kernel_size or ssm_d_conv
linear_attn_config = self.hparams.get("linear_attn_config", {})
# n_head_kv == 0 for KDA layers, n_head_kv > 0 for MLA layers
# the full_attn_layers list is used to distinguish layer types
_num_kv_heads = list()
_full_attn_layers = linear_attn_config["full_attn_layers"]
for il in range(self.hparams["num_hidden_layers"]):
if il+1 in _full_attn_layers:
_num_kv_heads.append(linear_attn_config["num_heads"])
else:
_num_kv_heads.append(0)
assert(len(_num_kv_heads) == self.hparams["num_hidden_layers"])
self.gguf_writer.add_head_count_kv(_num_kv_heads)
ssm_d_conv = self.hparams.get("ssm_d_conv") or linear_attn_config.get("short_conv_kernel_size")
if ssm_d_conv is not None:
self.gguf_writer.add_ssm_conv_kernel(ssm_d_conv)
@@ -5046,7 +5060,14 @@ class KimiLinearModel(TextModel):
head_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
self.gguf_writer.add_rope_dimension_count(head_dim)
self.gguf_writer.add_rope_freq_base(self.hparams.get("rope_theta", 10000.0))
# Copied from Qwen2Moe as this model inherits parts of it
# YaRN is not enabled by default
# To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts
rope_scaling = self.hparams.get("rope_scaling") or {}
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
# MoE params
n_experts = self.hparams.get("num_local_experts", self.hparams.get("num_experts"))
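For reference, here is a stand-alone sketch of what the layer-classification loop above produces: a per-layer KV-head list in which zero marks a KDA (recurrent) layer and a non-zero count marks an MLA (attention) layer. The concrete config values are assumptions, back-derived from the MLA indices (3, 7, 11, 15, 19, 23, 26) that the C++ side previously hard-coded; num_heads = 32 is likewise only illustrative.

# Stand-alone sketch of the per-layer KV-head list built above (not part of the
# conversion script). The 27-layer shape and the 1-based full_attn_layers values
# are assumptions inferred from the previously hard-coded MLA layer indices.
hparams = {
    "num_hidden_layers": 27,
    "linear_attn_config": {
        "num_heads": 32,  # assumed value, for illustration only
        "full_attn_layers": [4, 8, 12, 16, 20, 24, 27],  # 1-based MLA layers
    },
}

cfg = hparams["linear_attn_config"]
num_kv_heads = [
    cfg["num_heads"] if (il + 1) in cfg["full_attn_layers"] else 0
    for il in range(hparams["num_hidden_layers"])
]

mla_layers = [il for il, n in enumerate(num_kv_heads) if n > 0]   # attention layers
kda_layers = [il for il, n in enumerate(num_kv_heads) if n == 0]  # recurrent layers
print(mla_layers)  # [3, 7, 11, 15, 19, 23, 26] -> the indices the C++ code used to hard-code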

View File

@@ -120,6 +120,7 @@ const char * llm_type_name(llm_type type) {
case LLM_TYPE_16B_A1B: return "16B.A1B";
case LLM_TYPE_21B_A3B: return "21B.A3B";
case LLM_TYPE_30B_A3B: return "30B.A3B";
case LLM_TYPE_48B_A3B: return "48B.A3B";
case LLM_TYPE_100B_A6B: return "100B.A6B";
case LLM_TYPE_106B_A12B: return "106B.A12B";
case LLM_TYPE_230B_A10B: return "230B.A10B";
@@ -2299,13 +2300,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
// qk_rope_head_dim = 64, qk_nope_head_dim = 128, qk_head_dim = 192
// Mark KDA layers as recurrent using n_head_kv pattern (like Jamba)
// MLA layers are at: 3, 7, 11, 15, 19, 23, 26 (7 MLA layers total)
// KDA layers are all others: 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20, 21, 22, 24, 25 (20 KDA layers)
// Set n_head_kv = 0 for KDA layers (recurrent), n_head_kv = n_head for MLA layers (attention)
for (uint32_t i = 0; i < hparams.n_layer; ++i) {
bool is_mla = (i == 3 || i == 7 || i == 11 || i == 15 || i == 19 || i == 23 || i == 26);
hparams.n_head_kv_arr[i] = is_mla ? hparams.n_head() : 0;
hparams.recurrent_layer_arr[i] = !is_mla; // KDA layers are recurrent
hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0; // KDA layers are recurrent
}
// MoE parameters - Kimi uses moe_intermediate_size = 1024
@@ -2316,18 +2313,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
// Default values if not in GGUF
if (hparams.n_ff_exp == 0) hparams.n_ff_exp = 1024; // moe_intermediate_size
if (hparams.n_ff_shexp == 0) hparams.n_ff_shexp = 9216; // shared_expert_intermediate_size = intermediate_size
if (hparams.n_expert_shared == 0) hparams.n_expert_shared = 1; // num_shared_experts
if (hparams.n_layer_dense_lead == 0) hparams.n_layer_dense_lead = 1; // first_k_dense_replace
if (hparams.expert_weights_scale == 0.0f) hparams.expert_weights_scale = 2.446f; // routed_scaling_factor
// MoE gating function - Kimi uses sigmoid (moe_router_activation_func: sigmoid)
if (hparams.expert_gating_func == 0) hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
switch (hparams.n_layer) {
case 27: type = LLM_TYPE_48B; break; // Kimi-Linear-48B-A3B
case 27: type = LLM_TYPE_48B_A3B; break; // Kimi-Linear-48B-A3B
default: type = LLM_TYPE_UNKNOWN;
}
} break;
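With the fallback defaults deleted above, these values now have to arrive via GGUF metadata written at conversion time. The following gguf-py sketch is not taken from this commit; the writer methods exist in gguf-py, but which of them KimiLinearModel calls beyond the two added in this diff is an assumption, and the values simply mirror the deleted C++ defaults. Writer setup and the remaining conversion steps are omitted.

# Sketch (assumption, not from this commit) of the GGUF keys that replace the
# removed C++ fallbacks; values mirror the deleted defaults, arch name and path
# are hypothetical.
import gguf

writer = gguf.GGUFWriter("kimi-linear.gguf", "kimi-linear")
writer.add_expert_feed_forward_length(1024)         # moe_intermediate_size
writer.add_expert_shared_feed_forward_length(9216)  # shared_expert_intermediate_size
writer.add_expert_shared_count(1)                   # num_shared_experts
writer.add_leading_dense_block_count(1)             # first_k_dense_replace
writer.add_expert_weights_scale(2.446)              # routed_scaling_factor
writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)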
@@ -7894,6 +7881,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_ARWKV7:
case LLM_ARCH_WAVTOKENIZER_DEC:
case LLM_ARCH_NEMOTRON_H:
case LLM_ARCH_KIMI_LINEAR:
return LLAMA_ROPE_TYPE_NONE;
// use what we call a normal RoPE, operating on pairs of consecutive head values
@@ -7912,7 +7900,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_ARCTIC:
case LLM_ARCH_DEEPSEEK:
case LLM_ARCH_DEEPSEEK2:
case LLM_ARCH_KIMI_LINEAR:
case LLM_ARCH_PLM:
case LLM_ARCH_CHATGLM:
case LLM_ARCH_GLM4:

View File

@@ -339,6 +339,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
ggml_row_size(kv->type, n_embd_head_qk_nope));
k_nope = ggml_cont(ctx0, k_nope);
Vcur = ggml_cont(ctx0, Vcur);
cb(Vcur, "mla_V", il);
// Concatenate k_nope + k_pe (broadcast k_pe to all heads)
// K = [k_nope, k_pe] where k_nope is [qk_nope_head_dim, n_head, n_tokens]
@@ -349,12 +350,11 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
ggml_tensor * k_pe_repeated = ggml_repeat(ctx0, k_pe, k_pe_target);
ggml_tensor * Kcur = ggml_concat(ctx0, k_nope, k_pe_repeated, 0);
cb(Kcur, "mla_K", il);
cb(Vcur, "mla_V", il);
// Direct softmax attention (without KV cache)
// Use build_attn with inp_no_cache for proper mask handling
cur = build_attn(inp_no_cache, layer.wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale_mla, il);
cb(cur, "mla_out", il);
cur = build_attn(inp_no_cache, layer.wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale_mla, il);
// cb(cur, "mla_out", il);
} else {
// Unknown layer type - this should not happen
@@ -375,18 +375,33 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
cur = build_norm(ffn_inp, layer.ffn_norm, NULL, LLM_NORM_RMS, il);
cb(cur, "ffn_norm", il);
// FFN / MoE
if (layer.ffn_gate_inp) {
if ((uint32_t) il < hparams.n_layer_dense_lead) {
// Dense FFN layer
cur = build_ffn(cur,
layer.ffn_up, NULL, NULL,
layer.ffn_gate, NULL, NULL,
layer.ffn_down, NULL, NULL,
NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
} else {
// MoE layer
// Kimi uses moe_renormalize=True and routed_scaling_factor (stored as expert_weights_scale) = 2.446
ggml_tensor * moe_out = build_moe_ffn(cur, layer.ffn_gate_inp, layer.ffn_up_exps, layer.ffn_gate_exps, layer.ffn_down_exps,
layer.ffn_exp_probs_b, hparams.n_expert, hparams.n_expert_used,
LLM_FFN_SILU, true, true, hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func, il);
ggml_tensor * moe_out = build_moe_ffn(cur,
layer.ffn_gate_inp,
layer.ffn_up_exps,
layer.ffn_gate_exps,
layer.ffn_down_exps,
layer.ffn_exp_probs_b,
hparams.n_expert,
hparams.n_expert_used,
LLM_FFN_SILU, true,
true, hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(moe_out, "ffn_moe_out", il);
// Shared expert (if present)
if (layer.ffn_gate_shexp) {
// Shared expert
{
ggml_tensor * ffn_shexp = build_ffn(cur,
layer.ffn_up_shexp, NULL, NULL,
layer.ffn_gate_shexp, NULL, NULL,
@@ -396,27 +411,23 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
cur = ggml_add(ctx0, moe_out, ffn_shexp);
cb(cur, "ffn_out", il);
} else {
cur = moe_out;
}
} else if (layer.ffn_gate) {
// Dense FFN layer
cur = build_ffn(cur, layer.ffn_up, NULL, NULL, layer.ffn_gate, NULL, NULL,
layer.ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
} else {
// No FFN - this should not happen in Kimi
GGML_ABORT("Kimi layer missing FFN tensors");
}
// Residual
cur = ggml_add(ctx0, cur, ffn_inp);
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
inpL = cur;
}
cur = inpL;
// Final Norm
cur = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1);
cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
cb(cur, "result_norm", -1);
res->t_embd = cur;
// Output
cur = ggml_mul_mat(ctx0, model.output, cur);
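The restructured FFN block above routes each token through sigmoid-gated experts, renormalizes the selected gate values, scales them by expert_weights_scale (2.446 for Kimi), and then adds an unconditional shared-expert branch. Below is a toy sketch of that gating math; the expert counts are assumed, the expert-probs bias (ffn_exp_probs_b) is omitted for brevity, and names and shapes are illustrative rather than the llama.cpp implementation.

# Toy sketch of the routed-expert weighting used above: sigmoid gating,
# top-k selection, renormalization over the selected experts, then scaling
# by expert_weights_scale (the routed_scaling_factor, 2.446 for Kimi).
import numpy as np

def kimi_router_weights(logits: np.ndarray, n_used: int, scale: float = 2.446) -> np.ndarray:
    probs = 1.0 / (1.0 + np.exp(-logits))   # sigmoid gating per expert
    top = np.argsort(probs)[-n_used:]       # indices of the top-k experts
    w = np.zeros_like(probs)
    w[top] = probs[top] / probs[top].sum()  # renormalize over selected experts
    return scale * w                        # apply routed scaling factor

logits = np.random.randn(256)               # assumed n_expert = 256
w = kimi_router_weights(logits, n_used=8)   # assumed n_expert_used = 8
# Conceptual layer output: sum_i w[i] * expert_i(x) + shared_expert(x)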