removed some hard coded code
parent 772ca88070
commit 9f1265fec1
@@ -4990,7 +4990,9 @@ class KimiLinearModel(TextModel):
    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])

        self.gguf_writer.add_leading_dense_block_count(self.hparams["first_k_dense_replace"])
        self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)

        # Use find_hparam for context length
        # Kimi uses model_max_length
        n_ctx = self.find_hparam(["max_position_embeddings", "model_max_length", "n_ctx", "n_positions"], optional=True)
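The context-length lookup above relies on the converter's find_hparam fallback chain. A minimal sketch of that fallback, with the key order taken from the call above (the helper name find_context_length and the sample value are illustrative only):

def find_context_length(hparams: dict):
    # Try the same keys, in the same order, as the find_hparam call above.
    for key in ("max_position_embeddings", "model_max_length", "n_ctx", "n_positions"):
        if key in hparams:
            return hparams[key]
    return None  # optional=True: a missing context length is tolerated

print(find_context_length({"model_max_length": 131072}))  # 131072 (hypothetical value)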
@@ -5004,6 +5006,18 @@ class KimiLinearModel(TextModel):
        # KDA & MLA params
        # Get ssm_d_conv from linear_attn_config.short_conv_kernel_size or ssm_d_conv
        linear_attn_config = self.hparams.get("linear_attn_config", {})
        # n_head == 0 for KDA layers, n_head > 0 for MLA layers
        # full_attention_layers list will be used to distinguish layer type
        _num_kv_heads = list()
        _full_attn_layers = linear_attn_config["full_attn_layers"]
        for il in range(self.hparams["num_hidden_layers"]):
            if il+1 in _full_attn_layers:
                _num_kv_heads.append(linear_attn_config["num_heads"])
            else:
                _num_kv_heads.append(0)
        assert(len(_num_kv_heads) == self.hparams["num_hidden_layers"])
        self.gguf_writer.add_head_count_kv(_num_kv_heads)

        ssm_d_conv = self.hparams.get("ssm_d_conv") or linear_attn_config.get("short_conv_kernel_size")
        if ssm_d_conv is not None:
            self.gguf_writer.add_ssm_conv_kernel(ssm_d_conv)
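As a standalone illustration of the head-count mapping above (hypothetical full_attn_layers and num_heads values; the 1-based indexing implied by the il+1 test is the assumption being shown), the per-layer KV head list ends up with 0 for KDA layers and num_heads for MLA layers:

linear_attn_config = {"full_attn_layers": [4, 8, 12], "num_heads": 16}  # hypothetical
num_hidden_layers = 12

num_kv_heads = []
for il in range(num_hidden_layers):
    if il + 1 in linear_attn_config["full_attn_layers"]:
        num_kv_heads.append(linear_attn_config["num_heads"])  # MLA (full attention) layer
    else:
        num_kv_heads.append(0)                                # KDA (recurrent) layer

print(num_kv_heads)  # [0, 0, 0, 16, 0, 0, 0, 16, 0, 0, 0, 16]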
@@ -5046,7 +5060,14 @@ class KimiLinearModel(TextModel):
        head_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
        self.gguf_writer.add_rope_dimension_count(head_dim)

        self.gguf_writer.add_rope_freq_base(self.hparams.get("rope_theta", 10000.0))
        # Copied from Qwen2Moe as this model inherits parts of it
        # YaRN is not enabled by default
        # To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts
        rope_scaling = self.hparams.get("rope_scaling") or {}
        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])

        # MoE params
        n_experts = self.hparams.get("num_local_experts", self.hparams.get("num_experts"))
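A small self-contained sketch of the YaRN detection above, showing which rope_scaling shapes pass the check (the example configs are hypothetical):

def yarn_params(hparams: dict):
    rope_scaling = hparams.get("rope_scaling") or {}
    if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
        return rope_scaling["factor"], rope_scaling["original_max_position_embeddings"]
    return None  # YaRN not enabled; no scaling keys are written

print(yarn_params({"rope_scaling": None}))  # None
print(yarn_params({"rope_scaling": {"type": "yarn", "factor": 4.0,
                                    "original_max_position_embeddings": 32768}}))  # (4.0, 32768)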
@@ -120,6 +120,7 @@ const char * llm_type_name(llm_type type) {
        case LLM_TYPE_16B_A1B: return "16B.A1B";
        case LLM_TYPE_21B_A3B: return "21B.A3B";
        case LLM_TYPE_30B_A3B: return "30B.A3B";
        case LLM_TYPE_48B_A3B: return "48B.A3B";
        case LLM_TYPE_100B_A6B: return "100B.A6B";
        case LLM_TYPE_106B_A12B: return "106B.A12B";
        case LLM_TYPE_230B_A10B: return "230B.A10B";
@@ -2299,13 +2300,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                // qk_rope_head_dim = 64, qk_nope_head_dim = 128, qk_head_dim = 192

                // Mark KDA layers as recurrent using n_head_kv pattern (like Jamba)
                // MLA layers are at: 3, 7, 11, 15, 19, 23, 26 (7 MLA layers total)
                // KDA layers are all others: 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20, 21, 22, 24, 25 (20 KDA layers)
                // Set n_head_kv = 0 for KDA layers (recurrent), n_head_kv = n_head for MLA layers (attention)
                for (uint32_t i = 0; i < hparams.n_layer; ++i) {
                    bool is_mla = (i == 3 || i == 7 || i == 11 || i == 15 || i == 19 || i == 23 || i == 26);
                    hparams.n_head_kv_arr[i] = is_mla ? hparams.n_head() : 0;
                    hparams.recurrent_layer_arr[i] = !is_mla; // KDA layers are recurrent
                    hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0; // KDA layers are recurrent
                }

                // MoE parameters - Kimi uses moe_intermediate_size = 1024
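A Python mirror of the new per-layer logic above, for illustration only: with the hardcoded MLA layer indices gone, a layer is marked recurrent purely because its KV head count from the GGUF is zero (the array below is a hypothetical example):

n_head_kv_arr = [0, 0, 0, 16, 0, 0, 0, 16]  # hypothetical per-layer values read from GGUF
recurrent_layer_arr = [n_kv == 0 for n_kv in n_head_kv_arr]  # KDA layers are recurrent
print(recurrent_layer_arr)  # [True, True, True, False, True, True, True, False]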
@@ -2316,18 +2313,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
                ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);

                // Default values if not in GGUF
                if (hparams.n_ff_exp == 0) hparams.n_ff_exp = 1024; // moe_intermediate_size
                if (hparams.n_ff_shexp == 0) hparams.n_ff_shexp = 9216; // shared_expert_intermediate_size = intermediate_size
                if (hparams.n_expert_shared == 0) hparams.n_expert_shared = 1; // num_shared_experts
                if (hparams.n_layer_dense_lead == 0) hparams.n_layer_dense_lead = 1; // first_k_dense_replace
                if (hparams.expert_weights_scale == 0.0f) hparams.expert_weights_scale = 2.446f; // routed_scaling_factor

                // MoE gating function - Kimi uses sigmoid (moe_router_activation_func: sigmoid)
                if (hparams.expert_gating_func == 0) hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;

                switch (hparams.n_layer) {
                    case 27: type = LLM_TYPE_48B; break; // Kimi-Linear-48B-A3B
                    case 27: type = LLM_TYPE_48B_A3B; break; // Kimi-Linear-48B-A3B
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
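Since the hardcoded fallbacks above are removed, those values now have to arrive through the GGUF metadata written by the converter. A sketch of the mapping, using the config.json names from the comments above (the dict literals are illustrative, not the actual GGUF key names):

config = {  # hypothetical excerpt of Kimi-Linear-48B-A3B config.json
    "moe_intermediate_size": 1024,
    "num_shared_experts": 1,
    "first_k_dense_replace": 1,
    "routed_scaling_factor": 2.446,
    "moe_router_activation_func": "sigmoid",
}
hparams = {
    "n_ff_exp":             config["moe_intermediate_size"],
    "n_expert_shared":      config["num_shared_experts"],
    "n_layer_dense_lead":   config["first_k_dense_replace"],
    "expert_weights_scale": config["routed_scaling_factor"],
    "expert_gating_func":   config["moe_router_activation_func"],
}
print(hparams)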
@@ -7894,6 +7881,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_ARWKV7:
        case LLM_ARCH_WAVTOKENIZER_DEC:
        case LLM_ARCH_NEMOTRON_H:
        case LLM_ARCH_KIMI_LINEAR:
            return LLAMA_ROPE_TYPE_NONE;

        // use what we call a normal RoPE, operating on pairs of consecutive head values
@@ -7912,7 +7900,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_ARCTIC:
        case LLM_ARCH_DEEPSEEK:
        case LLM_ARCH_DEEPSEEK2:
        case LLM_ARCH_KIMI_LINEAR:
        case LLM_ARCH_PLM:
        case LLM_ARCH_CHATGLM:
        case LLM_ARCH_GLM4:
@@ -339,6 +339,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
                    ggml_row_size(kv->type, n_embd_head_qk_nope));
                k_nope = ggml_cont(ctx0, k_nope);
                Vcur = ggml_cont(ctx0, Vcur);
                cb(Vcur, "mla_V", il);

                // Concatenate k_nope + k_pe (broadcast k_pe to all heads)
                // K = [k_nope, k_pe] where k_nope is [qk_nope_head_dim, n_head, n_tokens]
@@ -349,12 +350,11 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
                ggml_tensor * k_pe_repeated = ggml_repeat(ctx0, k_pe, k_pe_target);
                ggml_tensor * Kcur = ggml_concat(ctx0, k_nope, k_pe_repeated, 0);
                cb(Kcur, "mla_K", il);
                cb(Vcur, "mla_V", il);

                // Direct softmax attention (without KV cache)
                // Use build_attn with inp_no_cache for proper mask handling
                cur = build_attn(inp_no_cache, layer.wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale_mla, il);
                cb(cur, "mla_out", il);
                cur = build_attn(inp_no_cache, layer.wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale_mla, il);
                // cb(cur, "mla_out", il);

            } else {
                // Unknown layer type - this should not happen
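A shape-level NumPy sketch of the concatenation above (n_head and n_tokens are made up; the head dims come from the hparams comment earlier in the diff; note that NumPy's axis order here is the reverse of ggml's [qk_nope_head_dim, n_head, n_tokens] layout):

import numpy as np

qk_nope_head_dim, qk_rope_head_dim = 128, 64  # from the comment in load_hparams above
n_head, n_tokens = 4, 3                       # hypothetical
k_nope = np.zeros((n_tokens, n_head, qk_nope_head_dim))
k_pe   = np.zeros((n_tokens, 1,      qk_rope_head_dim))  # shared RoPE part, one copy

k_pe_repeated = np.repeat(k_pe, n_head, axis=1)       # broadcast k_pe to all heads
K = np.concatenate([k_nope, k_pe_repeated], axis=-1)  # K = [k_nope, k_pe] per head
print(K.shape)  # (3, 4, 192) -> qk_head_dim = 128 + 64 = 192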
@@ -375,18 +375,33 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
        cur = build_norm(ffn_inp, layer.ffn_norm, NULL, LLM_NORM_RMS, il);
        cb(cur, "ffn_norm", il);

        // FFN / MoE
        if (layer.ffn_gate_inp) {
        if ((uint32_t) il < hparams.n_layer_dense_lead) {
            // Dense FFN layer
            cur = build_ffn(cur,
                    layer.ffn_up, NULL, NULL,
                    layer.ffn_gate, NULL, NULL,
                    layer.ffn_down, NULL, NULL,
                    NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
            cb(cur, "ffn_out", il);
        } else {
            // MoE layer
            // Kimi uses moe_renormalize=True and routed_scaling_factor (stored as expert_weights_scale) = 2.446
            ggml_tensor * moe_out = build_moe_ffn(cur, layer.ffn_gate_inp, layer.ffn_up_exps, layer.ffn_gate_exps, layer.ffn_down_exps,
                    layer.ffn_exp_probs_b, hparams.n_expert, hparams.n_expert_used,
                    LLM_FFN_SILU, true, true, hparams.expert_weights_scale,
                    (llama_expert_gating_func_type) hparams.expert_gating_func, il);
            ggml_tensor * moe_out = build_moe_ffn(cur,
                    layer.ffn_gate_inp,
                    layer.ffn_up_exps,
                    layer.ffn_gate_exps,
                    layer.ffn_down_exps,
                    layer.ffn_exp_probs_b,
                    hparams.n_expert,
                    hparams.n_expert_used,
                    LLM_FFN_SILU, true,
                    true, hparams.expert_weights_scale,
                    (llama_expert_gating_func_type) hparams.expert_gating_func,
                    il);
            cb(moe_out, "ffn_moe_out", il);

            // Shared expert (if present)
            if (layer.ffn_gate_shexp) {
            // Shared expert
            {
                ggml_tensor * ffn_shexp = build_ffn(cur,
                        layer.ffn_up_shexp, NULL, NULL,
                        layer.ffn_gate_shexp, NULL, NULL,
@@ -396,27 +411,23 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll

                cur = ggml_add(ctx0, moe_out, ffn_shexp);
                cb(cur, "ffn_out", il);
            } else {
                cur = moe_out;
            }
        } else if (layer.ffn_gate) {
            // Dense FFN layer
            cur = build_ffn(cur, layer.ffn_up, NULL, NULL, layer.ffn_gate, NULL, NULL,
                    layer.ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
            cb(cur, "ffn_out", il);
        } else {
            // No FFN - this should not happen in Kimi
            GGML_ABORT("Kimi layer missing FFN tensors");
        }

        // Residual
        cur = ggml_add(ctx0, cur, ffn_inp);

        cur = build_cvec(cur, il);
        cb(cur, "l_out", il);

        inpL = cur;
    }
    cur = inpL;

    // Final Norm
    cur = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1);
    cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);

    cb(cur, "result_norm", -1);
    res->t_embd = cur;

    // Output
    cur = ggml_mul_mat(ctx0, model.output, cur);
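For reference, a small NumPy sketch (not ggml) of how the MoE path wired above combines its pieces: sigmoid router scores, top-k selection, renormalized weights, the routed output scaled by expert_weights_scale (2.446 for Kimi), and the shared expert added on top. Everything here, including the toy experts, is illustrative:

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def moe_layer(x, router_logits, experts, shared_expert, n_used, weights_scale):
    scores = sigmoid(router_logits)              # sigmoid gating
    top = np.argsort(scores)[-n_used:]           # pick the top-k experts
    w = scores[top] / scores[top].sum()          # renormalize the selected weights
    routed = sum(wi * experts[i](x) for wi, i in zip(w, top))
    return weights_scale * routed + shared_expert(x)  # scale routed output, add shared expert

# Tiny hypothetical example: "experts" are just scalar multipliers.
x = np.ones(4)
experts = [lambda v, k=k: k * v for k in (1.0, 2.0, 3.0, 4.0)]
out = moe_layer(x, np.array([0.1, 2.0, -1.0, 1.5]), experts, lambda v: 0.5 * v,
                n_used=2, weights_scale=2.446)
print(out)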