From 9f1265fec16598cc9c24ae31ae38c3ae7aaa3bde Mon Sep 17 00:00:00 2001
From: Yee Man Chan
Date: Fri, 5 Dec 2025 19:51:02 +0800
Subject: [PATCH] remove hard-coded Kimi-Linear hyperparameters

---
 convert_hf_to_gguf.py      | 25 +++++++++++++++--
 src/llama-model.cpp        | 21 +++-----------
 src/models/kimi-linear.cpp | 57 +++++++++++++++++++++++---------------
 3 files changed, 61 insertions(+), 42 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 2808b72d76..9c36c84189 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -4990,7 +4990,9 @@ class KimiLinearModel(TextModel):
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
-
+        self.gguf_writer.add_leading_dense_block_count(self.hparams["first_k_dense_replace"])
+        self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
+
         # Use find_hparam for context length
         # Kimi uses model_max_length
         n_ctx = self.find_hparam(["max_position_embeddings", "model_max_length", "n_ctx", "n_positions"], optional=True)
@@ -5004,6 +5006,18 @@ class KimiLinearModel(TextModel):
         # KDA & MLA params
         # Get ssm_d_conv from linear_attn_config.short_conv_kernel_size or ssm_d_conv
         linear_attn_config = self.hparams.get("linear_attn_config", {})
+        # n_head_kv == 0 for KDA layers, n_head_kv > 0 for MLA layers
+        # the full_attn_layers list is used to distinguish the two layer types
+        _num_kv_heads = list()
+        _full_attn_layers = linear_attn_config["full_attn_layers"]
+        for il in range(self.hparams["num_hidden_layers"]):
+            if il + 1 in _full_attn_layers:
+                _num_kv_heads.append(linear_attn_config["num_heads"])
+            else:
+                _num_kv_heads.append(0)
+        assert len(_num_kv_heads) == self.hparams["num_hidden_layers"]
+        self.gguf_writer.add_head_count_kv(_num_kv_heads)
+
         ssm_d_conv = self.hparams.get("ssm_d_conv") or linear_attn_config.get("short_conv_kernel_size")
         if ssm_d_conv is not None:
             self.gguf_writer.add_ssm_conv_kernel(ssm_d_conv)
@@ -5046,7 +5060,14 @@ class KimiLinearModel(TextModel):
 
         head_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
         self.gguf_writer.add_rope_dimension_count(head_dim)
-        self.gguf_writer.add_rope_freq_base(self.hparams.get("rope_theta", 10000.0))
+        # Copied from Qwen2Moe as this model inherits parts of it
+        # YaRN is not enabled by default
+        # To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts
+        rope_scaling = self.hparams.get("rope_scaling") or {}
+        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
 
         # MoE params
         n_experts = self.hparams.get("num_local_experts", self.hparams.get("num_experts"))
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 763f0dfecb..0f162cdd7a 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -120,6 +120,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_16B_A1B: return "16B.A1B";
         case LLM_TYPE_21B_A3B: return "21B.A3B";
         case LLM_TYPE_30B_A3B: return "30B.A3B";
+        case LLM_TYPE_48B_A3B: return "48B.A3B";
         case LLM_TYPE_100B_A6B: return "100B.A6B";
         case LLM_TYPE_106B_A12B: return "106B.A12B";
         case LLM_TYPE_230B_A10B: return "230B.A10B";
@@ -2299,13 +2300,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 // qk_rope_head_dim = 64, qk_nope_head_dim = 128, qk_head_dim = 192
 
                 // Mark KDA layers as recurrent using n_head_kv pattern (like Jamba)
-                // MLA layers are at: 3, 7, 11, 15, 19, 23, 26 (7 MLA layers total)
-                // KDA layers are all others: 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20, 21, 22, 24, 25 (20 KDA layers)
                 // Set n_head_kv = 0 for KDA layers (recurrent), n_head_kv = n_head for MLA layers (attention)
                 for (uint32_t i = 0; i < hparams.n_layer; ++i) {
-                    bool is_mla = (i == 3 || i == 7 || i == 11 || i == 15 || i == 19 || i == 23 || i == 26);
-                    hparams.n_head_kv_arr[i] = is_mla ? hparams.n_head() : 0;
-                    hparams.recurrent_layer_arr[i] = !is_mla; // KDA layers are recurrent
+                    hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0; // KDA layers are recurrent
                 }
 
                 // MoE parameters - Kimi uses moe_intermediate_size = 1024
@@ -2316,18 +2313,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
                 ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
 
-                // Default values if not in GGUF
-                if (hparams.n_ff_exp == 0) hparams.n_ff_exp = 1024; // moe_intermediate_size
-                if (hparams.n_ff_shexp == 0) hparams.n_ff_shexp = 9216; // shared_expert_intermediate_size = intermediate_size
-                if (hparams.n_expert_shared == 0) hparams.n_expert_shared = 1; // num_shared_experts
-                if (hparams.n_layer_dense_lead == 0) hparams.n_layer_dense_lead = 1; // first_k_dense_replace
-                if (hparams.expert_weights_scale == 0.0f) hparams.expert_weights_scale = 2.446f; // routed_scaling_factor
-
-                // MoE gating function - Kimi uses sigmoid (moe_router_activation_func: sigmoid)
-                if (hparams.expert_gating_func == 0) hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
-
                 switch (hparams.n_layer) {
-                    case 27: type = LLM_TYPE_48B; break; // Kimi-Linear-48B-A3B
+                    case 27: type = LLM_TYPE_48B_A3B; break; // Kimi-Linear-48B-A3B
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
@@ -7894,6 +7881,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_ARWKV7:
         case LLM_ARCH_WAVTOKENIZER_DEC:
         case LLM_ARCH_NEMOTRON_H:
+        case LLM_ARCH_KIMI_LINEAR:
            return LLAMA_ROPE_TYPE_NONE;
 
        // use what we call a normal RoPE, operating on pairs of consecutive head values
@@ -7912,7 +7900,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_ARCTIC:
         case LLM_ARCH_DEEPSEEK:
         case LLM_ARCH_DEEPSEEK2:
-        case LLM_ARCH_KIMI_LINEAR:
         case LLM_ARCH_PLM:
         case LLM_ARCH_CHATGLM:
         case LLM_ARCH_GLM4:
diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp
index 660cd06f0e..40fbe469b3 100644
--- a/src/models/kimi-linear.cpp
+++ b/src/models/kimi-linear.cpp
@@ -339,6 +339,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
                 ggml_row_size(kv->type, n_embd_head_qk_nope));
             k_nope = ggml_cont(ctx0, k_nope);
             Vcur = ggml_cont(ctx0, Vcur);
+            cb(Vcur, "mla_V", il);
 
             // Concatenate k_nope + k_pe (broadcast k_pe to all heads)
             // K = [k_nope, k_pe] where k_nope is [qk_nope_head_dim, n_head, n_tokens]
@@ -349,12 +350,11 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
             ggml_tensor * k_pe_repeated = ggml_repeat(ctx0, k_pe, k_pe_target);
             ggml_tensor * Kcur = ggml_concat(ctx0, k_nope, k_pe_repeated, 0);
             cb(Kcur, "mla_K", il);
-            cb(Vcur, "mla_V", il);
 
             // Direct softmax attention (without KV cache)
             // Use build_attn with inp_no_cache for proper mask handling
-            cur = build_attn(inp_no_cache, layer.wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale_mla, il);
-            cb(cur, "mla_out", il);
cb(cur, "mla_out", il); + cur = build_attn(inp_no_cache, layer.wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale_mla, il); +// cb(cur, "mla_out", il); } else { // Unknown layer type - this should not happen @@ -375,18 +375,33 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll cur = build_norm(ffn_inp, layer.ffn_norm, NULL, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - // FFN / MoE - if (layer.ffn_gate_inp) { + if ((uint32_t) il < hparams.n_layer_dense_lead) { + // Dense FFN layer + cur = build_ffn(cur, + layer.ffn_up, NULL, NULL, + layer.ffn_gate, NULL, NULL, + layer.ffn_down, NULL, NULL, + NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } else { // MoE layer // Kimi uses moe_renormalize=True and routed_scaling_factor (stored as expert_weights_scale) = 2.446 - ggml_tensor * moe_out = build_moe_ffn(cur, layer.ffn_gate_inp, layer.ffn_up_exps, layer.ffn_gate_exps, layer.ffn_down_exps, - layer.ffn_exp_probs_b, hparams.n_expert, hparams.n_expert_used, - LLM_FFN_SILU, true, true, hparams.expert_weights_scale, - (llama_expert_gating_func_type) hparams.expert_gating_func, il); + ggml_tensor * moe_out = build_moe_ffn(cur, + layer.ffn_gate_inp, + layer.ffn_up_exps, + layer.ffn_gate_exps, + layer.ffn_down_exps, + layer.ffn_exp_probs_b, + hparams.n_expert, + hparams.n_expert_used, + LLM_FFN_SILU, true, + true, hparams.expert_weights_scale, + (llama_expert_gating_func_type) hparams.expert_gating_func, + il); cb(moe_out, "ffn_moe_out", il); - // Shared expert (if present) - if (layer.ffn_gate_shexp) { + // Shared expert + { ggml_tensor * ffn_shexp = build_ffn(cur, layer.ffn_up_shexp, NULL, NULL, layer.ffn_gate_shexp, NULL, NULL, @@ -396,27 +411,23 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll cur = ggml_add(ctx0, moe_out, ffn_shexp); cb(cur, "ffn_out", il); - } else { - cur = moe_out; } - } else if (layer.ffn_gate) { - // Dense FFN layer - cur = build_ffn(cur, layer.ffn_up, NULL, NULL, layer.ffn_gate, NULL, NULL, - layer.ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); - cb(cur, "ffn_out", il); - } else { - // No FFN - this should not happen in Kimi - GGML_ABORT("Kimi layer missing FFN tensors"); } - // Residual cur = ggml_add(ctx0, cur, ffn_inp); + + cur = build_cvec(cur, il); + cb(cur, "l_out", il); + inpL = cur; } + cur = inpL; // Final Norm - cur = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1); + cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res->t_embd = cur; // Output cur = ggml_mul_mat(ctx0, model.output, cur);