From 57cca52779d97cf21a5e8fdbb540467353280e0f Mon Sep 17 00:00:00 2001
From: Yee Man Chan
Date: Tue, 2 Dec 2025 10:40:44 +0800
Subject: [PATCH] kimi linear constants.py tensor_mapping.py

---
 convert_hf_to_gguf.py          | 47 ++------------------------
 gguf-py/gguf/constants.py      | 61 ++++++++++++++++++++++++++++++++++
 gguf-py/gguf/tensor_mapping.py | 37 +++++++++++++++++++++
 src/CMakeLists.txt             |  1 +
 4 files changed, 102 insertions(+), 44 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 11dd9f610a..ba21124d6f 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -4988,6 +4988,7 @@ class KimiLinearModel(TextModel):
     _experts: list[dict[str, Tensor]] | None = None
 
     def set_gguf_parameters(self):
+        super().set_gguf_parameters()
         self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
 
         # Use find_hparam for context length
@@ -5000,14 +5001,6 @@ class KimiLinearModel(TextModel):
             logger.warning("No context length found in config, defaulting to 4096")
             self.gguf_writer.add_context_length(4096)
 
-        self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
-        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
-        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
-        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
-        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
-        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
-        self.gguf_writer.add_file_type(self.ftype)
-
         # KDA & MLA params
         # Get ssm_d_conv from linear_attn_config.short_conv_kernel_size or ssm_d_conv
         linear_attn_config = self.hparams.get("linear_attn_config", {})
@@ -5053,17 +5046,6 @@ class KimiLinearModel(TextModel):
 
         head_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
         self.gguf_writer.add_rope_dimension_count(head_dim)
-        self.gguf_writer.add_rope_freq_base(self.hparams.get("rope_theta", 10000.0))
-
-        # MoE params
-        n_experts = self.hparams.get("num_local_experts", self.hparams.get("num_experts"))
-        if n_experts is not None:
-            self.gguf_writer.add_expert_count(n_experts)
-        # Support both num_experts_per_tok and num_experts_per_token
-        n_experts_used = self.hparams.get("num_experts_per_tok", self.hparams.get("num_experts_per_token"))
-        if n_experts_used is not None:
-            self.gguf_writer.add_expert_used_count(n_experts_used)
-
         # moe_intermediate_size (1024 for Kimi)
         moe_intermediate_size = self.hparams.get("moe_intermediate_size")
         if moe_intermediate_size is not None:
@@ -5079,16 +5061,6 @@ class KimiLinearModel(TextModel):
         if first_k_dense_replace is not None:
             self.gguf_writer.add_leading_dense_block_count(first_k_dense_replace)
 
-        # Expert gating function (sigmoid for Kimi)
-        moe_router_activation_func = self.hparams.get("moe_router_activation_func", "sigmoid")
-        if moe_router_activation_func == "sigmoid":
-            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
-        elif moe_router_activation_func == "softmax":
-            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
-        else:
-            logger.warning(f"Unknown moe_router_activation_func: {moe_router_activation_func}, defaulting to sigmoid")
-            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
-
         # Routed scaling factor (expert_weights_scale = 2.446 for Kimi)
         routed_scaling_factor = self.hparams.get("routed_scaling_factor")
         if routed_scaling_factor is not None:
@@ -5220,9 +5192,8 @@ class KimiLinearModel(TextModel):
             logger.info(f"A_log {name}: numpy {tuple(data_torch.shape)} -> ggml ne={list(reversed(data_torch.shape))}")
 
         # Kimi specific bias
-        if name.endswith("block_sparse_moe.gate.e_score_correction_bias"):
-            new_name = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_EXP_PROBS_B, bid)
-            return [(new_name, data_torch)]
+        if name.endswith("e_score_correction_bias"):
+            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
 
         # process the experts separately
         if name.find("block_sparse_moe.experts") != -1:
@@ -5257,18 +5228,6 @@ class KimiLinearModel(TextModel):
         logger.info(f"Returning {mapped_name}: shape after = {tuple(data_torch.shape)}")
         return [(mapped_name, data_torch)]
 
-    def get_vocab_base(self) -> tuple[list[str], list[int], str]:
-        # This method is not used when set_vocab is overridden
-        # But adding it for completeness in case it's called elsewhere
-        logger.warning("get_vocab_base called, but set_vocab is already overridden")
-        vocab_size = self.hparams.get("vocab_size", 100)
-        tokens = [f"" for i in range(vocab_size)]
-        tokens[0] = ""
-        tokens[1] = ""
-        tokens[2] = ""
-        toktypes = [gguf.TokenType.NORMAL] * vocab_size
-        return tokens, toktypes, "gpt-2"
-
 @ModelBase.register("InternLM2ForCausalLM")
 class InternLM2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.INTERNLM2
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 2b8489c591..485c41abfb 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -446,6 +446,7 @@ class MODEL_ARCH(IntEnum):
     RND1             = auto()
     PANGU_EMBED      = auto()
     MISTRAL3         = auto()
+    KIMI_LINEAR      = auto()  # Kimi-Linear (hybrid MLA+KDA)
 
 
 class VISION_PROJECTOR_TYPE(IntEnum):
@@ -535,6 +536,16 @@ class MODEL_TENSOR(IntEnum):
     SSM_NORM         = auto()
     SSM_OUT          = auto()
     SSM_BETA_ALPHA   = auto()  # qwen3next
+    SSM_CONV1D_Q     = auto()  # Kimi Linear
+    SSM_CONV1D_K     = auto()  # Kimi Linear
+    SSM_CONV1D_V     = auto()  # Kimi Linear
+    SSM_F_A          = auto()  # Kimi Linear
+    SSM_F_B          = auto()  # Kimi Linear
+    SSM_BETA         = auto()  # Kimi Linear
+    SSM_A_LOG        = auto()  # Kimi Linear
+    SSM_G_A          = auto()  # Kimi Linear
+    SSM_G_B          = auto()  # Kimi Linear
+    SSM_DT_B         = auto()  # Kimi Linear
     TIME_MIX_W0      = auto()
     TIME_MIX_W1      = auto()
     TIME_MIX_W2      = auto()
@@ -820,6 +831,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.RND1:            "rnd1",
     MODEL_ARCH.PANGU_EMBED:     "pangu-embedded",
     MODEL_ARCH.MISTRAL3:        "mistral3",
+    MODEL_ARCH.KIMI_LINEAR:     "kimi-linear",
 }
 
 VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
@@ -907,6 +919,16 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.SSM_NORM:          "blk.{bid}.ssm_norm",
     MODEL_TENSOR.SSM_OUT:           "blk.{bid}.ssm_out",
     MODEL_TENSOR.SSM_BETA_ALPHA:    "blk.{bid}.ssm_ba",
+    MODEL_TENSOR.SSM_CONV1D_Q:      "blk.{bid}.ssm_conv1d_q",  # Kimi Linear
+    MODEL_TENSOR.SSM_CONV1D_K:      "blk.{bid}.ssm_conv1d_k",  # Kimi Linear
+    MODEL_TENSOR.SSM_CONV1D_V:      "blk.{bid}.ssm_conv1d_v",  # Kimi Linear
+    MODEL_TENSOR.SSM_F_A:           "blk.{bid}.ssm_f_a",       # Kimi Linear
+    MODEL_TENSOR.SSM_F_B:           "blk.{bid}.ssm_f_b",       # Kimi Linear
+    MODEL_TENSOR.SSM_BETA:          "blk.{bid}.ssm_beta",      # Kimi Linear
+    MODEL_TENSOR.SSM_A_LOG:         "blk.{bid}.ssm_a",         # Kimi Linear
+    MODEL_TENSOR.SSM_G_A:           "blk.{bid}.ssm_g_a",       # Kimi Linear
+    MODEL_TENSOR.SSM_G_B:           "blk.{bid}.ssm_g_b",       # Kimi Linear
+    MODEL_TENSOR.SSM_DT_B:          "blk.{bid}.ssm_dt",        # Kimi Linear
     MODEL_TENSOR.TIME_MIX_W0:       "blk.{bid}.time_mix_w0",
     MODEL_TENSOR.TIME_MIX_W1:       "blk.{bid}.time_mix_w1",
     MODEL_TENSOR.TIME_MIX_W2:       "blk.{bid}.time_mix_w2",
@@ -3094,6 +3116,45 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN_EXP,
         MODEL_TENSOR.FFN_UP_EXP,
     ],
+    MODEL_ARCH.KIMI_LINEAR: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_Q_A,
+        MODEL_TENSOR.ATTN_Q_B,
+        MODEL_TENSOR.ATTN_KV_A_MQA,
+        MODEL_TENSOR.ATTN_KV_B,
+        MODEL_TENSOR.ATTN_Q_A_NORM,
+        MODEL_TENSOR.ATTN_KV_A_NORM,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+        MODEL_TENSOR.SSM_CONV1D_Q,
+        MODEL_TENSOR.SSM_CONV1D_K,
+        MODEL_TENSOR.SSM_CONV1D_V,
+        MODEL_TENSOR.SSM_F_A,
+        MODEL_TENSOR.SSM_F_B,
+        MODEL_TENSOR.SSM_BETA,
+        MODEL_TENSOR.SSM_A_LOG,
+        MODEL_TENSOR.SSM_G_A,
+        MODEL_TENSOR.SSM_G_B,
+        MODEL_TENSOR.SSM_NORM,
+        MODEL_TENSOR.SSM_DT_B,
+        MODEL_TENSOR.FFN_EXP_PROBS_B,
+        MODEL_TENSOR.FFN_GATE_SHEXP,
+        MODEL_TENSOR.FFN_DOWN_SHEXP,
+        MODEL_TENSOR.FFN_UP_SHEXP,
+    ],
     # TODO
 }
 
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index a7b0973979..cfe541fc41 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -389,6 +389,7 @@ class TensorNameMap:
            "model.layers.{bid}.mlp.expert_bias",                     # afmoe
            "model.layers.{bid}.feed_forward.expert_bias",            # lfm2moe
            "model.layers.{bid}.block_sparse_moe.e_score_correction", # minimax-m2
+           "model.layers.{bid}.block_sparse_moe.gate.e_score_correction", # kimi
        ),
 
        # Feed-forward up
@@ -450,6 +451,7 @@ class TensorNameMap:
            "model.layers.{bid}.feed_forward.shared_expert.up_proj",  # llama4
            "model.layers.{bid}.feed_forward.down_proj",
            "model.layers.{bid}.mlp.shared_mlp.up_proj",              # hunyuan
+           "model.layers.{bid}.block_sparse_moe.shared_experts.up_proj", # kimi
        ),
 
        MODEL_TENSOR.FFN_UP_CHEXP: (
@@ -500,6 +502,7 @@ class TensorNameMap:
 
        MODEL_TENSOR.FFN_GATE_CHEXP: (
            "model.layers.{bid}.mlp.chunk_experts.gate_proj",         # grovemoe
+           "model.layers.{bid}.block_sparse_moe.shared_experts.gate_proj", # kimi
        ),
 
        # Feed-forward down
@@ -557,6 +560,7 @@ class TensorNameMap:
            "model.layers.{bid}.feed_forward.shared_expert.down_proj", # llama4
            "model.layers.{bid}.shared_mlp.output_linear",             # granitemoe
            "model.layers.{bid}.mlp.shared_mlp.down_proj",             # hunyuan
+           "model.layers.{bid}.block_sparse_moe.shared_experts.down_proj", # kimi
        ),
 
        MODEL_TENSOR.FFN_DOWN_CHEXP: (
@@ -738,6 +742,7 @@ class TensorNameMap:
            "model.layers.{bid}.mamba.norm",        # falcon-h1 granite-hybrid
            "model.layers.{bid}.linear_attn.norm",  # qwen3next
            "backbone.layers.{bid}.mixer.norm",     # mamba2
+           "model.layers.{bid}.self_attn.o_norm",  # kimi
        ),
 
        MODEL_TENSOR.SSM_OUT: (
@@ -1569,6 +1574,38 @@ class TensorNameMap:
            "audio.multi_modal_projector.ln_mid",  # ultravox
        ),
 
+       # Kimi Linear KDA (using SSM_ prefix for consistency)
+       MODEL_TENSOR.SSM_CONV1D_Q: (
+           "model.layers.{bid}.self_attn.q_conv1d",
+       ),
+       MODEL_TENSOR.SSM_CONV1D_K: (
+           "model.layers.{bid}.self_attn.k_conv1d",
+       ),
+       MODEL_TENSOR.SSM_CONV1D_V: (
+           "model.layers.{bid}.self_attn.v_conv1d",
+       ),
+       MODEL_TENSOR.SSM_F_A: (
+           "model.layers.{bid}.self_attn.f_a_proj",
+       ),
+       MODEL_TENSOR.SSM_F_B: (
+           "model.layers.{bid}.self_attn.f_b_proj",
+       ),
+       MODEL_TENSOR.SSM_BETA: (
+           "model.layers.{bid}.self_attn.b_proj",
+       ),
+       MODEL_TENSOR.SSM_A_LOG: (
+           "model.layers.{bid}.self_attn.A_log",
+       ),
+       MODEL_TENSOR.SSM_G_A: (
+           "model.layers.{bid}.self_attn.g_a_proj",
+       ),
+       MODEL_TENSOR.SSM_G_B: (
+           "model.layers.{bid}.self_attn.g_b_proj",
+       ),
+       MODEL_TENSOR.SSM_DT_B: (
+           "model.layers.{bid}.self_attn.dt_bias",
+       ),
+
        # NextN/MTP tensors for GLM4_MOE
        MODEL_TENSOR.NEXTN_EH_PROJ: (
            "model.layers.{bid}.eh_proj",
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index fbd538109b..fbfcf05c70 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -82,6 +82,7 @@ add_library(llama
             models/internlm2.cpp
             models/jais.cpp
             models/jamba.cpp
+            models/kimi-linear.cpp
             models/lfm2.cpp
             models/llada-moe.cpp
             models/llada.cpp
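
The new TensorNameMap entries can be exercised directly from gguf-py once the patch is applied. The snippet below is a minimal sketch and not part of the patch: it assumes the patched gguf-py is importable, and the block count passed to get_tensor_name_map (27) is purely illustrative.

# Minimal sketch (not part of the patch): check that the new Kimi-Linear HF
# tensor names resolve to the expected GGUF names via gguf-py's TensorNameMap.
import gguf

# 27 is an illustrative block count; use the model's real num_hidden_layers.
tmap = gguf.get_tensor_name_map(gguf.MODEL_ARCH.KIMI_LINEAR, 27)

examples = [
    "model.layers.0.self_attn.A_log",                                # expect blk.0.ssm_a
    "model.layers.0.self_attn.q_conv1d.weight",                      # expect blk.0.ssm_conv1d_q.weight
    "model.layers.0.block_sparse_moe.gate.e_score_correction.bias",  # expect blk.0.exp_probs_b.bias
]

for hf_name in examples:
    gguf_name = tmap.get_name(hf_name, try_suffixes=(".weight", ".bias"))
    print(f"{hf_name} -> {gguf_name}")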