kimi linear: constants.py and tensor_mapping.py

Yee Man Chan 2025-12-02 10:40:44 +08:00
parent 84f822c5a5
commit 57cca52779
4 changed files with 102 additions and 44 deletions

convert_hf_to_gguf.py

@@ -4988,6 +4988,7 @@ class KimiLinearModel(TextModel):
_experts: list[dict[str, Tensor]] | None = None
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
# Use find_hparam for context length
@@ -5000,14 +5001,6 @@ class KimiLinearModel(TextModel):
logger.warning("No context length found in config, defaulting to 4096")
self.gguf_writer.add_context_length(4096)
self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
self.gguf_writer.add_file_type(self.ftype)
# KDA & MLA params
# Get ssm_d_conv from linear_attn_config.short_conv_kernel_size or ssm_d_conv
linear_attn_config = self.hparams.get("linear_attn_config", {})
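For context, find_hparam and the .get fallbacks above try several candidate keys before giving up, which is how the context length can default to 4096 and ssm_d_conv can come from either config key. A minimal sketch of that pattern, assuming a plain config dict (this is a hypothetical helper, not the actual TextModel.find_hparam implementation):

# Hypothetical helper illustrating the multi-key fallback used above.
def find_hparam(hparams: dict, keys: list[str], default=None):
    for key in keys:
        if key in hparams:
            return hparams[key]
    return default

hparams = {"linear_attn_config": {"short_conv_kernel_size": 4}}
linear_attn_config = hparams.get("linear_attn_config", {})
ssm_d_conv = find_hparam(linear_attn_config, ["short_conv_kernel_size", "ssm_d_conv"])
n_ctx = find_hparam(hparams, ["max_position_embeddings", "model_max_length"], default=4096)
print(ssm_d_conv, n_ctx)  # 4 4096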
@@ -5053,17 +5046,6 @@ class KimiLinearModel(TextModel):
head_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
self.gguf_writer.add_rope_dimension_count(head_dim)
self.gguf_writer.add_rope_freq_base(self.hparams.get("rope_theta", 10000.0))
# MoE params
n_experts = self.hparams.get("num_local_experts", self.hparams.get("num_experts"))
if n_experts is not None:
self.gguf_writer.add_expert_count(n_experts)
# Support both num_experts_per_tok and num_experts_per_token
n_experts_used = self.hparams.get("num_experts_per_tok", self.hparams.get("num_experts_per_token"))
if n_experts_used is not None:
self.gguf_writer.add_expert_used_count(n_experts_used)
# moe_intermediate_size (1024 for Kimi)
moe_intermediate_size = self.hparams.get("moe_intermediate_size")
if moe_intermediate_size is not None:
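The nested .get calls above express the same fallback inline: prefer the first key, else try its alias, else leave the field unset. For example (values hypothetical, for illustration only):

# Hypothetical config values; only the key-fallback behaviour matters here.
hparams = {"num_experts": 256, "num_experts_per_token": 8}
n_experts = hparams.get("num_local_experts", hparams.get("num_experts"))
n_experts_used = hparams.get("num_experts_per_tok", hparams.get("num_experts_per_token"))
print(n_experts, n_experts_used)  # 256 8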
@@ -5079,16 +5061,6 @@ class KimiLinearModel(TextModel):
if first_k_dense_replace is not None:
self.gguf_writer.add_leading_dense_block_count(first_k_dense_replace)
# Expert gating function (sigmoid for Kimi)
moe_router_activation_func = self.hparams.get("moe_router_activation_func", "sigmoid")
if moe_router_activation_func == "sigmoid":
self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
elif moe_router_activation_func == "softmax":
self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
else:
logger.warning(f"Unknown moe_router_activation_func: {moe_router_activation_func}, defaulting to sigmoid")
self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
# Routed scaling factor (expert_weights_scale = 2.446 for Kimi)
routed_scaling_factor = self.hparams.get("routed_scaling_factor")
if routed_scaling_factor is not None:
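The if/elif chain above can also be read as a table lookup with a sigmoid default. A sketch of the same logic, assuming the gguf.ExpertGatingFuncType values and the self/logger context used in the hunk:

# Equivalent table-driven form of the gating-function selection above.
GATING_FUNCS = {
    "sigmoid": gguf.ExpertGatingFuncType.SIGMOID,
    "softmax": gguf.ExpertGatingFuncType.SOFTMAX,
}
func_name = self.hparams.get("moe_router_activation_func", "sigmoid")
if func_name not in GATING_FUNCS:
    logger.warning(f"Unknown moe_router_activation_func: {func_name}, defaulting to sigmoid")
self.gguf_writer.add_expert_gating_func(GATING_FUNCS.get(func_name, gguf.ExpertGatingFuncType.SIGMOID))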
@@ -5220,9 +5192,8 @@ class KimiLinearModel(TextModel):
logger.info(f"A_log {name}: numpy {tuple(data_torch.shape)} -> ggml ne={list(reversed(data_torch.shape))}")
# Kimi specific bias
if name.endswith("block_sparse_moe.gate.e_score_correction_bias"):
new_name = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_EXP_PROBS_B, bid)
return [(new_name, data_torch)]
if name.endswith("e_score_correction_bias"):
name = name.replace("e_score_correction_bias", "e_score_correction.bias")
# process the experts separately
if name.find("block_sparse_moe.experts") != -1:
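The branch above is cut off by the hunk, but it follows the usual expert-merging pattern in convert_hf_to_gguf.py: per-expert tensors are cached in self._experts (declared at the top of the class) until all experts for a layer have arrived, then stacked into a single 3D tensor. A condensed sketch of that pattern, not the exact Kimi code; the hparam key and the merged-name template are illustrative, and torch is assumed imported:

if name.find("block_sparse_moe.experts") != -1:
    n_experts = self.hparams["num_experts"]  # exact key may differ per config
    assert bid is not None
    if self._experts is None:
        self._experts = [{} for _ in range(self.block_count)]
    self._experts[bid][name] = data_torch
    if len(self._experts[bid]) < n_experts * 3:  # gate/up/down per expert
        return []  # wait until the layer is complete
    tensors = []
    for w_name in ("down_proj", "gate_proj", "up_proj"):
        datas = [
            self._experts[bid].pop(f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight")
            for xid in range(n_experts)
        ]
        stacked = torch.stack(datas, dim=0)  # [n_expert, rows, cols]
        merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight"  # illustrative
        tensors.append((self.map_tensor_name(merged_name), stacked))
    return tensors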
@@ -5257,18 +5228,6 @@ class KimiLinearModel(TextModel):
logger.info(f"Returning {mapped_name}: shape after = {tuple(data_torch.shape)}")
return [(mapped_name, data_torch)]
def get_vocab_base(self) -> tuple[list[str], list[int], str]:
# This method is not used when set_vocab is overridden
# But adding it for completeness in case it's called elsewhere
logger.warning("get_vocab_base called, but set_vocab is already overridden")
vocab_size = self.hparams.get("vocab_size", 100)
tokens = [f"<token_{i}>" for i in range(vocab_size)]
tokens[0] = "<unk>"
tokens[1] = "<s>"
tokens[2] = "</s>"
toktypes = [gguf.TokenType.NORMAL] * vocab_size
return tokens, toktypes, "gpt-2"
@ModelBase.register("InternLM2ForCausalLM")
class InternLM2Model(TextModel):
model_arch = gguf.MODEL_ARCH.INTERNLM2

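Note on the e_score_correction_bias change above: instead of formatting FFN_EXP_PROBS_B by hand, the converter now rewrites the suffix so the shared TensorNameMap entry added in tensor_mapping.py (third file below) resolves it; as I understand the gguf-py name matching, a trailing .weight/.bias is tried as a suffix during lookup. A small illustration (layer index hypothetical):

name = "model.layers.7.block_sparse_moe.gate.e_score_correction_bias"
name = name.replace("e_score_correction_bias", "e_score_correction.bias")
# tensor_mapping.py maps "model.layers.{bid}.block_sparse_moe.gate.e_score_correction"
# to FFN_EXP_PROBS_B, with the ".bias" suffix carried through.
print(name)  # model.layers.7.block_sparse_moe.gate.e_score_correction.bias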
gguf-py/gguf/constants.py

@@ -446,6 +446,7 @@ class MODEL_ARCH(IntEnum):
RND1 = auto()
PANGU_EMBED = auto()
MISTRAL3 = auto()
KIMI_LINEAR = auto() # Kimi-Linear (hybrid MLA+KDA)
class VISION_PROJECTOR_TYPE(IntEnum):
@@ -535,6 +536,16 @@ class MODEL_TENSOR(IntEnum):
SSM_NORM = auto()
SSM_OUT = auto()
SSM_BETA_ALPHA = auto() # qwen3next
SSM_CONV1D_Q = auto() # Kimi Linear
SSM_CONV1D_K = auto() # Kimi Linear
SSM_CONV1D_V = auto() # Kimi Linear
SSM_F_A = auto() # Kimi Linear
SSM_F_B = auto() # Kimi Linear
SSM_BETA = auto() # Kimi Linear
SSM_A_LOG = auto() # Kimi Linear
SSM_G_A = auto() # Kimi Linear
SSM_G_B = auto() # Kimi Linear
SSM_DT_B = auto() # Kimi Linear
TIME_MIX_W0 = auto()
TIME_MIX_W1 = auto()
TIME_MIX_W2 = auto()
@@ -820,6 +831,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.RND1: "rnd1",
MODEL_ARCH.PANGU_EMBED: "pangu-embedded",
MODEL_ARCH.MISTRAL3: "mistral3",
MODEL_ARCH.KIMI_LINEAR: "kimi-linear",
}
VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
@@ -907,6 +919,16 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.SSM_NORM: "blk.{bid}.ssm_norm",
MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out",
MODEL_TENSOR.SSM_BETA_ALPHA: "blk.{bid}.ssm_ba",
MODEL_TENSOR.SSM_CONV1D_Q: "blk.{bid}.ssm_conv1d_q", # Kimi Linear
MODEL_TENSOR.SSM_CONV1D_K: "blk.{bid}.ssm_conv1d_k", # Kimi Linear
MODEL_TENSOR.SSM_CONV1D_V: "blk.{bid}.ssm_conv1d_v", # Kimi Linear
MODEL_TENSOR.SSM_F_A: "blk.{bid}.ssm_f_a", # Kimi Linear
MODEL_TENSOR.SSM_F_B: "blk.{bid}.ssm_f_b", # Kimi Linear
MODEL_TENSOR.SSM_BETA: "blk.{bid}.ssm_beta", # Kimi Linear
MODEL_TENSOR.SSM_A_LOG: "blk.{bid}.ssm_a", # Kimi Linear
MODEL_TENSOR.SSM_G_A: "blk.{bid}.ssm_g_a", # Kimi Linear
MODEL_TENSOR.SSM_G_B: "blk.{bid}.ssm_g_b", # Kimi Linear
MODEL_TENSOR.SSM_DT_B: "blk.{bid}.ssm_dt", # Kimi Linear
MODEL_TENSOR.TIME_MIX_W0: "blk.{bid}.time_mix_w0",
MODEL_TENSOR.TIME_MIX_W1: "blk.{bid}.time_mix_w1",
MODEL_TENSOR.TIME_MIX_W2: "blk.{bid}.time_mix_w2",
@@ -3094,6 +3116,45 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
],
MODEL_ARCH.KIMI_LINEAR: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.ATTN_Q_A,
MODEL_TENSOR.ATTN_Q_B,
MODEL_TENSOR.ATTN_KV_A_MQA,
MODEL_TENSOR.ATTN_KV_B,
MODEL_TENSOR.ATTN_Q_A_NORM,
MODEL_TENSOR.ATTN_KV_A_NORM,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.FFN_GATE_INP,
MODEL_TENSOR.FFN_GATE_EXP,
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
MODEL_TENSOR.SSM_CONV1D_Q,
MODEL_TENSOR.SSM_CONV1D_K,
MODEL_TENSOR.SSM_CONV1D_V,
MODEL_TENSOR.SSM_F_A,
MODEL_TENSOR.SSM_F_B,
MODEL_TENSOR.SSM_BETA,
MODEL_TENSOR.SSM_A_LOG,
MODEL_TENSOR.SSM_G_A,
MODEL_TENSOR.SSM_G_B,
MODEL_TENSOR.SSM_NORM,
MODEL_TENSOR.SSM_DT_B,
MODEL_TENSOR.FFN_EXP_PROBS_B,
MODEL_TENSOR.FFN_GATE_SHEXP,
MODEL_TENSOR.FFN_DOWN_SHEXP,
MODEL_TENSOR.FFN_UP_SHEXP,
],
# TODO
}

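The TENSOR_NAMES entries above are format strings keyed by block index, and MODEL_TENSORS declares which of them the architecture may emit. A quick sketch of how the new entries expand, assuming the gguf-py package is importable:

import gguf

# Expand a Kimi Linear KDA tensor name for block 3 using the new table entries.
name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.SSM_F_A].format(bid=3)
print(name)  # blk.3.ssm_f_a

# The architecture's allow-list now includes the KDA tensors.
assert gguf.MODEL_TENSOR.SSM_F_A in gguf.MODEL_TENSORS[gguf.MODEL_ARCH.KIMI_LINEAR]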
gguf-py/gguf/tensor_mapping.py

@@ -389,6 +389,7 @@ class TensorNameMap:
"model.layers.{bid}.mlp.expert_bias", # afmoe
"model.layers.{bid}.feed_forward.expert_bias", # lfm2moe
"model.layers.{bid}.block_sparse_moe.e_score_correction", # minimax-m2
"model.layers.{bid}.block_sparse_moe.gate.e_score_correction", # kimi
),
# Feed-forward up
@@ -450,6 +451,7 @@ class TensorNameMap:
"model.layers.{bid}.feed_forward.shared_expert.up_proj", # llama4
"model.layers.{bid}.feed_forward.down_proj",
"model.layers.{bid}.mlp.shared_mlp.up_proj", # hunyuan
"model.layers.{bid}.block_sparse_moe.shared_experts.up_proj", # kimi
),
MODEL_TENSOR.FFN_UP_CHEXP: (
@@ -500,6 +502,7 @@ class TensorNameMap:
MODEL_TENSOR.FFN_GATE_CHEXP: (
"model.layers.{bid}.mlp.chunk_experts.gate_proj", # grovemoe
"model.layers.{bid}.block_sparse_moe.shared_experts.gate_proj", # kimi
),
# Feed-forward down
@@ -557,6 +560,7 @@ class TensorNameMap:
"model.layers.{bid}.feed_forward.shared_expert.down_proj", # llama4
"model.layers.{bid}.shared_mlp.output_linear", # granitemoe
"model.layers.{bid}.mlp.shared_mlp.down_proj", # hunyuan
"model.layers.{bid}.block_sparse_moe.shared_experts.down_proj", # kimi
),
MODEL_TENSOR.FFN_DOWN_CHEXP: (
@@ -738,6 +742,7 @@ class TensorNameMap:
"model.layers.{bid}.mamba.norm", # falcon-h1 granite-hybrid
"model.layers.{bid}.linear_attn.norm", # qwen3next
"backbone.layers.{bid}.mixer.norm", # mamba2
"model.layers.{bid}.self_attn.o_norm", # kimi
),
MODEL_TENSOR.SSM_OUT: (
@@ -1569,6 +1574,38 @@ class TensorNameMap:
"audio.multi_modal_projector.ln_mid", # ultravox
),
# Kimi Linear KDA (using SSM_ prefix for consistency)
MODEL_TENSOR.SSM_CONV1D_Q: (
"model.layers.{bid}.self_attn.q_conv1d",
),
MODEL_TENSOR.SSM_CONV1D_K: (
"model.layers.{bid}.self_attn.k_conv1d",
),
MODEL_TENSOR.SSM_CONV1D_V: (
"model.layers.{bid}.self_attn.v_conv1d",
),
MODEL_TENSOR.SSM_F_A: (
"model.layers.{bid}.self_attn.f_a_proj",
),
MODEL_TENSOR.SSM_F_B: (
"model.layers.{bid}.self_attn.f_b_proj",
),
MODEL_TENSOR.SSM_BETA: (
"model.layers.{bid}.self_attn.b_proj",
),
MODEL_TENSOR.SSM_A_LOG: (
"model.layers.{bid}.self_attn.A_log",
),
MODEL_TENSOR.SSM_G_A: (
"model.layers.{bid}.self_attn.g_a_proj",
),
MODEL_TENSOR.SSM_G_B: (
"model.layers.{bid}.self_attn.g_b_proj",
),
MODEL_TENSOR.SSM_DT_B: (
"model.layers.{bid}.self_attn.dt_bias",
),
# NextN/MTP tensors for GLM4_MOE
MODEL_TENSOR.NEXTN_EH_PROJ: (
"model.layers.{bid}.eh_proj",

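Each tuple above lists the HF checkpoint names that resolve to a single GGUF tensor; resolution substitutes the block index and, as I understand it, tries .weight/.bias suffixes. A toy resolver over a few of the KDA entries, not the actual TensorNameMap API:

# Toy reverse lookup; the real TensorNameMap pre-builds a full index per block.
KDA_MAP = {
    "model.layers.{bid}.self_attn.A_log":    "blk.{bid}.ssm_a",
    "model.layers.{bid}.self_attn.dt_bias":  "blk.{bid}.ssm_dt",
    "model.layers.{bid}.self_attn.f_a_proj": "blk.{bid}.ssm_f_a",
}

def resolve(hf_name: str, bid: int) -> str | None:
    for pattern, gguf_name in KDA_MAP.items():
        if hf_name == pattern.format(bid=bid):
            return gguf_name.format(bid=bid)
    return None

print(resolve("model.layers.0.self_attn.A_log", 0))  # blk.0.ssm_a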
src/CMakeLists.txt

@@ -82,6 +82,7 @@ add_library(llama
models/internlm2.cpp
models/jais.cpp
models/jamba.cpp
models/kimi-linear.cpp
models/lfm2.cpp
models/llada-moe.cpp
models/llada.cpp