kimi linear constants.py tensor_mapping.py

parent 84f822c5a5
commit 57cca52779

convert_hf_to_gguf.py
@@ -4988,6 +4988,7 @@ class KimiLinearModel(TextModel):
    _experts: list[dict[str, Tensor]] | None = None

    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])

        # Use find_hparam for context length
@@ -5000,14 +5001,6 @@ class KimiLinearModel(TextModel):
            logger.warning("No context length found in config, defaulting to 4096")
            self.gguf_writer.add_context_length(4096)

        self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
        self.gguf_writer.add_file_type(self.ftype)

        # KDA & MLA params
        # Get ssm_d_conv from linear_attn_config.short_conv_kernel_size or ssm_d_conv
        linear_attn_config = self.hparams.get("linear_attn_config", {})
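The fallback described in the comment above, preferring the nested linear_attn_config key and only then the top-level one, might look like this minimal sketch (the default of 4 is an assumption, not taken from the commit):

    # Hypothetical sketch: nested key first, then top-level key, then a
    # guessed default of 4 (a common short-conv kernel size).
    ssm_d_conv = linear_attn_config.get(
        "short_conv_kernel_size",
        self.hparams.get("ssm_d_conv", 4),
    )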
@@ -5053,17 +5046,6 @@ class KimiLinearModel(TextModel):
        head_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
        self.gguf_writer.add_rope_dimension_count(head_dim)
        self.gguf_writer.add_rope_freq_base(self.hparams.get("rope_theta", 10000.0))

        # MoE params
        n_experts = self.hparams.get("num_local_experts", self.hparams.get("num_experts"))
        if n_experts is not None:
            self.gguf_writer.add_expert_count(n_experts)
        # Support both num_experts_per_tok and num_experts_per_token
        n_experts_used = self.hparams.get("num_experts_per_tok", self.hparams.get("num_experts_per_token"))
        if n_experts_used is not None:
            self.gguf_writer.add_expert_used_count(n_experts_used)

        # moe_intermediate_size (1024 for Kimi)
        moe_intermediate_size = self.hparams.get("moe_intermediate_size")
        if moe_intermediate_size is not None:
@@ -5079,16 +5061,6 @@ class KimiLinearModel(TextModel):
        if first_k_dense_replace is not None:
            self.gguf_writer.add_leading_dense_block_count(first_k_dense_replace)

        # Expert gating function (sigmoid for Kimi)
        moe_router_activation_func = self.hparams.get("moe_router_activation_func", "sigmoid")
        if moe_router_activation_func == "sigmoid":
            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
        elif moe_router_activation_func == "softmax":
            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
        else:
            logger.warning(f"Unknown moe_router_activation_func: {moe_router_activation_func}, defaulting to sigmoid")
            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)

        # Routed scaling factor (expert_weights_scale = 2.446 for Kimi)
        routed_scaling_factor = self.hparams.get("routed_scaling_factor")
        if routed_scaling_factor is not None:
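For context, a hedged sketch of what sigmoid gating with a score-correction bias and a routed scaling factor typically computes in DeepSeek-style routers (shapes and the top-k of 2 are hypothetical; this is not the kernel llama.cpp runs):

    import torch

    # Hedged sketch of DeepSeek-style sigmoid routing (values hypothetical).
    logits = torch.randn(8)                      # router logits for one token, 8 experts
    bias = torch.zeros(8)                        # e_score_correction_bias (learned)
    scores = torch.sigmoid(logits)               # gating func = sigmoid
    topk = (scores + bias).topk(2).indices       # bias steers selection only
    weights = scores[topk] / scores[topk].sum()  # renormalize selected scores
    weights = weights * 2.446                    # routed_scaling_factor for Kimi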
@@ -5220,9 +5192,8 @@ class KimiLinearModel(TextModel):
            logger.info(f"A_log {name}: numpy {tuple(data_torch.shape)} -> ggml ne={list(reversed(data_torch.shape))}")

        # Kimi-specific bias
        if name.endswith("block_sparse_moe.gate.e_score_correction_bias"):
            new_name = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_EXP_PROBS_B, bid)
            return [(new_name, data_torch)]
        if name.endswith("e_score_correction_bias"):
            name = name.replace("e_score_correction_bias", "e_score_correction.bias")

        # process the experts separately
        if name.find("block_sparse_moe.experts") != -1:
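The hunk above truncates the expert-handling branch; elsewhere in convert_hf_to_gguf.py the usual pattern is to buffer per-expert 2D weights in _experts and emit one stacked 3D tensor per layer and projection. A minimal sketch of the stacking step (the function name is mine):

    import torch

    def stack_experts(expert_weights: list[torch.Tensor]) -> torch.Tensor:
        # GGUF stores MoE experts as one 3D tensor [n_expert, rows, cols];
        # the converter stacks the per-expert 2D matrices along dim 0.
        return torch.stack(expert_weights, dim=0)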
@@ -5257,18 +5228,6 @@ class KimiLinearModel(TextModel):
        logger.info(f"Returning {mapped_name}: shape after = {tuple(data_torch.shape)}")
        return [(mapped_name, data_torch)]

    def get_vocab_base(self) -> tuple[list[str], list[int], str]:
        # This method is not used when set_vocab is overridden,
        # but it is kept for completeness in case it is called elsewhere.
        logger.warning("get_vocab_base called, but set_vocab is already overridden")
        vocab_size = self.hparams.get("vocab_size", 100)
        tokens = [f"<token_{i}>" for i in range(vocab_size)]
        tokens[0] = "<unk>"
        tokens[1] = "<s>"
        tokens[2] = "</s>"
        toktypes = [gguf.TokenType.NORMAL] * vocab_size
        return tokens, toktypes, "gpt-2"


@ModelBase.register("InternLM2ForCausalLM")
class InternLM2Model(TextModel):
    model_arch = gguf.MODEL_ARCH.INTERNLM2

gguf-py/gguf/constants.py
@@ -446,6 +446,7 @@ class MODEL_ARCH(IntEnum):
    RND1 = auto()
    PANGU_EMBED = auto()
    MISTRAL3 = auto()
    KIMI_LINEAR = auto()  # Kimi-Linear (hybrid MLA+KDA)


class VISION_PROJECTOR_TYPE(IntEnum):
@@ -535,6 +536,16 @@ class MODEL_TENSOR(IntEnum):
    SSM_NORM = auto()
    SSM_OUT = auto()
    SSM_BETA_ALPHA = auto()  # qwen3next
    SSM_CONV1D_Q = auto()    # Kimi Linear
    SSM_CONV1D_K = auto()    # Kimi Linear
    SSM_CONV1D_V = auto()    # Kimi Linear
    SSM_F_A = auto()         # Kimi Linear
    SSM_F_B = auto()         # Kimi Linear
    SSM_BETA = auto()        # Kimi Linear
    SSM_A_LOG = auto()       # Kimi Linear
    SSM_G_A = auto()         # Kimi Linear
    SSM_G_B = auto()         # Kimi Linear
    SSM_DT_B = auto()        # Kimi Linear
    TIME_MIX_W0 = auto()
    TIME_MIX_W1 = auto()
    TIME_MIX_W2 = auto()
@@ -820,6 +831,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
    MODEL_ARCH.RND1:        "rnd1",
    MODEL_ARCH.PANGU_EMBED: "pangu-embedded",
    MODEL_ARCH.MISTRAL3:    "mistral3",
    MODEL_ARCH.KIMI_LINEAR: "kimi-linear",
}

VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
@@ -907,6 +919,16 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
    MODEL_TENSOR.SSM_NORM:       "blk.{bid}.ssm_norm",
    MODEL_TENSOR.SSM_OUT:        "blk.{bid}.ssm_out",
    MODEL_TENSOR.SSM_BETA_ALPHA: "blk.{bid}.ssm_ba",
    MODEL_TENSOR.SSM_CONV1D_Q:   "blk.{bid}.ssm_conv1d_q",  # Kimi Linear
    MODEL_TENSOR.SSM_CONV1D_K:   "blk.{bid}.ssm_conv1d_k",  # Kimi Linear
    MODEL_TENSOR.SSM_CONV1D_V:   "blk.{bid}.ssm_conv1d_v",  # Kimi Linear
    MODEL_TENSOR.SSM_F_A:        "blk.{bid}.ssm_f_a",       # Kimi Linear
    MODEL_TENSOR.SSM_F_B:        "blk.{bid}.ssm_f_b",       # Kimi Linear
    MODEL_TENSOR.SSM_BETA:       "blk.{bid}.ssm_beta",      # Kimi Linear
    MODEL_TENSOR.SSM_A_LOG:      "blk.{bid}.ssm_a",         # Kimi Linear
    MODEL_TENSOR.SSM_G_A:        "blk.{bid}.ssm_g_a",       # Kimi Linear
    MODEL_TENSOR.SSM_G_B:        "blk.{bid}.ssm_g_b",       # Kimi Linear
    MODEL_TENSOR.SSM_DT_B:       "blk.{bid}.ssm_dt",        # Kimi Linear
    MODEL_TENSOR.TIME_MIX_W0:    "blk.{bid}.time_mix_w0",
    MODEL_TENSOR.TIME_MIX_W1:    "blk.{bid}.time_mix_w1",
    MODEL_TENSOR.TIME_MIX_W2:    "blk.{bid}.time_mix_w2",
@@ -3094,6 +3116,45 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.FFN_DOWN_EXP,
        MODEL_TENSOR.FFN_UP_EXP,
    ],
    MODEL_ARCH.KIMI_LINEAR: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_Q,
        MODEL_TENSOR.ATTN_K,
        MODEL_TENSOR.ATTN_V,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.ATTN_Q_A,
        MODEL_TENSOR.ATTN_Q_B,
        MODEL_TENSOR.ATTN_KV_A_MQA,
        MODEL_TENSOR.ATTN_KV_B,
        MODEL_TENSOR.ATTN_Q_A_NORM,
        MODEL_TENSOR.ATTN_KV_A_NORM,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_GATE,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
        MODEL_TENSOR.FFN_GATE_INP,
        MODEL_TENSOR.FFN_GATE_EXP,
        MODEL_TENSOR.FFN_DOWN_EXP,
        MODEL_TENSOR.FFN_UP_EXP,
        MODEL_TENSOR.SSM_CONV1D_Q,
        MODEL_TENSOR.SSM_CONV1D_K,
        MODEL_TENSOR.SSM_CONV1D_V,
        MODEL_TENSOR.SSM_F_A,
        MODEL_TENSOR.SSM_F_B,
        MODEL_TENSOR.SSM_BETA,
        MODEL_TENSOR.SSM_A_LOG,
        MODEL_TENSOR.SSM_G_A,
        MODEL_TENSOR.SSM_G_B,
        MODEL_TENSOR.SSM_NORM,
        MODEL_TENSOR.SSM_DT_B,
        MODEL_TENSOR.FFN_EXP_PROBS_B,
        MODEL_TENSOR.FFN_GATE_SHEXP,
        MODEL_TENSOR.FFN_DOWN_SHEXP,
        MODEL_TENSOR.FFN_UP_SHEXP,
    ],
    # TODO
}
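With these additions, the writer-side name for a KDA tensor falls out of TENSOR_NAMES directly; a small sketch, assuming the patched gguf-py is importable:

    import gguf

    # Format the block-5 f_a projection name from the table above.
    name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.SSM_F_A].format(bid=5)
    print(name)  # -> "blk.5.ssm_f_a"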

gguf-py/gguf/tensor_mapping.py
@@ -389,6 +389,7 @@ class TensorNameMap:
            "model.layers.{bid}.mlp.expert_bias",                           # afmoe
            "model.layers.{bid}.feed_forward.expert_bias",                  # lfm2moe
            "model.layers.{bid}.block_sparse_moe.e_score_correction",       # minimax-m2
            "model.layers.{bid}.block_sparse_moe.gate.e_score_correction",  # kimi
        ),

        # Feed-forward up
@@ -450,6 +451,7 @@ class TensorNameMap:
            "model.layers.{bid}.feed_forward.shared_expert.up_proj",       # llama4
            "model.layers.{bid}.feed_forward.down_proj",
            "model.layers.{bid}.mlp.shared_mlp.up_proj",                   # hunyuan
            "model.layers.{bid}.block_sparse_moe.shared_experts.up_proj",  # kimi
        ),

        MODEL_TENSOR.FFN_UP_CHEXP: (
@@ -500,6 +502,7 @@ class TensorNameMap:

        MODEL_TENSOR.FFN_GATE_CHEXP: (
            "model.layers.{bid}.mlp.chunk_experts.gate_proj",                # grovemoe
            "model.layers.{bid}.block_sparse_moe.shared_experts.gate_proj",  # kimi
        ),

        # Feed-forward down
@@ -557,6 +560,7 @@ class TensorNameMap:
            "model.layers.{bid}.feed_forward.shared_expert.down_proj",       # llama4
            "model.layers.{bid}.shared_mlp.output_linear",                   # granitemoe
            "model.layers.{bid}.mlp.shared_mlp.down_proj",                   # hunyuan
            "model.layers.{bid}.block_sparse_moe.shared_experts.down_proj",  # kimi
        ),

        MODEL_TENSOR.FFN_DOWN_CHEXP: (
@@ -738,6 +742,7 @@ class TensorNameMap:
            "model.layers.{bid}.mamba.norm",        # falcon-h1 granite-hybrid
            "model.layers.{bid}.linear_attn.norm",  # qwen3next
            "backbone.layers.{bid}.mixer.norm",     # mamba2
            "model.layers.{bid}.self_attn.o_norm",  # kimi
        ),

        MODEL_TENSOR.SSM_OUT: (
@@ -1569,6 +1574,38 @@ class TensorNameMap:
            "audio.multi_modal_projector.ln_mid",  # ultravox
        ),

        # Kimi Linear KDA (using SSM_ prefix for consistency)
        MODEL_TENSOR.SSM_CONV1D_Q: (
            "model.layers.{bid}.self_attn.q_conv1d",
        ),
        MODEL_TENSOR.SSM_CONV1D_K: (
            "model.layers.{bid}.self_attn.k_conv1d",
        ),
        MODEL_TENSOR.SSM_CONV1D_V: (
            "model.layers.{bid}.self_attn.v_conv1d",
        ),
        MODEL_TENSOR.SSM_F_A: (
            "model.layers.{bid}.self_attn.f_a_proj",
        ),
        MODEL_TENSOR.SSM_F_B: (
            "model.layers.{bid}.self_attn.f_b_proj",
        ),
        MODEL_TENSOR.SSM_BETA: (
            "model.layers.{bid}.self_attn.b_proj",
        ),
        MODEL_TENSOR.SSM_A_LOG: (
            "model.layers.{bid}.self_attn.A_log",
        ),
        MODEL_TENSOR.SSM_G_A: (
            "model.layers.{bid}.self_attn.g_a_proj",
        ),
        MODEL_TENSOR.SSM_G_B: (
            "model.layers.{bid}.self_attn.g_b_proj",
        ),
        MODEL_TENSOR.SSM_DT_B: (
            "model.layers.{bid}.self_attn.dt_bias",
        ),

        # NextN/MTP tensors for GLM4_MOE
        MODEL_TENSOR.NEXTN_EH_PROJ: (
            "model.layers.{bid}.eh_proj",
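With the mappings above in place, resolution from HF checkpoint names to GGUF names goes through TensorNameMap as usual; a sketch, assuming the patched gguf-py and a hypothetical block count of 27:

    import gguf

    # Resolve a Kimi Linear KDA tensor name via the new mappings.
    tmap = gguf.get_tensor_name_map(gguf.MODEL_ARCH.KIMI_LINEAR, 27)
    name = tmap.get_name("model.layers.3.self_attn.f_a_proj.weight",
                         try_suffixes=(".weight", ".bias"))
    print(name)  # -> "blk.3.ssm_f_a.weight"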

src/CMakeLists.txt
@@ -82,6 +82,7 @@ add_library(llama
            models/internlm2.cpp
            models/jais.cpp
            models/jamba.cpp
            models/kimi-linear.cpp
            models/lfm2.cpp
            models/llada-moe.cpp
            models/llada.cpp