Added new hparam names for n_experts, n_experts_used and score_func in TextModel and removed the now-redundant code from KimiLinearModel in convert_hf_to_gguf.py. Removed unnecessary ggml_cont and GGML_ASSERT calls in kimi-linear.cpp.

Yee Man Chan 2026-02-01 08:42:01 +08:00
parent 2a62df613f
commit 2c8cd844d0
2 changed files with 10 additions and 37 deletions
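The Python-side consolidation relies on TextModel.find_hparam trying each candidate key in order and returning the first hit, so appending Kimi's key names to the base-class lists makes the subclass overrides redundant. A minimal sketch of that assumed first-match behavior, with an illustrative Kimi-style hparams dict (the helper below is a simplification, not the llama.cpp implementation):

def find_hparam(hparams: dict, keys: list, optional: bool = False):
    # assumed behavior: return the value of the first candidate key present
    for key in keys:
        if key in hparams:
            return hparams[key]
    if optional:
        return None
    raise KeyError(f"could not find any of: {keys}")

hparams = {"num_experts": 64, "num_experts_per_token": 6, "moe_router_activation_func": "sigmoid"}
assert find_hparam(hparams, ["num_local_experts", "num_experts"], optional=True) == 64
assert find_hparam(hparams, ["num_experts_per_tok", "num_experts_per_token"], optional=True) == 6
assert find_hparam(hparams, ["score_function", "scoring_func", "score_func", "moe_router_activation_func"], optional=True) == "sigmoid"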

convert_hf_to_gguf.py

@@ -907,10 +907,10 @@ class TextModel(ModelBase):
         if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None:
             self.gguf_writer.add_layer_norm_eps(f_norm_eps)
             logger.info(f"gguf: layer norm epsilon = {f_norm_eps}")
-        if (n_experts := self.hparams.get("num_local_experts")) is not None:
+        if (n_experts := self.find_hparam(["num_local_experts", "num_experts"], optional=True)) is not None:
             self.gguf_writer.add_expert_count(n_experts)
             logger.info(f"gguf: expert count = {n_experts}")
-        if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
+        if (n_experts_used := self.find_hparam(["num_experts_per_tok", "num_experts_per_token"], optional=True)) is not None:
             self.gguf_writer.add_expert_used_count(n_experts_used)
             logger.info(f"gguf: experts used count = {n_experts_used}")
         if (n_expert_groups := self.hparams.get("n_group")) is not None:
@@ -920,7 +920,7 @@ class TextModel(ModelBase):
             self.gguf_writer.add_expert_group_used_count(n_group_used)
             logger.info(f"gguf: expert groups used count = {n_group_used}")
-        if (score_func := self.find_hparam(["score_function", "scoring_func", "score_func"], optional=True)) is not None:
+        if (score_func := self.find_hparam(["score_function", "scoring_func", "score_func", "moe_router_activation_func"], optional=True)) is not None:
             if score_func == "sigmoid":
                 self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
             elif score_func == "softmax":
@@ -5086,14 +5086,6 @@ class KimiLinearModel(TextModel):
         super().set_gguf_parameters()
         self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
-        if (score_func := self.find_hparam(["moe_router_activation_func"], optional=True)) is not None:
-            if score_func == "sigmoid":
-                self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
-            elif score_func == "softmax":
-                self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
-            else:
-                raise ValueError(f"Unsupported expert score gating function value: {score_func}")
         # KDA & MLA params
         # Get ssm_d_conv from linear_attn_config.short_conv_kernel_size or ssm_d_conv
         linear_attn_config = self.find_hparam(["linear_attn_config"], optional=False)
@@ -5152,11 +5144,6 @@ class KimiLinearModel(TextModel):
         head_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
         self.gguf_writer.add_rope_dimension_count(head_dim)
-        if (n_experts := self.find_hparam(["num_experts"], optional=False)) is not None:
-            self.gguf_writer.add_expert_count(n_experts)
-        if (n_experts_used := self.find_hparam(["num_experts_per_token"], optional=False)) is not None:
-            self.gguf_writer.add_expert_used_count(n_experts_used)
         # moe_intermediate_size (1024 for Kimi)
         if (moe_intermediate_size := self.find_hparam(["moe_intermediate_size"], optional=False)) is not None:
             self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
@@ -5227,7 +5214,6 @@ class KimiLinearModel(TextModel):
             if len(self._experts[bid]) >= n_experts * 3:
                 # merge the experts into a single 3d tensor
-                tensors = []
                 # w1: gate, w2: down, w3: up
                 for wid, tname in [("w1", gguf.MODEL_TENSOR.FFN_GATE_EXP),
                                    ("w2", gguf.MODEL_TENSOR.FFN_DOWN_EXP),
@@ -5237,12 +5223,10 @@ class KimiLinearModel(TextModel):
                         ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight"
                         datas.append(self._experts[bid][ename])
                         del self._experts[bid][ename]
                     data_torch = torch.stack(datas, dim=0)
                     new_name = self.format_tensor_name(tname, bid)
-                    tensors.append((new_name, data_torch))
-                return tensors
-            return []
+                    yield from super().modify_tensors(data_torch, new_name, bid)
+            return
         # note: MLA with the absorption optimization, needs these two split and k_b_proj transposed
         if name.endswith("kv_b_proj.weight"):
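For reference, a standalone torch sketch of what the expert-merging loop above produces; the dimensions are invented for illustration. Each expert contributes one 2D weight, and torch.stack adds a leading expert dimension so GGUF stores a single 3D tensor per projection:

import torch

n_expert, n_ff, n_embd = 4, 16, 8  # toy sizes, not Kimi's real config

# per-expert 2D weights, keyed the way the loop reads them
experts = {f"model.layers.0.block_sparse_moe.experts.{xid}.w1.weight": torch.randn(n_ff, n_embd)
           for xid in range(n_expert)}

datas = [experts[f"model.layers.0.block_sparse_moe.experts.{xid}.w1.weight"]
         for xid in range(n_expert)]
data_torch = torch.stack(datas, dim=0)  # -> {n_expert, n_ff, n_embd}
assert data_torch.shape == (n_expert, n_ff, n_embd)

Switching from returning a list to yield from super().modify_tensors(...) keeps the method a generator and routes the merged tensor through the base class's usual name mapping.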
@@ -5256,11 +5240,11 @@ class KimiLinearModel(TextModel):
             kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1])
             k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1)
             k_b = k_b.transpose(1, 2)
-            return [(self.map_tensor_name(name_kb), k_b), (self.map_tensor_name(name_vb), v_b)]
+            yield from super().modify_tensors(k_b, name_kb, bid)
+            yield from super().modify_tensors(v_b, name_vb, bid)
+            return
-        mapped_name = self.map_tensor_name(name)
-        logger.info(f"Returning {mapped_name}: shape after = {tuple(data_torch.shape)}")
-        return [(mapped_name, data_torch)]
+        yield from super().modify_tensors(data_torch, name, bid)
 @ModelBase.register("InternLM2ForCausalLM")
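The kv_b_proj split above implements the MLA absorption optimization mentioned in the note: the fused projection is viewed per-head, split into its K-nope and V parts, and k_b is transposed so it can later be folded into the query path. A self-contained torch sketch with made-up head sizes:

import torch

n_head_kv, qk_nope_head_dim, v_head_dim, kv_lora_rank = 2, 8, 4, 16  # illustrative only

# fused projection: rows pack (qk_nope + v) per head, columns are the KV latent rank
data_torch = torch.randn(n_head_kv * (qk_nope_head_dim + v_head_dim), kv_lora_rank)

kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1])
k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1)
k_b = k_b.transpose(1, 2)  # transposed so attention can absorb it, per the note

assert k_b.shape == (n_head_kv, kv_lora_rank, qk_nope_head_dim)
assert v_b.shape == (n_head_kv, v_head_dim, kv_lora_rank)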

kimi-linear.cpp

@@ -34,7 +34,7 @@ static ggml_tensor * causal_conv1d(ggml_cgraph * gf, ggml_context * ctx0, ggml_t
     ggml_tensor * x_3d = ggml_reshape_3d(ctx0, x_proj, d_inner, n_seq_tokens, n_seqs);
     // Concat Q conv state and current input: {d_conv-1 + n_seq_tokens, d_inner, n_seqs}
-    ggml_tensor * conv_x = ggml_cont(ctx0, ggml_concat(ctx0, conv_state_x, ggml_transpose(ctx0, x_3d), 0));
+    ggml_tensor * conv_x = ggml_concat(ctx0, conv_state_x, ggml_transpose(ctx0, x_3d), 0);
     // Save last (d_conv-1) columns back to Q conv state
     ggml_tensor * last_conv_x = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner, n_seqs,
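ggml_concat materializes its result in a fresh tensor rather than returning a strided view, which is presumably why the ggml_cont wrapper was a redundant copy. The rolling conv-state update itself is easiest to see in torch; note ggml lists dimensions innermost-first, so ggml's {d_conv-1 + n_seq_tokens, d_inner, n_seqs} corresponds to the trailing axis here (toy sizes throughout):

import torch

d_conv, d_inner, n_seq_tokens, n_seqs = 4, 8, 5, 2  # illustrative sizes

conv_state = torch.zeros(n_seqs, d_inner, d_conv - 1)  # cached last columns
x = torch.randn(n_seqs, d_inner, n_seq_tokens)         # current input window

# concat cached state and input along the time axis
conv_x = torch.cat([conv_state, x], dim=-1)            # time = d_conv-1 + n_seq_tokens

# save the last (d_conv-1) columns back as the next conv state
conv_state = conv_x[..., -(d_conv - 1):]
assert conv_state.shape == (n_seqs, d_inner, d_conv - 1)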
@@ -289,8 +289,6 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
                 ggml_row_size(kv->type, kv_per_head),
                 ggml_row_size(kv->type, kv_per_head * n_head),
                 ggml_row_size(kv->type, n_embd_head_qk_nope));
-        k_nope = ggml_cont(ctx0, k_nope);
-        Vcur = ggml_cont(ctx0, Vcur);
         cb(Vcur, "mla_V", il);
         // Concatenate k_nope + k_pe (broadcast k_pe to all heads)
@@ -403,11 +401,6 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_kimi_linear::build_kda_chunkin
         ggml_tensor * identity,
         ggml_tensor * diag_mask,
         int il) {
-    GGML_ASSERT(ggml_is_contiguous(q));
-    GGML_ASSERT(ggml_is_contiguous(k));
-    GGML_ASSERT(ggml_is_contiguous(v));
-    GGML_ASSERT(ggml_is_contiguous(gk));
-    GGML_ASSERT(ggml_is_contiguous(beta));
     GGML_ASSERT(ggml_is_contiguous(state));
     const int64_t S_k = q->ne[0];
@@ -694,12 +687,8 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_kimi_linear::build_kda_autoreg
         ggml_tensor * beta,
         ggml_tensor * state,
         int il) {
-    GGML_ASSERT(ggml_is_contiguous(q));
-    GGML_ASSERT(ggml_is_contiguous(k));
-    GGML_ASSERT(ggml_is_contiguous(v));
-    GGML_ASSERT(ggml_is_contiguous(gk));
     GGML_ASSERT(ggml_is_contiguous(beta));
     GGML_ASSERT(ggml_is_contiguous(state));
     const int64_t S_k = q->ne[0];
     const int64_t H_k = q->ne[1];
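Background on the dropped asserts: ggml_is_contiguous checks that a tensor's data is laid out compactly with unpermuted strides, which the kernels in these functions rely on. Presumably the callers now always pass contiguous q/k/v/gk (and, in the chunking path, beta), making those checks redundant while the state checks are kept. A torch analogy of the same concept:

import torch

q = torch.randn(8, 4)
qt = q.transpose(0, 1)   # a view with swapped strides, no data movement
assert q.is_contiguous()
assert not qt.is_contiguous()
qt = qt.contiguous()     # analogue of ggml_cont: makes a compact copy
assert qt.is_contiguous()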