From 2c8cd844d0c4d8a1a64403dab4f0017acd23ba06 Mon Sep 17 00:00:00 2001
From: Yee Man Chan
Date: Sun, 1 Feb 2026 08:42:01 +0800
Subject: [PATCH] added new names for n_experts, n_experts_used and score_func
 in TextModel and removed their code in KimiLinear in convert_hf_to_gguf.py.
 Removed unnecessary ggml_cont and GGML_ASSERT in kimi-linear.cpp

---
 convert_hf_to_gguf.py      | 34 +++++++++-------------------------
 src/models/kimi-linear.cpp | 13 +------------
 2 files changed, 10 insertions(+), 37 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index a1b4401198..08e4a12e45 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -907,10 +907,10 @@ class TextModel(ModelBase):
         if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None:
             self.gguf_writer.add_layer_norm_eps(f_norm_eps)
             logger.info(f"gguf: layer norm epsilon = {f_norm_eps}")
-        if (n_experts := self.hparams.get("num_local_experts")) is not None:
+        if (n_experts := self.find_hparam(["num_local_experts", "num_experts"], optional=True)) is not None:
             self.gguf_writer.add_expert_count(n_experts)
             logger.info(f"gguf: expert count = {n_experts}")
-        if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
+        if (n_experts_used := self.find_hparam(["num_experts_per_tok", "num_experts_per_token"], optional=True)) is not None:
             self.gguf_writer.add_expert_used_count(n_experts_used)
             logger.info(f"gguf: experts used count = {n_experts_used}")
         if (n_expert_groups := self.hparams.get("n_group")) is not None:
@@ -920,7 +920,7 @@ class TextModel(ModelBase):
             self.gguf_writer.add_expert_group_used_count(n_group_used)
             logger.info(f"gguf: expert groups used count = {n_group_used}")

-        if (score_func := self.find_hparam(["score_function", "scoring_func", "score_func"], optional=True)) is not None:
+        if (score_func := self.find_hparam(["score_function", "scoring_func", "score_func", "moe_router_activation_func"], optional=True)) is not None:
             if score_func == "sigmoid":
                 self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
             elif score_func == "softmax":
@@ -5086,14 +5086,6 @@ class KimiLinearModel(TextModel):
         super().set_gguf_parameters()
         self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])

-        if (score_func := self.find_hparam(["moe_router_activation_func"], optional=True)) is not None:
-            if score_func == "sigmoid":
-                self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
-            elif score_func == "softmax":
-                self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
-            else:
-                raise ValueError(f"Unsupported expert score gating function value: {score_func}")
-
         # KDA & MLA params
         # Get ssm_d_conv from linear_attn_config.short_conv_kernel_size or ssm_d_conv
         linear_attn_config = self.find_hparam(["linear_attn_config"], optional=False)
@@ -5152,11 +5144,6 @@ class KimiLinearModel(TextModel):
             head_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
             self.gguf_writer.add_rope_dimension_count(head_dim)

-        if (n_experts := self.find_hparam(["num_experts"], optional=False)) is not None:
-            self.gguf_writer.add_expert_count(n_experts)
-        if (n_experts_used := self.find_hparam(["num_experts_per_token"], optional=False)) is not None:
-            self.gguf_writer.add_expert_used_count(n_experts_used)
-
         # moe_intermediate_size (1024 for Kimi)
         if (moe_intermediate_size := self.find_hparam(["moe_intermediate_size"], optional=False)) is not None:
             self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
@@ -5227,7 +5214,6 @@ class KimiLinearModel(TextModel):
             if len(self._experts[bid]) >= n_experts * 3:
                 # merge the experts into a single 3d tensor
-                tensors = []

                 # w1: gate, w2: down, w3: up
                 for wid, tname in [("w1", gguf.MODEL_TENSOR.FFN_GATE_EXP),
                                    ("w2", gguf.MODEL_TENSOR.FFN_DOWN_EXP),
@@ -5237,12 +5223,10 @@ class KimiLinearModel(TextModel):
                         ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight"
                         datas.append(self._experts[bid][ename])
                         del self._experts[bid][ename]
-
                     data_torch = torch.stack(datas, dim=0)
                     new_name = self.format_tensor_name(tname, bid)
-                    tensors.append((new_name, data_torch))
-                return tensors
-            return []
+                    yield from super().modify_tensors(data_torch, new_name, bid)
+            return

         # note: MLA with the absorption optimization, needs these two split and k_b_proj transposed
         if name.endswith("kv_b_proj.weight"):
@@ -5256,11 +5240,11 @@ class KimiLinearModel(TextModel):
             kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1])
             k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1)
             k_b = k_b.transpose(1, 2)
-            return [(self.map_tensor_name(name_kb), k_b), (self.map_tensor_name(name_vb), v_b)]
+            yield from super().modify_tensors(k_b, name_kb, bid)
+            yield from super().modify_tensors(v_b, name_vb, bid)
+            return

-        mapped_name = self.map_tensor_name(name)
-        logger.info(f"Returning {mapped_name}: shape after = {tuple(data_torch.shape)}")
-        return [(mapped_name, data_torch)]
+        yield from super().modify_tensors(data_torch, name, bid)


 @ModelBase.register("InternLM2ForCausalLM")
diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp
index a7e5482008..83349cc9ec 100644
--- a/src/models/kimi-linear.cpp
+++ b/src/models/kimi-linear.cpp
@@ -34,7 +34,7 @@ static ggml_tensor * causal_conv1d(ggml_cgraph * gf, ggml_context * ctx0, ggml_t
     ggml_tensor * x_3d = ggml_reshape_3d(ctx0, x_proj, d_inner, n_seq_tokens, n_seqs);

     // Concat Q conv state and current input: {d_conv-1 + n_seq_tokens, d_inner, n_seqs}
-    ggml_tensor * conv_x = ggml_cont(ctx0, ggml_concat(ctx0, conv_state_x, ggml_transpose(ctx0, x_3d), 0));
+    ggml_tensor * conv_x = ggml_concat(ctx0, conv_state_x, ggml_transpose(ctx0, x_3d), 0);

     // Save last (d_conv-1) columns back to Q conv state
     ggml_tensor * last_conv_x = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner, n_seqs,
@@ -289,8 +289,6 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
                     ggml_row_size(kv->type, kv_per_head),
                     ggml_row_size(kv->type, kv_per_head * n_head),
                     ggml_row_size(kv->type, n_embd_head_qk_nope));
-            k_nope = ggml_cont(ctx0, k_nope);
-            Vcur = ggml_cont(ctx0, Vcur);
             cb(Vcur, "mla_V", il);

             // Concatenate k_nope + k_pe (broadcast k_pe to all heads)
@@ -403,11 +401,6 @@ std::pair llm_build_kimi_linear::build_kda_chunkin
                                                        ggml_tensor * identity,
                                                        ggml_tensor * diag_mask,
                                                        int il) {
-    GGML_ASSERT(ggml_is_contiguous(q));
-    GGML_ASSERT(ggml_is_contiguous(k));
-    GGML_ASSERT(ggml_is_contiguous(v));
-    GGML_ASSERT(ggml_is_contiguous(gk));
-    GGML_ASSERT(ggml_is_contiguous(beta));
     GGML_ASSERT(ggml_is_contiguous(state));

     const int64_t S_k = q->ne[0];
@@ -694,12 +687,8 @@ std::pair llm_build_kimi_linear::build_kda_autoreg
                                                        ggml_tensor * beta,
                                                        ggml_tensor * state,
                                                        int il) {
-    GGML_ASSERT(ggml_is_contiguous(q));
-    GGML_ASSERT(ggml_is_contiguous(k));
     GGML_ASSERT(ggml_is_contiguous(v));
     GGML_ASSERT(ggml_is_contiguous(gk));
-    GGML_ASSERT(ggml_is_contiguous(beta));
-    GGML_ASSERT(ggml_is_contiguous(state));

     const int64_t S_k = q->ne[0];
     const int64_t H_k = q->ne[1];