Added new hparam names for n_experts, n_experts_used and score_func in TextModel and removed the now-redundant code from KimiLinearModel in convert_hf_to_gguf.py. Removed unnecessary ggml_cont and GGML_ASSERT calls in kimi-linear.cpp.

Yee Man Chan 2026-02-01 08:42:01 +08:00
parent 2a62df613f
commit 2c8cd844d0
2 changed files with 10 additions and 37 deletions
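The Python-side consolidation relies on TextModel.find_hparam trying each candidate key in order and returning the first hit, so appending Kimi's key names to the base-class lists makes the subclass overrides redundant. A minimal sketch of that assumed first-match behavior, with an illustrative Kimi-style hparams dict (the helper below is a simplification, not the llama.cpp implementation):

def find_hparam(hparams: dict, keys: list, optional: bool = False):
    # assumed behavior: return the value of the first candidate key present
    for key in keys:
        if key in hparams:
            return hparams[key]
    if optional:
        return None
    raise KeyError(f"could not find any of: {keys}")

hparams = {"num_experts": 64, "num_experts_per_token": 6, "moe_router_activation_func": "sigmoid"}
assert find_hparam(hparams, ["num_local_experts", "num_experts"], optional=True) == 64
assert find_hparam(hparams, ["num_experts_per_tok", "num_experts_per_token"], optional=True) == 6
assert find_hparam(hparams, ["score_function", "scoring_func", "score_func", "moe_router_activation_func"], optional=True) == "sigmoid"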

convert_hf_to_gguf.py

@@ -907,10 +907,10 @@ class TextModel(ModelBase):
         if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None:
             self.gguf_writer.add_layer_norm_eps(f_norm_eps)
             logger.info(f"gguf: layer norm epsilon = {f_norm_eps}")
-        if (n_experts := self.hparams.get("num_local_experts")) is not None:
+        if (n_experts := self.find_hparam(["num_local_experts", "num_experts"], optional=True)) is not None:
             self.gguf_writer.add_expert_count(n_experts)
             logger.info(f"gguf: expert count = {n_experts}")
-        if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
+        if (n_experts_used := self.find_hparam(["num_experts_per_tok", "num_experts_per_token"], optional=True)) is not None:
             self.gguf_writer.add_expert_used_count(n_experts_used)
             logger.info(f"gguf: experts used count = {n_experts_used}")
         if (n_expert_groups := self.hparams.get("n_group")) is not None:
@@ -920,7 +920,7 @@ class TextModel(ModelBase):
             self.gguf_writer.add_expert_group_used_count(n_group_used)
             logger.info(f"gguf: expert groups used count = {n_group_used}")
-        if (score_func := self.find_hparam(["score_function", "scoring_func", "score_func"], optional=True)) is not None:
+        if (score_func := self.find_hparam(["score_function", "scoring_func", "score_func", "moe_router_activation_func"], optional=True)) is not None:
             if score_func == "sigmoid":
                 self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
             elif score_func == "softmax":
@@ -5086,14 +5086,6 @@ class KimiLinearModel(TextModel):
         super().set_gguf_parameters()
         self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
-        if (score_func := self.find_hparam(["moe_router_activation_func"], optional=True)) is not None:
-            if score_func == "sigmoid":
-                self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
-            elif score_func == "softmax":
-                self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
-            else:
-                raise ValueError(f"Unsupported expert score gating function value: {score_func}")
         # KDA & MLA params
         # Get ssm_d_conv from linear_attn_config.short_conv_kernel_size or ssm_d_conv
         linear_attn_config = self.find_hparam(["linear_attn_config"], optional=False)
@@ -5152,11 +5144,6 @@ class KimiLinearModel(TextModel):
         head_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
         self.gguf_writer.add_rope_dimension_count(head_dim)
-        if (n_experts := self.find_hparam(["num_experts"], optional=False)) is not None:
-            self.gguf_writer.add_expert_count(n_experts)
-        if (n_experts_used := self.find_hparam(["num_experts_per_token"], optional=False)) is not None:
-            self.gguf_writer.add_expert_used_count(n_experts_used)
         # moe_intermediate_size (1024 for Kimi)
         if (moe_intermediate_size := self.find_hparam(["moe_intermediate_size"], optional=False)) is not None:
             self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
@@ -5227,7 +5214,6 @@ class KimiLinearModel(TextModel):
             if len(self._experts[bid]) >= n_experts * 3:
                 # merge the experts into a single 3d tensor
-                tensors = []
                 # w1: gate, w2: down, w3: up
                 for wid, tname in [("w1", gguf.MODEL_TENSOR.FFN_GATE_EXP),
                                    ("w2", gguf.MODEL_TENSOR.FFN_DOWN_EXP),
@@ -5237,12 +5223,10 @@ class KimiLinearModel(TextModel):
                         ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight"
                         datas.append(self._experts[bid][ename])
                         del self._experts[bid][ename]
                     data_torch = torch.stack(datas, dim=0)
                     new_name = self.format_tensor_name(tname, bid)
-                    tensors.append((new_name, data_torch))
-                return tensors
-            return []
+                    yield from super().modify_tensors(data_torch, new_name, bid)
+            return
         # note: MLA with the absorption optimization, needs these two split and k_b_proj transposed
         if name.endswith("kv_b_proj.weight"):
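For reference, a standalone torch sketch of what the expert-merging loop above produces; the dimensions are invented for illustration. Each expert contributes one 2D weight, and torch.stack adds a leading expert dimension so GGUF stores a single 3D tensor per projection:

import torch

n_expert, n_ff, n_embd = 4, 16, 8  # toy sizes, not Kimi's real config

# per-expert 2D weights, keyed the way the loop reads them
experts = {f"model.layers.0.block_sparse_moe.experts.{xid}.w1.weight": torch.randn(n_ff, n_embd)
           for xid in range(n_expert)}

datas = [experts[f"model.layers.0.block_sparse_moe.experts.{xid}.w1.weight"]
         for xid in range(n_expert)]
data_torch = torch.stack(datas, dim=0)  # -> {n_expert, n_ff, n_embd}
assert data_torch.shape == (n_expert, n_ff, n_embd)

Switching from returning a list to yield from super().modify_tensors(...) keeps the method a generator and routes the merged tensor through the base class's usual name mapping.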
@@ -5256,11 +5240,11 @@ class KimiLinearModel(TextModel):
             kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1])
             k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1)
             k_b = k_b.transpose(1, 2)
-            return [(self.map_tensor_name(name_kb), k_b), (self.map_tensor_name(name_vb), v_b)]
+            yield from super().modify_tensors(k_b, name_kb, bid)
+            yield from super().modify_tensors(v_b, name_vb, bid)
+            return
-        mapped_name = self.map_tensor_name(name)
-        logger.info(f"Returning {mapped_name}: shape after = {tuple(data_torch.shape)}")
-        return [(mapped_name, data_torch)]
+        yield from super().modify_tensors(data_torch, name, bid)
 @ModelBase.register("InternLM2ForCausalLM")
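The kv_b_proj split above implements the MLA absorption optimization mentioned in the note: the fused projection is viewed per-head, split into its K-nope and V parts, and k_b is transposed so it can later be folded into the query path. A self-contained torch sketch with made-up head sizes:

import torch

n_head_kv, qk_nope_head_dim, v_head_dim, kv_lora_rank = 2, 8, 4, 16  # illustrative only

# fused projection: rows pack (qk_nope + v) per head, columns are the KV latent rank
data_torch = torch.randn(n_head_kv * (qk_nope_head_dim + v_head_dim), kv_lora_rank)

kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1])
k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1)
k_b = k_b.transpose(1, 2)  # transposed so attention can absorb it, per the note

assert k_b.shape == (n_head_kv, kv_lora_rank, qk_nope_head_dim)
assert v_b.shape == (n_head_kv, v_head_dim, kv_lora_rank)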

kimi-linear.cpp

@@ -34,7 +34,7 @@ static ggml_tensor * causal_conv1d(ggml_cgraph * gf, ggml_context * ctx0, ggml_t
     ggml_tensor * x_3d = ggml_reshape_3d(ctx0, x_proj, d_inner, n_seq_tokens, n_seqs);
     // Concat Q conv state and current input: {d_conv-1 + n_seq_tokens, d_inner, n_seqs}
-    ggml_tensor * conv_x = ggml_cont(ctx0, ggml_concat(ctx0, conv_state_x, ggml_transpose(ctx0, x_3d), 0));
+    ggml_tensor * conv_x = ggml_concat(ctx0, conv_state_x, ggml_transpose(ctx0, x_3d), 0);
     // Save last (d_conv-1) columns back to Q conv state
     ggml_tensor * last_conv_x = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner, n_seqs,
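ggml_concat materializes its result in a fresh tensor rather than returning a strided view, which is presumably why the ggml_cont wrapper was a redundant copy. The rolling conv-state update itself is easiest to see in torch; note ggml lists dimensions innermost-first, so ggml's {d_conv-1 + n_seq_tokens, d_inner, n_seqs} corresponds to the trailing axis here (toy sizes throughout):

import torch

d_conv, d_inner, n_seq_tokens, n_seqs = 4, 8, 5, 2  # illustrative sizes

conv_state = torch.zeros(n_seqs, d_inner, d_conv - 1)  # cached last columns
x = torch.randn(n_seqs, d_inner, n_seq_tokens)         # current input window

# concat cached state and input along the time axis
conv_x = torch.cat([conv_state, x], dim=-1)            # time = d_conv-1 + n_seq_tokens

# save the last (d_conv-1) columns back as the next conv state
conv_state = conv_x[..., -(d_conv - 1):]
assert conv_state.shape == (n_seqs, d_inner, d_conv - 1)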
@@ -289,8 +289,6 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
                 ggml_row_size(kv->type, kv_per_head),
                 ggml_row_size(kv->type, kv_per_head * n_head),
                 ggml_row_size(kv->type, n_embd_head_qk_nope));
-        k_nope = ggml_cont(ctx0, k_nope);
-        Vcur = ggml_cont(ctx0, Vcur);
         cb(Vcur, "mla_V", il);
         // Concatenate k_nope + k_pe (broadcast k_pe to all heads)
@@ -403,11 +401,6 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_kimi_linear::build_kda_chunkin
         ggml_tensor * identity,
         ggml_tensor * diag_mask,
         int il) {
-    GGML_ASSERT(ggml_is_contiguous(q));
-    GGML_ASSERT(ggml_is_contiguous(k));
-    GGML_ASSERT(ggml_is_contiguous(v));
-    GGML_ASSERT(ggml_is_contiguous(gk));
-    GGML_ASSERT(ggml_is_contiguous(beta));
     GGML_ASSERT(ggml_is_contiguous(state));
     const int64_t S_k = q->ne[0];
@@ -694,12 +687,8 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_kimi_linear::build_kda_autoreg
         ggml_tensor * beta,
         ggml_tensor * state,
         int il) {
-    GGML_ASSERT(ggml_is_contiguous(q));
-    GGML_ASSERT(ggml_is_contiguous(k));
-    GGML_ASSERT(ggml_is_contiguous(v));
-    GGML_ASSERT(ggml_is_contiguous(gk));
     GGML_ASSERT(ggml_is_contiguous(beta));
     GGML_ASSERT(ggml_is_contiguous(state));
     const int64_t S_k = q->ne[0];
     const int64_t H_k = q->ne[1];
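Background on the dropped asserts: ggml_is_contiguous checks that a tensor's data is laid out compactly with unpermuted strides, which the kernels in these functions rely on. Presumably the callers now always pass contiguous q/k/v/gk (and, in the chunking path, beta), making those checks redundant while the state checks are kept. A torch analogy of the same concept:

import torch

q = torch.randn(8, 4)
qt = q.transpose(0, 1)   # a view with swapped strides, no data movement
assert q.is_contiguous()
assert not qt.is_contiguous()
qt = qt.contiguous()     # analogue of ggml_cont: makes a compact copy
assert qt.is_contiguous()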