From 2c8cd844d0c4d8a1a64403dab4f0017acd23ba06 Mon Sep 17 00:00:00 2001
From: Yee Man Chan
Date: Sun, 1 Feb 2026 08:42:01 +0800
Subject: [PATCH] added new names for n_experts, n_experts_used and score_func
 in TextModel and removed their code in KimiLinear in convert_hf_to_gguf.py.
 Removed unnecessary ggml_cont and GGML_ASSERT in kimi-linear.cpp

---
 convert_hf_to_gguf.py      | 34 +++++++++-------------------------
 src/models/kimi-linear.cpp | 13 +------------
 2 files changed, 10 insertions(+), 37 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index a1b4401198..08e4a12e45 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -907,10 +907,10 @@ class TextModel(ModelBase):
         if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None:
             self.gguf_writer.add_layer_norm_eps(f_norm_eps)
             logger.info(f"gguf: layer norm epsilon = {f_norm_eps}")
-        if (n_experts := self.hparams.get("num_local_experts")) is not None:
+        if (n_experts := self.find_hparam(["num_local_experts", "num_experts"], optional=True)) is not None:
             self.gguf_writer.add_expert_count(n_experts)
             logger.info(f"gguf: expert count = {n_experts}")
-        if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
+        if (n_experts_used := self.find_hparam(["num_experts_per_tok", "num_experts_per_token"], optional=True)) is not None:
             self.gguf_writer.add_expert_used_count(n_experts_used)
             logger.info(f"gguf: experts used count = {n_experts_used}")
         if (n_expert_groups := self.hparams.get("n_group")) is not None:
@@ -920,7 +920,7 @@ class TextModel(ModelBase):
             self.gguf_writer.add_expert_group_used_count(n_group_used)
             logger.info(f"gguf: expert groups used count = {n_group_used}")

-        if (score_func := self.find_hparam(["score_function", "scoring_func", "score_func"], optional=True)) is not None:
+        if (score_func := self.find_hparam(["score_function", "scoring_func", "score_func", "moe_router_activation_func"], optional=True)) is not None:
             if score_func == "sigmoid":
                 self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
             elif score_func == "softmax":
@@ -5086,14 +5086,6 @@ class KimiLinearModel(TextModel):
         super().set_gguf_parameters()
         self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])

-        if (score_func := self.find_hparam(["moe_router_activation_func"], optional=True)) is not None:
-            if score_func == "sigmoid":
-                self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
-            elif score_func == "softmax":
-                self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
-            else:
-                raise ValueError(f"Unsupported expert score gating function value: {score_func}")
-
         # KDA & MLA params
         # Get ssm_d_conv from linear_attn_config.short_conv_kernel_size or ssm_d_conv
         linear_attn_config = self.find_hparam(["linear_attn_config"], optional=False)
@@ -5152,11 +5144,6 @@ class KimiLinearModel(TextModel):
             head_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
             self.gguf_writer.add_rope_dimension_count(head_dim)

-        if (n_experts := self.find_hparam(["num_experts"], optional=False)) is not None:
-            self.gguf_writer.add_expert_count(n_experts)
-        if (n_experts_used := self.find_hparam(["num_experts_per_token"], optional=False)) is not None:
-            self.gguf_writer.add_expert_used_count(n_experts_used)
-
         # moe_intermediate_size (1024 for Kimi)
         if (moe_intermediate_size := self.find_hparam(["moe_intermediate_size"], optional=False)) is not None:
             self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
@@ -5227,7 +5214,6 @@ class KimiLinearModel(TextModel):
             if len(self._experts[bid]) >= n_experts * 3:
                 # merge the experts into a single 3d tensor
-                tensors = []

                 # w1: gate, w2: down, w3: up
                 for wid, tname in [("w1", gguf.MODEL_TENSOR.FFN_GATE_EXP),
                                    ("w2", gguf.MODEL_TENSOR.FFN_DOWN_EXP),
@@ -5237,12 +5223,10 @@ class KimiLinearModel(TextModel):
                         ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight"
                         datas.append(self._experts[bid][ename])
                         del self._experts[bid][ename]
-
                     data_torch = torch.stack(datas, dim=0)
                     new_name = self.format_tensor_name(tname, bid)
-                    tensors.append((new_name, data_torch))
-                return tensors
-            return []
+                    yield from super().modify_tensors(data_torch, new_name, bid)
+            return

         # note: MLA with the absorption optimization, needs these two split and k_b_proj transposed
         if name.endswith("kv_b_proj.weight"):
@@ -5256,11 +5240,11 @@ class KimiLinearModel(TextModel):
             kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1])
             k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1)
             k_b = k_b.transpose(1, 2)
-            return [(self.map_tensor_name(name_kb), k_b), (self.map_tensor_name(name_vb), v_b)]
+            yield from super().modify_tensors(k_b, name_kb, bid)
+            yield from super().modify_tensors(v_b, name_vb, bid)
+            return

-        mapped_name = self.map_tensor_name(name)
-        logger.info(f"Returning {mapped_name}: shape after = {tuple(data_torch.shape)}")
-        return [(mapped_name, data_torch)]
+        yield from super().modify_tensors(data_torch, name, bid)


 @ModelBase.register("InternLM2ForCausalLM")
diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp
index a7e5482008..83349cc9ec 100644
--- a/src/models/kimi-linear.cpp
+++ b/src/models/kimi-linear.cpp
@@ -34,7 +34,7 @@ static ggml_tensor * causal_conv1d(ggml_cgraph * gf, ggml_context * ctx0, ggml_t
     ggml_tensor * x_3d = ggml_reshape_3d(ctx0, x_proj, d_inner, n_seq_tokens, n_seqs);

     // Concat Q conv state and current input: {d_conv-1 + n_seq_tokens, d_inner, n_seqs}
-    ggml_tensor * conv_x = ggml_cont(ctx0, ggml_concat(ctx0, conv_state_x, ggml_transpose(ctx0, x_3d), 0));
+    ggml_tensor * conv_x = ggml_concat(ctx0, conv_state_x, ggml_transpose(ctx0, x_3d), 0);

     // Save last (d_conv-1) columns back to Q conv state
     ggml_tensor * last_conv_x = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner, n_seqs,
@@ -289,8 +289,6 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
                     ggml_row_size(kv->type, kv_per_head),
                     ggml_row_size(kv->type, kv_per_head * n_head),
                     ggml_row_size(kv->type, n_embd_head_qk_nope));
-            k_nope = ggml_cont(ctx0, k_nope);
-            Vcur = ggml_cont(ctx0, Vcur);
             cb(Vcur, "mla_V", il);

             // Concatenate k_nope + k_pe (broadcast k_pe to all heads)
@@ -403,11 +401,6 @@ std::pair llm_build_kimi_linear::build_kda_chunkin
                                                        ggml_tensor * identity,
                                                        ggml_tensor * diag_mask,
                                                        int il) {
-    GGML_ASSERT(ggml_is_contiguous(q));
-    GGML_ASSERT(ggml_is_contiguous(k));
-    GGML_ASSERT(ggml_is_contiguous(v));
-    GGML_ASSERT(ggml_is_contiguous(gk));
-    GGML_ASSERT(ggml_is_contiguous(beta));
     GGML_ASSERT(ggml_is_contiguous(state));

     const int64_t S_k = q->ne[0];
@@ -694,12 +687,8 @@ std::pair llm_build_kimi_linear::build_kda_autoreg
                                                        ggml_tensor * beta,
                                                        ggml_tensor * state,
                                                        int il) {
-    GGML_ASSERT(ggml_is_contiguous(q));
-    GGML_ASSERT(ggml_is_contiguous(k));
     GGML_ASSERT(ggml_is_contiguous(v));
     GGML_ASSERT(ggml_is_contiguous(gk));
-    GGML_ASSERT(ggml_is_contiguous(beta));
-    GGML_ASSERT(ggml_is_contiguous(state));

     const int64_t S_k = q->ne[0];
     const int64_t H_k = q->ne[1];