From dce064c0a3882b20d1b2a78c01cc181afa461e67 Mon Sep 17 00:00:00 2001
From: Yee Man Chan
Date: Sat, 10 Jan 2026 22:08:38 +0800
Subject: [PATCH] fixed typo and split wkv_b into wk_b and wv_b

---
 convert_hf_to_gguf.py          | 25 +++++++++++++++++++++++--
 gguf-py/gguf/constants.py      |  2 ++
 gguf-py/gguf/tensor_mapping.py |  6 +++---
 3 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 59ee156dd9..321930d7e6 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -5275,7 +5275,8 @@ class KimiLinearModel(TextModel):
 
         # Kimi specific bias
         if name.endswith("e_score_correction_bias"):
-            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
+            new_name = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_EXP_PROBS_B, bid)
+            return [(new_name, data_torch)]
 
         # process the experts separately
         if name.find("block_sparse_moe.experts") != -1:
@@ -5305,7 +5306,27 @@ class KimiLinearModel(TextModel):
                 tensors.append((new_name, data_torch))
                 return tensors
             return []
-
+
+        # note: MLA with the absorption optimization, needs these two split and k_b_proj transposed
+        if name.endswith("kv_b_proj.weight"):
+            name_kb = name.replace("kv_b_proj", "k_b_proj")
+            name_vb = name.replace("kv_b_proj", "v_b_proj")
+
+            n_head_kv = self.hparams["num_key_value_heads"]
+            v_head_dim = self.hparams["v_head_dim"]
+            qk_nope_head_dim = self.hparams["qk_nope_head_dim"]
+
+            assert data_torch.shape[0] == n_head_kv * (v_head_dim + qk_nope_head_dim)
+
+            kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1])
+            k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1)
+            k_b = k_b.transpose(1, 2)
+
+            return [
+                (self.map_tensor_name(name_kb), k_b),
+                (self.map_tensor_name(name_vb), v_b)
+            ]
+
         mapped_name = self.map_tensor_name(name)
         logger.info(f"Returning {mapped_name}: shape after = {tuple(data_torch.shape)}")
         return [(mapped_name, data_torch)]
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 372489ca44..8d2b54d7d5 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -3317,6 +3317,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.ATTN_Q_B,
         MODEL_TENSOR.ATTN_KV_A_MQA,
         MODEL_TENSOR.ATTN_KV_B,
+        MODEL_TENSOR.ATTN_K_B,
+        MODEL_TENSOR.ATTN_V_B,
         MODEL_TENSOR.ATTN_Q_A_NORM,
         MODEL_TENSOR.ATTN_KV_A_NORM,
         MODEL_TENSOR.FFN_NORM,
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index b0d4fb1cb1..486f6a5b1d 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -403,7 +403,7 @@ class TensorNameMap:
             "model.layers.{bid}.feed_forward.expert_bias",                  # lfm2moe
             "model.layers.{bid}.block_sparse_moe.e_score_correction",       # minimax-m2
-            "backbone.layers.{bid}.mixer.gate.e_score_correction"           # nemotron-h-moe
-            "model.layers.{bid}.block_sparse_moe.gate.e_score_correction",  # kimi
+            "backbone.layers.{bid}.mixer.gate.e_score_correction",          # nemotron-h-moe
+            "model.layers.{bid}.block_sparse_moe.gate.e_score_correction_bias",  # kimi
         ),
 
         # Feed-forward up
@@ -812,7 +812,7 @@ class TensorNameMap:
         MODEL_TENSOR.SSM_DT_B: (
             "model.layers.{bid}.self_attn.dt_bias",
-
+        ),
 
         MODEL_TENSOR.TIME_MIX_W0: (
             "model.layers.{bid}.attention.w0",  # rwkv7
         ),