From 44bc40fdd9af155f519b3caf4cca111ff6e52bd9 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Thu, 29 Jan 2026 16:06:06 +0100
Subject: [PATCH] wip

---
 convert_hf_to_gguf.py     | 48 +++++++++++++++++++++++++++++++++++++++
 gguf-py/gguf/constants.py | 32 ++++++++++++++++++++++++++
 2 files changed, 80 insertions(+)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index a391717e32..764c458be7 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -7513,6 +7513,7 @@ class DeepseekV2Model(TextModel):
             self._experts[bid][name] = data_torch
 
             if len(self._experts[bid]) >= n_experts * 3:
+                logger.debug("merging experts for block %d:\n%s", bid, '\n'.join(self._experts[bid].keys()))
                 # merge the experts into a single 3d tensor
                 for w_name in ["down_proj", "gate_proj", "up_proj"]:
                     datas: list[Tensor] = []
@@ -10914,6 +10915,53 @@ class SolarOpenModel(Glm4MoeModel):
         special_vocab.add_to_gguf(self.gguf_writer)
 
 
+@ModelBase.register("LongcatFlashForCausalLM")
+class LongcatFlashModel(DeepseekV2Model):
+    model_arch = gguf.MODEL_ARCH.LONGCAT_FLASH
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # the model uses paired (double) blocks, so we need to adjust the block count
+        self.block_count = self.hparams["num_layers"] * 2
+        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
+        # compat with deepseek2 base class hparam
+        self.hparams["num_hidden_layers"] = self.block_count
+        self.hparams["num_key_value_heads"] = self.hparams["num_attention_heads"]
+        self.hparams["intermediate_size"] = self.hparams["ffn_hidden_size"]
+        self.hparams["moe_intermediate_size"] = self.hparams["expert_ffn_hidden_size"]
+
+    def modify_tensors(self, data_torch, name, bid):
+        if bid is not None:
+            bid = bid * 2  # double block id
+
+        # Rename rules examples:
+        # model.layers.1.input_layernorm.0.weight --> model.layers.1.input_layernorm.weight
+        # model.layers.1.input_layernorm.1.weight --> model.layers.2.input_layernorm.weight
+        # model.layers.1.mlp.experts.0 --> model.layers.2.mlp.experts.0 (special case for experts)
+
+        name = name.replace('.mlps.', '.mlp.')
+        name = name.replace('.router.classifier.', '.gate.')
+        name = name.replace('.router.e_score_correction_bias', '.e_score_correction_bias')
+
+        # handle sub-block remapping
+        match = re.match(r'.*\.(\d+)\.([a-z_\.]+)\.(\d+)\..*', name)
+        if match and ".mlp.experts." not in name:
+            # convert block id from N.(name).M to (N+M).(name)
+            N = int(match.group(1))
+            middle = match.group(2)
+            M = int(match.group(3))
+            assert N * 2 == bid
+            new_bid = N * 2 + M
+            new_name = re.sub(r'\.(\d+)\.([a-z_\.]+)\.(\d+)\.', f'.{new_bid}.{middle}.', name)
+            logger.debug("renaming tensor from %s to %s", name, new_name)
+            yield from super().modify_tensors(data_torch, new_name, new_bid)
+        else:
+            # correct block inside name
+            if bid is not None:
+                name = name.replace(f'.{bid // 2}.', f'.{bid}.', 1)
+            yield from super().modify_tensors(data_torch, name, bid)
+
+
 ###### CONVERSION LOGIC ######
 
 
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 31273b2b5a..caa3dc2af4 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -459,6 +459,7 @@ class MODEL_ARCH(IntEnum):
     MIMO2            = auto()
     LLAMA_EMBED      = auto()
     MAINCODER        = auto()
+    LONGCAT_FLASH    = auto()
 
 
 class VISION_PROJECTOR_TYPE(IntEnum):
@@ -880,6 +881,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.MIMO2:            "mimo2",
     MODEL_ARCH.LLAMA_EMBED:      "llama-embed",
     MODEL_ARCH.MAINCODER:        "maincoder",
+    MODEL_ARCH.LONGCAT_FLASH:    "longcat-flash",
 }
 
 VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
@@ -3377,6 +3379,36 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.LONGCAT_FLASH: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_Q_A,
+        MODEL_TENSOR.ATTN_Q_B,
+        MODEL_TENSOR.ATTN_KV_A_MQA,
+        MODEL_TENSOR.ATTN_KV_B,
+        MODEL_TENSOR.ATTN_K_B,
+        MODEL_TENSOR.ATTN_V_B,
+        MODEL_TENSOR.ATTN_Q_A_NORM,
+        MODEL_TENSOR.ATTN_KV_A_NORM,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+        MODEL_TENSOR.FFN_GATE_SHEXP,
+        MODEL_TENSOR.FFN_DOWN_SHEXP,
+        MODEL_TENSOR.FFN_UP_SHEXP,
+        MODEL_TENSOR.FFN_EXP_PROBS_B,
+    ],
     # TODO
 }
 