This commit is contained in:
Xuan Son Nguyen 2026-01-29 16:06:06 +01:00
parent b45ef2702c
commit 44bc40fdd9
2 changed files with 80 additions and 0 deletions

View File

@ -7513,6 +7513,7 @@ class DeepseekV2Model(TextModel):
self._experts[bid][name] = data_torch
if len(self._experts[bid]) >= n_experts * 3:
print("->>>> Merging experts for block", bid, '\n'.join(self._experts[bid].keys()))
# merge the experts into a single 3d tensor
for w_name in ["down_proj", "gate_proj", "up_proj"]:
datas: list[Tensor] = []
@ -10914,6 +10915,53 @@ class SolarOpenModel(Glm4MoeModel):
special_vocab.add_to_gguf(self.gguf_writer)
@ModelBase.register("LongcatFlashForCausalLM")
class LongcatFlashModel(DeepseekV2Model):
    """Converter for LongCat-Flash checkpoints.

    LongCat-Flash packs two transformer sub-blocks into each HF "layer", so
    the GGUF block count is doubled and HF tensor names of the form
    ``...<N>.<middle>.<M>...`` are re-indexed to the flat block id ``2*N + M``.
    """

    model_arch = gguf.MODEL_ARCH.LONGCAT_FLASH

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # The model uses double blocks, so the effective block count doubles.
        self.block_count = self.hparams["num_layers"] * 2
        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
        # Mirror LongCat hparam names onto the keys the DeepseekV2 base class reads.
        self.hparams["num_hidden_layers"] = self.block_count
        self.hparams["num_key_value_heads"] = self.hparams["num_attention_heads"]
        self.hparams["intermediate_size"] = self.hparams["ffn_hidden_size"]
        self.hparams["moe_intermediate_size"] = self.hparams["expert_ffn_hidden_size"]

    def modify_tensors(self, data_torch, name, bid):
        """Rename LongCat tensors to the DeepseekV2 scheme and re-index blocks.

        Yields (name, tensor) pairs via the DeepseekV2 base implementation.
        Raises ValueError if a tensor's embedded layer index disagrees with
        the block id the caller derived from the same name.
        """
        if bid is not None:
            bid = bid * 2  # each HF layer maps to two GGUF blocks

        # Normalize LongCat-specific tensor names to the DeepseekV2 naming.
        name = name.replace('.mlps.', '.mlp.')
        name = name.replace('.router.classifier.', '.gate.')
        name = name.replace('.router.e_score_correction_bias', '.e_score_correction_bias')

        # Sub-block remapping examples (flat id = 2*N + M):
        #   model.layers.1.input_layernorm.0.weight --> model.layers.2.input_layernorm.weight
        #   model.layers.1.input_layernorm.1.weight --> model.layers.3.input_layernorm.weight
        # Expert tensors keep a trailing ".<expert_id>." index of their own and
        # must not be remapped here, so they are excluded from the match.
        match = re.match(r'.*\.(\d+)\.([a-z_.]+)\.(\d+)\..*', name)
        if match and ".mlp.experts." not in name:
            layer_id = int(match.group(1))
            middle = match.group(2)
            sub_id = int(match.group(3))
            # Validate with an explicit raise: `assert` is stripped under -O.
            if layer_id * 2 != bid:
                raise ValueError(f"layer index {layer_id} inconsistent with block id {bid} in tensor {name!r}")
            new_bid = layer_id * 2 + sub_id
            new_name = re.sub(r'\.(\d+)\.([a-z_.]+)\.(\d+)\.', f'.{new_bid}.{middle}.', name)
            yield from super().modify_tensors(data_torch, new_name, new_bid)
        else:
            # No sub-block index in the name: rewrite only the first occurrence
            # of the original layer index to the (doubled) block id.
            if bid is not None:
                name = name.replace(f'.{bid // 2}.', f'.{bid}.', 1)
            yield from super().modify_tensors(data_torch, name, bid)
###### CONVERSION LOGIC ######

View File

@ -459,6 +459,7 @@ class MODEL_ARCH(IntEnum):
MIMO2 = auto()
LLAMA_EMBED = auto()
MAINCODER = auto()
LONGCAT_FLASH = auto()
class VISION_PROJECTOR_TYPE(IntEnum):
@ -880,6 +881,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.MIMO2: "mimo2",
MODEL_ARCH.LLAMA_EMBED: "llama-embed",
MODEL_ARCH.MAINCODER: "maincoder",
MODEL_ARCH.LONGCAT_FLASH: "longcat-flash",
}
VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
@ -3377,6 +3379,36 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.LONGCAT_FLASH: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_Q_A,
MODEL_TENSOR.ATTN_Q_B,
MODEL_TENSOR.ATTN_KV_A_MQA,
MODEL_TENSOR.ATTN_KV_B,
MODEL_TENSOR.ATTN_K_B,
MODEL_TENSOR.ATTN_V_B,
MODEL_TENSOR.ATTN_Q_A_NORM,
MODEL_TENSOR.ATTN_KV_A_NORM,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.ATTN_ROT_EMBD,
MODEL_TENSOR.FFN_GATE_INP,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.FFN_GATE_EXP,
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
MODEL_TENSOR.FFN_GATE_SHEXP,
MODEL_TENSOR.FFN_DOWN_SHEXP,
MODEL_TENSOR.FFN_UP_SHEXP,
MODEL_TENSOR.FFN_EXP_PROBS_B,
],
# TODO
}