wip

parent b45ef2702c
commit 44bc40fdd9
@@ -7513,6 +7513,7 @@ class DeepseekV2Model(TextModel):
             self._experts[bid][name] = data_torch

             if len(self._experts[bid]) >= n_experts * 3:
+                print("->>>> Merging experts for block", bid, '\n'.join(self._experts[bid].keys()))
                 # merge the experts into a single 3d tensor
                 for w_name in ["down_proj", "gate_proj", "up_proj"]:
                     datas: list[Tensor] = []
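(Not part of the commit: a minimal sketch of why the trigger above is n_experts * 3. Each routed expert contributes three tensors, one per projection, so the merge starts only once every expert's down/gate/up weight has been collected. The layer index and expert count below are illustrative.)

n_experts = 64  # illustrative; the converter reads this from hparams
collected: dict[str, None] = {}
for e in range(n_experts):
    for proj in ("down_proj", "gate_proj", "up_proj"):
        collected[f"model.layers.0.mlp.experts.{e}.{proj}.weight"] = None
# the same condition the converter checks before merging into a 3d tensor
assert len(collected) >= n_experts * 3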
@@ -10914,6 +10915,53 @@ class SolarOpenModel(Glm4MoeModel):
         special_vocab.add_to_gguf(self.gguf_writer)


+@ModelBase.register("LongcatFlashForCausalLM")
+class LongcatFlashModel(DeepseekV2Model):
+    model_arch = gguf.MODEL_ARCH.LONGCAT_FLASH
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # the model uses double blocks, so the block count must be doubled
+        self.block_count = self.hparams["num_layers"] * 2
+        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
+        # compat with the deepseek2 base class hparams
+        self.hparams["num_hidden_layers"] = self.block_count
+        self.hparams["num_key_value_heads"] = self.hparams["num_attention_heads"]
+        self.hparams["intermediate_size"] = self.hparams["ffn_hidden_size"]
+        self.hparams["moe_intermediate_size"] = self.hparams["expert_ffn_hidden_size"]
+
+    def modify_tensors(self, data_torch, name, bid):
+        if bid is not None:
+            bid = bid * 2  # double the block id
+
+        # Rename rule examples:
+        #   model.layers.1.input_layernorm.0.weight --> model.layers.2.input_layernorm.weight
+        #   model.layers.1.input_layernorm.1.weight --> model.layers.3.input_layernorm.weight
+        #   model.layers.1.mlp.experts.0 --> model.layers.2.mlp.experts.0 (special case for experts)
+
+        name = name.replace('.mlps.', '.mlp.')
+        name = name.replace('.router.classifier.', '.gate.')
+        name = name.replace('.router.e_score_correction_bias', '.e_score_correction_bias')
+
+        # handle sub-block remapping
+        match = re.match(r'.*\.(\d+)\.([a-z_\.]+)\.(\d+)\..*', name)
+        if match and ".mlp.experts." not in name:
+            # convert block id from N.(name).M to (2N+M).(name)
+            N = int(match.group(1))
+            middle = match.group(2)
+            M = int(match.group(3))
+            assert N * 2 == bid
+            new_bid = N * 2 + M
+            new_name = re.sub(r'\.(\d+)\.([a-z_\.]+)\.(\d+)\.', f'.{new_bid}.{middle}.', name)
+            print(f"Renaming tensor from {name} to {new_name}")
+            yield from super().modify_tensors(data_torch, new_name, new_bid)
+        else:
+            # correct the block id inside the name
+            if bid is not None:
+                name = name.replace(f'.{bid // 2}.', f'.{bid}.', 1)
+            yield from super().modify_tensors(data_torch, name, bid)
+
+
 ###### CONVERSION LOGIC ######


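(Not part of the commit: a standalone sketch of the sub-block remapping above, using the example names from the class's own comment. The remap helper is hypothetical; the real logic lives in LongcatFlashModel.modify_tensors. Each original layer N carries two sub-blocks 0 and 1 that land in GGUF blocks 2N and 2N+1, while expert tensors keep their expert index and only have the block id rewritten.)

import re

def remap(name: str, bid: int) -> tuple[str, int]:
    # hypothetical helper mirroring the renaming in modify_tensors above
    bid = bid * 2  # double the block id, as the commit does
    m = re.match(r'.*\.(\d+)\.([a-z_\.]+)\.(\d+)\..*', name)
    if m and ".mlp.experts." not in name:
        n, middle, sub = int(m.group(1)), m.group(2), int(m.group(3))
        new_bid = n * 2 + sub  # sub-block M of layer N becomes block 2N+M
        return re.sub(r'\.(\d+)\.([a-z_\.]+)\.(\d+)\.', f'.{new_bid}.{middle}.', name), new_bid
    return name.replace(f'.{bid // 2}.', f'.{bid}.', 1), bid

print(remap("model.layers.1.input_layernorm.0.weight", 1))
# -> ('model.layers.2.input_layernorm.weight', 2)
print(remap("model.layers.1.input_layernorm.1.weight", 1))
# -> ('model.layers.3.input_layernorm.weight', 3)
print(remap("model.layers.1.mlp.experts.0.gate_proj.weight", 1))
# -> ('model.layers.2.mlp.experts.0.gate_proj.weight', 2)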
@@ -459,6 +459,7 @@ class MODEL_ARCH(IntEnum):
     MIMO2 = auto()
     LLAMA_EMBED = auto()
     MAINCODER = auto()
+    LONGCAT_FLASH = auto()


 class VISION_PROJECTOR_TYPE(IntEnum):
@@ -880,6 +881,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.MIMO2: "mimo2",
     MODEL_ARCH.LLAMA_EMBED: "llama-embed",
     MODEL_ARCH.MAINCODER: "maincoder",
+    MODEL_ARCH.LONGCAT_FLASH: "longcat-flash",
 }

 VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
@@ -3377,6 +3379,36 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.LONGCAT_FLASH: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_Q_A,
+        MODEL_TENSOR.ATTN_Q_B,
+        MODEL_TENSOR.ATTN_KV_A_MQA,
+        MODEL_TENSOR.ATTN_KV_B,
+        MODEL_TENSOR.ATTN_K_B,
+        MODEL_TENSOR.ATTN_V_B,
+        MODEL_TENSOR.ATTN_Q_A_NORM,
+        MODEL_TENSOR.ATTN_KV_A_NORM,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+        MODEL_TENSOR.FFN_GATE_SHEXP,
+        MODEL_TENSOR.FFN_DOWN_SHEXP,
+        MODEL_TENSOR.FFN_UP_SHEXP,
+        MODEL_TENSOR.FFN_EXP_PROBS_B,
+    ],
     # TODO
 }

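(Not part of the commit: after these registrations, the new architecture is reachable through gguf-py's public tables. A quick sanity check, assuming the gguf package from this tree is on the path.)

import gguf

# the enum value, its serialized name, and its tensor list are all registered
arch = gguf.MODEL_ARCH.LONGCAT_FLASH
assert gguf.MODEL_ARCH_NAMES[arch] == "longcat-flash"
assert gguf.MODEL_TENSOR.ATTN_KV_A_MQA in gguf.MODEL_TENSORS[arch]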