From 4239bf2cde234c65eaa3c3e4da9a005ae079f792 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Thu, 18 Dec 2025 23:29:14 +0100
Subject: [PATCH] add prefix "a." for conv tensors

---
 gguf-py/gguf/constants.py      | 29 +++++++++++++++++++----------
 gguf-py/gguf/tensor_mapping.py | 28 ++++++++++++++++++++--------
 tools/mtmd/clip-impl.h         |  4 ++++
 tools/mtmd/clip.cpp            | 16 ++++++++--------
 4 files changed, 51 insertions(+), 26 deletions(-)

diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index cd55ccc271..cab8f2901a 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -726,6 +726,10 @@ class MODEL_TENSOR(IntEnum):
     A_ENC_POS_BIAS_U     = auto()
     A_ENC_POS_BIAS_V     = auto()
     A_ENC_OUT            = auto()
+    A_ENC_CONV_DW        = auto() # SSM conv
+    A_ENC_CONV_NORM      = auto() # SSM conv
+    A_ENC_CONV_PW1       = auto()
+    A_ENC_CONV_PW2       = auto()
 
 
 MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@@ -1077,6 +1081,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.V_TOK_BOI:              "v.boi",
     MODEL_TENSOR.V_TOK_EOI:              "v.eoi",
     # audio (mtmd)
+    # note: all audio tensor names must use prefix "a." or "mm.a."
     MODEL_TENSOR.A_ENC_EMBD_POS:         "a.position_embd",
     MODEL_TENSOR.A_ENC_EMBD_NORM:        "a.position_embd_norm",
     MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS:   "a.embd_to_logits",
@@ -1101,6 +1106,16 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.A_MMPROJ_FC:            "mm.a.fc",
     MODEL_TENSOR.A_MM_NORM_PRE:          "mm.a.norm_pre",
     MODEL_TENSOR.A_MM_NORM_MID:          "mm.a.norm_mid",
+    # lfm2 audio
+    MODEL_TENSOR.A_ENC_NORM_CONV:        "a.blk.{bid}.norm_conv",
+    MODEL_TENSOR.A_ENC_LINEAR_POS:       "a.blk.{bid}.linear_pos",
+    MODEL_TENSOR.A_ENC_POS_BIAS_U:       "a.blk.{bid}.pos_bias_u",
+    MODEL_TENSOR.A_ENC_POS_BIAS_V:       "a.blk.{bid}.pos_bias_v",
+    MODEL_TENSOR.A_ENC_OUT:              "a.pre_encode.out",
+    MODEL_TENSOR.A_ENC_CONV_DW:          "a.blk.{bid}.conv_dw",
+    MODEL_TENSOR.A_ENC_CONV_NORM:        "a.blk.{bid}.conv_norm",
+    MODEL_TENSOR.A_ENC_CONV_PW1:         "a.blk.{bid}.conv_pw1",
+    MODEL_TENSOR.A_ENC_CONV_PW2:         "a.blk.{bid}.conv_pw2",
     # NextN/MTP
     MODEL_TENSOR.NEXTN_EH_PROJ:          "blk.{bid}.nextn.eh_proj",
     MODEL_TENSOR.NEXTN_EMBED_TOKENS:     "blk.{bid}.nextn.embed_tokens",
@@ -1108,12 +1123,6 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.NEXTN_HNORM:            "blk.{bid}.nextn.hnorm",
     MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD: "blk.{bid}.nextn.shared_head_head",
     MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM: "blk.{bid}.nextn.shared_head_norm",
-    # lfm2
-    MODEL_TENSOR.A_ENC_NORM_CONV:        "a.blk.{bid}.norm_conv",
-    MODEL_TENSOR.A_ENC_LINEAR_POS:       "a.blk.{bid}.linear_pos",
-    MODEL_TENSOR.A_ENC_POS_BIAS_U:       "a.blk.{bid}.pos_bias_u",
-    MODEL_TENSOR.A_ENC_POS_BIAS_V:       "a.blk.{bid}.pos_bias_v",
-    MODEL_TENSOR.A_ENC_OUT:              "a.pre_encode.out",
 }
 
 MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@@ -1194,15 +1203,15 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.A_MMPROJ_FC,
         MODEL_TENSOR.A_MM_NORM_PRE,
         MODEL_TENSOR.A_MM_NORM_MID,
-        MODEL_TENSOR.CONVNEXT_DW,
-        MODEL_TENSOR.CONVNEXT_NORM,
-        MODEL_TENSOR.CONVNEXT_PW1,
-        MODEL_TENSOR.CONVNEXT_PW2,
         MODEL_TENSOR.A_ENC_NORM_CONV,
         MODEL_TENSOR.A_ENC_LINEAR_POS,
         MODEL_TENSOR.A_ENC_POS_BIAS_U,
         MODEL_TENSOR.A_ENC_POS_BIAS_V,
         MODEL_TENSOR.A_ENC_OUT,
+        MODEL_TENSOR.A_ENC_CONV_DW,
+        MODEL_TENSOR.A_ENC_CONV_NORM,
+        MODEL_TENSOR.A_ENC_CONV_PW1,
+        MODEL_TENSOR.A_ENC_CONV_PW2,
     ],
     MODEL_ARCH.LLAMA: [
         MODEL_TENSOR.TOKEN_EMBD,
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 7cee9b36a1..301aafa910 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -1131,26 +1131,18 @@ class TensorNameMap:
 
         MODEL_TENSOR.CONVNEXT_DW: (
             "backbone.convnext.{bid}.dwconv", # wavtokenizer
-            "conformer.layers.{bid}.conv.depthwise_conv", # lfm2
         ),
 
         MODEL_TENSOR.CONVNEXT_NORM: (
             "backbone.convnext.{bid}.norm", # wavtokenizer
-            "conformer.layers.{bid}.conv.batch_norm", # lfm2
         ),
 
         MODEL_TENSOR.CONVNEXT_PW1: (
             "backbone.convnext.{bid}.pwconv1", # wavtokenizer
-            "conformer.layers.{bid}.conv.pointwise_conv1", # lfm2
         ),
 
         MODEL_TENSOR.CONVNEXT_PW2: (
             "backbone.convnext.{bid}.pwconv2", # wavtokenizer
-            "conformer.layers.{bid}.conv.pointwise_conv2", # lfm2
-        ),
-
-        MODEL_TENSOR.A_ENC_NORM_CONV: (
-            "conformer.layers.{bid}.norm_conv", # lfm2
         ),
 
         MODEL_TENSOR.CONVNEXT_GAMMA: (
@@ -1661,6 +1653,26 @@ class TensorNameMap:
             "audio.multi_modal_projector.ln_mid", # ultravox
         ),
 
+        MODEL_TENSOR.A_ENC_CONV_DW: (
+            "conformer.layers.{bid}.conv.depthwise_conv", # lfm2
+        ),
+
+        MODEL_TENSOR.A_ENC_CONV_NORM: (
+            "conformer.layers.{bid}.conv.batch_norm", # lfm2
+        ),
+
+        MODEL_TENSOR.A_ENC_CONV_PW1: (
+            "conformer.layers.{bid}.conv.pointwise_conv1", # lfm2
+        ),
+
+        MODEL_TENSOR.A_ENC_CONV_PW2: (
+            "conformer.layers.{bid}.conv.pointwise_conv2", # lfm2
+        ),
+
+        MODEL_TENSOR.A_ENC_NORM_CONV: (
+            "conformer.layers.{bid}.norm_conv", # lfm2
+        ),
+
         # NextN/MTP tensors for GLM4_MOE
         MODEL_TENSOR.NEXTN_EH_PROJ: (
             "model.layers.{bid}.eh_proj",
diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h
index dbcca5cad8..a0939865e3 100644
--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@@ -148,6 +148,10 @@
 #define TN_POS_BIAS_V   "%s.blk.%d.pos_bias_v"
 #define TN_NORM_CONV    "%s.blk.%d.norm_conv.%s"
 #define TN_LINEAR_POS   "%s.blk.%d.linear_pos.%s"
+#define TN_CONV_DW      "%s.blk.%d.conv_dw.%s"
+#define TN_CONV_NORM    "%s.blk.%d.conv_norm.%s"
+#define TN_CONV_PW1     "%s.blk.%d.conv_pw1.%s"
+#define TN_CONV_PW2     "%s.blk.%d.conv_pw2.%s"
 
 // align x to upper multiple of n
 #define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index f0de25964b..3ba0823def 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -1660,14 +1660,14 @@ struct clip_model_loader {
 
                         layer.linear_pos_w = get_tensor(string_format(TN_LINEAR_POS, prefix, il, "weight"));
 
-                        layer.conv_norm_w = get_tensor(string_format("convnext.%d.norm.%s", il, "weight"));
-                        layer.conv_norm_b = get_tensor(string_format("convnext.%d.norm.%s", il, "bias"));
-                        layer.conv_dw_w   = get_tensor(string_format("convnext.%d.dw.%s",   il, "weight"));
-                        layer.conv_dw_b   = get_tensor(string_format("convnext.%d.dw.%s",   il, "bias"));
-                        layer.conv_pw1_w  = get_tensor(string_format("convnext.%d.pw1.%s",  il, "weight"));
-                        layer.conv_pw1_b  = get_tensor(string_format("convnext.%d.pw1.%s",  il, "bias"));
-                        layer.conv_pw2_w  = get_tensor(string_format("convnext.%d.pw2.%s",  il, "weight"));
-                        layer.conv_pw2_b  = get_tensor(string_format("convnext.%d.pw2.%s",  il, "bias"));
+                        layer.conv_norm_w = get_tensor(string_format(TN_CONV_NORM, prefix, il, "weight"));
+                        layer.conv_norm_b = get_tensor(string_format(TN_CONV_NORM, prefix, il, "bias"));
+                        layer.conv_dw_w   = get_tensor(string_format(TN_CONV_DW,   prefix, il, "weight"));
+                        layer.conv_dw_b   = get_tensor(string_format(TN_CONV_DW,   prefix, il, "bias"));
+                        layer.conv_pw1_w  = get_tensor(string_format(TN_CONV_PW1,  prefix, il, "weight"));
+                        layer.conv_pw1_b  = get_tensor(string_format(TN_CONV_PW1,  prefix, il, "bias"));
+                        layer.conv_pw2_w  = get_tensor(string_format(TN_CONV_PW2,  prefix, il, "weight"));
+                        layer.conv_pw2_b  = get_tensor(string_format(TN_CONV_PW2,  prefix, il, "bias"));
                     }
                 } break;
             default:
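
Reviewer note (not part of the patch): a minimal sketch of how to sanity-check the renamed mapping from the gguf-py side, assuming the gguf-py package from this branch is importable. get_tensor_name_map, MODEL_ARCH.MMPROJ and TensorNameMap.get_name are the existing gguf-py APIs; n_blocks=16 is an arbitrary illustrative layer count.

    # check that the lfm2 conformer conv tensors resolve to the "a."-prefixed names
    from gguf.constants import MODEL_ARCH
    from gguf.tensor_mapping import get_tensor_name_map

    tmap = get_tensor_name_map(MODEL_ARCH.MMPROJ, 16)

    for hf_name in (
        "conformer.layers.0.conv.depthwise_conv.weight",
        "conformer.layers.0.conv.batch_norm.weight",
        "conformer.layers.0.conv.pointwise_conv1.weight",
        "conformer.layers.0.conv.pointwise_conv2.bias",
    ):
        # try_suffixes strips ".weight"/".bias" before the lookup and re-appends it
        print(hf_name, "->", tmap.get_name(hf_name, try_suffixes=(".weight", ".bias")))

    # expected, per the TENSOR_NAMES entries added above:
    #   conformer.layers.0.conv.depthwise_conv.weight  -> a.blk.0.conv_dw.weight
    #   conformer.layers.0.conv.batch_norm.weight      -> a.blk.0.conv_norm.weight
    #   conformer.layers.0.conv.pointwise_conv1.weight -> a.blk.0.conv_pw1.weight
    #   conformer.layers.0.conv.pointwise_conv2.bias   -> a.blk.0.conv_pw2.bias

The expected names match what the clip.cpp loader now requests via TN_CONV_* with prefix "a", which is the point of the rename.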