From 145b6280d6aa1890e22457143dad0badb3fd6820 Mon Sep 17 00:00:00 2001
From: Tarek Dakhran
Date: Mon, 15 Dec 2025 14:32:13 +0100
Subject: [PATCH 1/8] ASR with LFM2-Audio-1.5B

---
 common/arg.cpp                       |   2 +-
 convert_hf_to_gguf.py                |  81 +++++++-
 ggml/src/ggml-cuda/ssm-conv.cu       |  34 ++--
 gguf-py/gguf/constants.py            |  43 +++++
 gguf-py/gguf/tensor_mapping.py       |  59 ++++++
 tests/test-backend-ops.cpp           |   8 +-
 tools/mtmd/CMakeLists.txt            |   1 +
 tools/mtmd/clip-impl.h               |  13 ++
 tools/mtmd/clip-model.h              |  31 ++++
 tools/mtmd/clip.cpp                  |  86 +++++++++
 tools/mtmd/models/lfm2-audio-enc.cpp | 267 +++++++++++++++++++++++++++
 tools/mtmd/models/models.h           |   5 +
 tools/mtmd/mtmd-audio.cpp            |  52 ++++++
 tools/mtmd/mtmd-audio.h              |   6 +
 tools/mtmd/mtmd-cli.cpp              |  15 ++
 tools/mtmd/mtmd.cpp                  |   3 +
 16 files changed, 677 insertions(+), 29 deletions(-)
 create mode 100644 tools/mtmd/models/lfm2-audio-enc.cpp

diff --git a/common/arg.cpp b/common/arg.cpp
index acf4c8f8a8..90416117d7 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1172,7 +1172,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.system_prompt = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION, LLAMA_EXAMPLE_MTMD}));
     add_opt(common_arg(
         {"--perf"},
         {"--no-perf"},
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index e1c78e3b18..a21ffc5da3 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -711,6 +711,9 @@ class ModelBase:
         if "thinker_config" in config:
             # rename for Qwen2.5-Omni
             config["text_config"] = config["thinker_config"]["text_config"]
+        if "lfm" in config:
+            # rename for LFM2-Audio
+            config["text_config"] = config["lfm"]
         return config

     @classmethod
@@ -9567,12 +9570,12 @@ class LFM2Model(TextModel):
         self._add_feed_forward_length()

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        is_vision_tensor = "vision_tower" in name or "multi_modal_projector" in name
-        if is_vision_tensor:
-            # skip vision tensors
+        if self._is_vision_tensor(name) or self._is_audio_tensor(name):
+            # skip multimodal tensors
             return []

-        name = name.replace("language_model.", "")
+        name = name.replace("language_model.", "")  # vision
+        name = name.replace("lfm.", "model.")       # audio

         # conv op requires 2d tensor
         if 'conv.conv' in name:
@@ -9580,6 +9583,12 @@ class LFM2Model(TextModel):

         return [(self.map_tensor_name(name), data_torch)]

+    def _is_vision_tensor(self, name: str) -> bool:
+        return "vision_tower" in name or "multi_modal_projector" in name
+
+    def _is_audio_tensor(self, name: str) -> bool:
+        return any(p in name for p in ["audio", "codebook", "conformer", "depth_embedding", "depthformer", "depth_linear"])
+

 @ModelBase.register("Lfm2MoeForCausalLM")
 class LFM2MoeModel(TextModel):
@@ -9685,6 +9694,70 @@ class LFM2VLModel(MmprojModel):
         return []  # skip other tensors


+@ModelBase.register("Lfm2AudioForConditionalGeneration")
+class LFM2AudioModel(MmprojModel):
+    has_vision_encoder = False
+    has_audio_encoder = True
+    model_name = "Lfm2AudioEncoder"
+
+    _batch_norm_tensors: list[dict[str, Tensor]] | None = None
+
+    def get_audio_config(self) -> dict[str, Any] | None:
+        return self.global_config.get("encoder")
+
+    def set_gguf_parameters(self):
+        self.hparams_audio["hidden_size"] = self.hparams_audio["d_model"]
+        self.hparams_audio["intermediate_size"] = self.hparams_audio["d_model"]
+        self.hparams_audio["num_attention_heads"] = self.hparams_audio["n_heads"]
+        super().set_gguf_parameters()
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LFM2A)
+        self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"])
+        self.gguf_writer.add_audio_attention_layernorm_eps(1e-5)
+
+    def tensor_force_quant(self, name, new_name, bid, n_dims):
+        if ".conv" in name and ".weight" in name:
+            return gguf.GGMLQuantizationType.F32
+        return super().tensor_force_quant(name, new_name, bid, n_dims)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # skip language model tensors
+        if name.startswith("lfm."):
+            return []
+
+        # for training only
+        if any(p in name for p in ["audio_loss_weight"]):
+            return []
+
+        # for audio output
+        if any(p in name for p in ["codebook_offsets", "depth_embeddings", "depth_linear", "depthformer"]):
+            return []
+
+        # fold running_mean, running_var and eps into weight and bias for batch_norm
+        if "batch_norm" in name:
+            if self._batch_norm_tensors is None:
+                self._batch_norm_tensors = [{} for _ in range(self.block_count)]
+            assert bid is not None
+            self._batch_norm_tensors[bid][name] = data_torch
+
+            if len(self._batch_norm_tensors[bid]) < 5:
+                return []
+
+            weight = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.weight"]
+            bias = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.bias"]
+            running_mean = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_mean"]
+            running_var = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_var"]
+            eps = 1e-5  # default value
+
+            a = weight / torch.sqrt(running_var + eps)
+            b = bias - running_mean * a
+            return [
+                (self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.weight"), a),
+                (self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.bias"), b),
+            ]
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
 @ModelBase.register("SmallThinkerForCausalLM")
 class SmallThinkerModel(TextModel):
     model_arch = gguf.MODEL_ARCH.SMALLTHINKER
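The batch-norm handling above folds BatchNorm1d's running statistics into a single scale and shift at conversion time, so inference never needs the raw mean/variance. A minimal standalone sketch of the identity it relies on (PyTorch; the function name and test values are illustrative, not part of the patch):

    import torch

    # bn(x) = weight * (x - mean) / sqrt(var + eps) + bias  ==  a * x + b
    def fold_batch_norm(weight, bias, running_mean, running_var, eps=1e-5):
        a = weight / torch.sqrt(running_var + eps)
        b = bias - running_mean * a
        return a, b

    # quick check against PyTorch's BatchNorm1d in eval mode
    bn = torch.nn.BatchNorm1d(8).eval()
    bn.weight.data.uniform_(0.5, 1.5)
    bn.bias.data.uniform_(-0.5, 0.5)
    bn.running_mean.uniform_(-1, 1)
    bn.running_var.uniform_(0.5, 2.0)
    x = torch.randn(4, 8)
    a, b = fold_batch_norm(bn.weight, bn.bias, bn.running_mean, bn.running_var, bn.eps)
    assert torch.allclose(bn(x), a * x + b, atol=1e-6)
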
diff --git a/ggml/src/ggml-cuda/ssm-conv.cu b/ggml/src/ggml-cuda/ssm-conv.cu
index 4197973360..6d5ea704c6 100644
--- a/ggml/src/ggml-cuda/ssm-conv.cu
+++ b/ggml/src/ggml-cuda/ssm-conv.cu
@@ -102,31 +102,25 @@ static void ssm_conv_f32_cuda(const float * src0, const float * src1, const int
     const int threads = 128;
     GGML_ASSERT(nr % threads == 0);

-    if (n_t <= 32) {
-        const dim3 blocks(n_s, (nr + threads - 1) / threads, 1);
-        if (nc == 4) {
-            ssm_conv_f32<threads, 4><<<blocks, threads, 0, stream>>>(src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1,
-                                                                     dst, dst_nb0, dst_nb1, dst_nb2, n_t);
-        } else if (nc == 3) {
-            ssm_conv_f32<threads, 3><<<blocks, threads, 0, stream>>>(src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1,
+    auto launch_kernel = [&](auto NC) {
+        constexpr int kNC = decltype(NC)::value;
+        if (n_t <= 32) {
+            const dim3 blocks(n_s, (nr + threads - 1) / threads, 1);
+            ssm_conv_f32<threads, kNC><<<blocks, threads, 0, stream>>>(src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1,
+                                                                       dst, dst_nb0, dst_nb1, dst_nb2, n_t);
         } else {
-            GGML_ABORT("Only support kernel size = 3 or size = 4 right now.");
-        }
-    } else {
-        if (nc == 4) {
             const int64_t split_n_t = 32;
             dim3 blocks(n_s, (nr + threads - 1) / threads, (n_t + split_n_t - 1) / split_n_t);
-            ssm_conv_long_token_f32<threads, 4, split_n_t><<<blocks, threads, 0, stream>>>(
+            ssm_conv_long_token_f32<threads, kNC, split_n_t><<<blocks, threads, 0, stream>>>(
                 src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1, dst, dst_nb0, dst_nb1, dst_nb2, n_t);
-        } else if (nc == 3) {
-            const int64_t split_n_t = 32;
-            dim3 blocks(n_s, (nr + threads - 1) / threads, (n_t + split_n_t - 1) / split_n_t);
-            ssm_conv_long_token_f32<threads, 3, split_n_t><<<blocks, threads, 0, stream>>>(
-                src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1, dst, dst_nb0, dst_nb1, dst_nb2, n_t);
-        } else {
-            GGML_ABORT("Only support kernel size = 3 or size = 4 right now.");
         }
+    };
+
+    switch (nc) {
+        case 3: launch_kernel(std::integral_constant<int, 3>{}); break;
+        case 4: launch_kernel(std::integral_constant<int, 4>{}); break;
+        case 9: launch_kernel(std::integral_constant<int, 9>{}); break;
+        default: GGML_ABORT("Only kernel sizes 3, 4 and 9 are supported right now.");
     }
 }
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 8ef4a23a10..a26f618b7d 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -687,6 +687,8 @@ class MODEL_TENSOR(IntEnum):
     V_TOK_EOI            = auto() # cogvlm
     # audio (mtmd)
     A_ENC_EMBD_POS       = auto()
+    A_ENC_EMBD_NORM      = auto()
+    A_ENC_EMBD_TO_LOGITS = auto()
     A_ENC_CONV1D         = auto()
     A_PRE_NORM           = auto()
     A_POST_NORM          = auto()
@@ -697,8 +699,13 @@ class MODEL_TENSOR(IntEnum):
     A_ENC_OUTPUT         = auto()
     A_ENC_OUTPUT_NORM    = auto()
     A_ENC_FFN_UP         = auto()
+    A_ENC_FFN_NORM       = auto()
     A_ENC_FFN_GATE       = auto()
     A_ENC_FFN_DOWN       = auto()
+    A_ENC_FFN_UP_1       = auto()
+    A_ENC_FFN_NORM_1     = auto()
+    A_ENC_FFN_GATE_1     = auto()
+    A_ENC_FFN_DOWN_1     = auto()
     A_MMPROJ             = auto()
     A_MMPROJ_FC          = auto()
     A_MM_NORM_PRE        = auto()
@@ -710,6 +717,12 @@ class MODEL_TENSOR(IntEnum):
     NEXTN_HNORM            = auto()
     NEXTN_SHARED_HEAD_HEAD = auto()
    NEXTN_SHARED_HEAD_NORM = auto()
+    # lfm2 audio
+    A_ENC_NORM_CONV        = auto()
+    A_ENC_LINEAR_POS       = auto()
+    A_ENC_POS_BIAS_U       = auto()
+    A_ENC_POS_BIAS_V       = auto()
+    A_ENC_OUT              = auto()


 MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@@ -1059,6 +1072,8 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.V_TOK_EOI:             "v.eoi",
     # audio (mtmd)
     MODEL_TENSOR.A_ENC_EMBD_POS:        "a.position_embd",
+    MODEL_TENSOR.A_ENC_EMBD_NORM:       "a.position_embd_norm",
+    MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS:  "a.embd_to_logits",
     MODEL_TENSOR.A_ENC_CONV1D:          "a.conv1d.{bid}",
     MODEL_TENSOR.A_PRE_NORM:            "a.pre_ln",
     MODEL_TENSOR.A_POST_NORM:           "a.post_ln",
@@ -1068,9 +1083,14 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.A_ENC_INPUT_NORM:      "a.blk.{bid}.ln1",
     MODEL_TENSOR.A_ENC_OUTPUT:          "a.blk.{bid}.attn_out",
     MODEL_TENSOR.A_ENC_OUTPUT_NORM:     "a.blk.{bid}.ln2",
+    MODEL_TENSOR.A_ENC_FFN_NORM:        "a.blk.{bid}.ffn_norm",
     MODEL_TENSOR.A_ENC_FFN_UP:          "a.blk.{bid}.ffn_up",
     MODEL_TENSOR.A_ENC_FFN_GATE:        "a.blk.{bid}.ffn_gate",
     MODEL_TENSOR.A_ENC_FFN_DOWN:        "a.blk.{bid}.ffn_down",
+    MODEL_TENSOR.A_ENC_FFN_NORM_1:      "a.blk.{bid}.ffn_norm_1",
+    MODEL_TENSOR.A_ENC_FFN_UP_1:        "a.blk.{bid}.ffn_up_1",
+    MODEL_TENSOR.A_ENC_FFN_GATE_1:      "a.blk.{bid}.ffn_gate_1",
+    MODEL_TENSOR.A_ENC_FFN_DOWN_1:      "a.blk.{bid}.ffn_down_1",
     MODEL_TENSOR.A_MMPROJ:              "mm.a.mlp.{bid}",
     MODEL_TENSOR.A_MMPROJ_FC:           "mm.a.fc",
     MODEL_TENSOR.A_MM_NORM_PRE:         "mm.a.norm_pre",
@@ -1082,6 +1102,12 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.NEXTN_HNORM:            "blk.{bid}.nextn.hnorm",
     MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD: "blk.{bid}.nextn.shared_head_head",
     MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM: "blk.{bid}.nextn.shared_head_norm",
+    # lfm2
+    MODEL_TENSOR.A_ENC_NORM_CONV:        "a.blk.{bid}.norm_conv",
+    MODEL_TENSOR.A_ENC_LINEAR_POS:       "a.blk.{bid}.linear_pos",
+    MODEL_TENSOR.A_ENC_POS_BIAS_U:       "a.blk.{bid}.pos_bias_u",
+    MODEL_TENSOR.A_ENC_POS_BIAS_V:       "a.blk.{bid}.pos_bias_v",
+    MODEL_TENSOR.A_ENC_OUT:              "a.pre_encode.out",
 }

 MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@@ -1137,6 +1163,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.V_TOK_EOI,
         # audio
         MODEL_TENSOR.A_ENC_EMBD_POS,
+        MODEL_TENSOR.A_ENC_EMBD_NORM,
+        MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS,
         MODEL_TENSOR.A_ENC_CONV1D,
         MODEL_TENSOR.A_PRE_NORM,
         MODEL_TENSOR.A_POST_NORM,
@@ -1146,13 +1174,27 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.A_ENC_INPUT_NORM,
         MODEL_TENSOR.A_ENC_OUTPUT,
         MODEL_TENSOR.A_ENC_OUTPUT_NORM,
+        MODEL_TENSOR.A_ENC_FFN_NORM,
         MODEL_TENSOR.A_ENC_FFN_UP,
         MODEL_TENSOR.A_ENC_FFN_GATE,
         MODEL_TENSOR.A_ENC_FFN_DOWN,
+        MODEL_TENSOR.A_ENC_FFN_NORM_1,
+        MODEL_TENSOR.A_ENC_FFN_UP_1,
+        MODEL_TENSOR.A_ENC_FFN_GATE_1,
+        MODEL_TENSOR.A_ENC_FFN_DOWN_1,
         MODEL_TENSOR.A_MMPROJ,
         MODEL_TENSOR.A_MMPROJ_FC,
         MODEL_TENSOR.A_MM_NORM_PRE,
         MODEL_TENSOR.A_MM_NORM_MID,
+        MODEL_TENSOR.CONVNEXT_DW,
+        MODEL_TENSOR.CONVNEXT_NORM,
+        MODEL_TENSOR.CONVNEXT_PW1,
+        MODEL_TENSOR.CONVNEXT_PW2,
+        MODEL_TENSOR.A_ENC_NORM_CONV,
+        MODEL_TENSOR.A_ENC_LINEAR_POS,
+        MODEL_TENSOR.A_ENC_POS_BIAS_U,
+        MODEL_TENSOR.A_ENC_POS_BIAS_V,
+        MODEL_TENSOR.A_ENC_OUT,
     ],
     MODEL_ARCH.LLAMA: [
         MODEL_TENSOR.TOKEN_EMBD,
@@ -3328,6 +3370,7 @@ class VisionProjectorType:
     LIGHTONOCR = "lightonocr"
     COGVLM = "cogvlm"
     JANUS_PRO = "janus_pro"
+    LFM2A = "lfm2a" # audio


 # Items here are (block size, type size)
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index b320e2b4b2..1ae7f97260 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -1126,18 +1126,26 @@ class TensorNameMap:

         MODEL_TENSOR.CONVNEXT_DW: (
             "backbone.convnext.{bid}.dwconv", # wavtokenizer
+            "conformer.layers.{bid}.conv.depthwise_conv", # lfm2
         ),

         MODEL_TENSOR.CONVNEXT_NORM: (
             "backbone.convnext.{bid}.norm", # wavtokenizer
+            "conformer.layers.{bid}.conv.batch_norm", #lfm2
         ),

         MODEL_TENSOR.CONVNEXT_PW1: (
             "backbone.convnext.{bid}.pwconv1", # wavtokenizer
+            "conformer.layers.{bid}.conv.pointwise_conv1", # lfm2
         ),

         MODEL_TENSOR.CONVNEXT_PW2: (
             "backbone.convnext.{bid}.pwconv2", # wavtokenizer
+            "conformer.layers.{bid}.conv.pointwise_conv2", # lfm2
+        ),
+
+        MODEL_TENSOR.A_ENC_NORM_CONV: (
+            "conformer.layers.{bid}.norm_conv", # lfm2
         ),

         MODEL_TENSOR.CONVNEXT_GAMMA: (
@@ -1515,10 +1523,20 @@ class TensorNameMap:

         MODEL_TENSOR.A_ENC_EMBD_POS: (
             "audio_tower.embed_positions", # ultravox
+            "audio_embedding.embedding", # lfm2
+        ),
+
+        MODEL_TENSOR.A_ENC_EMBD_NORM: (
+            "audio_embedding.embedding_norm", # lfm2
+        ),
+
+        MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS: (
+            "audio_embedding.to_logits", # lfm2
         ),

         MODEL_TENSOR.A_ENC_CONV1D: (
             "audio_tower.conv{bid}", # ultravox
+            "conformer.pre_encode.conv.{bid}", # lfm2
         ),

         MODEL_TENSOR.A_PRE_NORM: (),
@@ -1530,36 +1548,76 @@ class TensorNameMap:

         MODEL_TENSOR.A_ENC_ATTN_Q: (
             "audio_tower.layers.{bid}.self_attn.q_proj", # ultravox
+            "conformer.layers.{bid}.self_attn.linear_q", # lfm2
         ),

         MODEL_TENSOR.A_ENC_ATTN_K: (
             "audio_tower.layers.{bid}.self_attn.k_proj", # ultravox
+            "conformer.layers.{bid}.self_attn.linear_k", # lfm2
         ),

         MODEL_TENSOR.A_ENC_ATTN_V: (
             "audio_tower.layers.{bid}.self_attn.v_proj", # ultravox
+            "conformer.layers.{bid}.self_attn.linear_v", # lfm2
         ),

         MODEL_TENSOR.A_ENC_INPUT_NORM: (
             "audio_tower.layers.{bid}.self_attn_layer_norm", # ultravox
+            "conformer.layers.{bid}.norm_self_att", # lfm2
         ),

         MODEL_TENSOR.A_ENC_OUTPUT: (
             "audio_tower.layers.{bid}.self_attn.out_proj", # ultravox
+            "conformer.layers.{bid}.self_attn.linear_out", # lfm2
         ),

         MODEL_TENSOR.A_ENC_OUTPUT_NORM: (
             "audio_tower.layers.{bid}.final_layer_norm", # ultravox
+            "conformer.layers.{bid}.norm_out", # lfm2
+        ),
+
+        MODEL_TENSOR.A_ENC_FFN_NORM: (
+            "conformer.layers.{bid}.norm_feed_forward1", # lfm2
         ),
"audio_tower.layers.{bid}.fc1", # ultravox + "conformer.layers.{bid}.feed_forward1.linear1", # lfm2 ), MODEL_TENSOR.A_ENC_FFN_GATE: (), MODEL_TENSOR.A_ENC_FFN_DOWN: ( "audio_tower.layers.{bid}.fc2", # ultravox + "conformer.layers.{bid}.feed_forward1.linear2", # lfm2 + ), + + MODEL_TENSOR.A_ENC_FFN_UP_1: ( + "conformer.layers.{bid}.feed_forward2.linear1", # lfm2 + ), + + MODEL_TENSOR.A_ENC_FFN_DOWN_1: ( + "conformer.layers.{bid}.feed_forward2.linear2", # lfm2 + ), + + MODEL_TENSOR.A_ENC_FFN_NORM_1: ( + "conformer.layers.{bid}.norm_feed_forward2", # lfm2 + ), + + MODEL_TENSOR.A_ENC_LINEAR_POS: ( + "conformer.layers.{bid}.self_attn.linear_pos", # lfm2 + ), + + MODEL_TENSOR.A_ENC_POS_BIAS_U: ( + "conformer.layers.{bid}.self_attn.pos_bias_u", # lfm2 + ), + + MODEL_TENSOR.A_ENC_POS_BIAS_V: ( + "conformer.layers.{bid}.self_attn.pos_bias_v", # lfm2 + ), + + MODEL_TENSOR.A_ENC_OUT: ( + "conformer.pre_encode.out", # lfm2 ), # note: some tensors below has "audio." pseudo-prefix, to prevent conflicts with vision tensors @@ -1567,6 +1625,7 @@ class TensorNameMap: MODEL_TENSOR.A_MMPROJ: ( "audio.multi_modal_projector.linear_{bid}", # ultravox + "audio_adapter.model.{bid}" # lfm2 ), MODEL_TENSOR.A_MMPROJ_FC: ( diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 416218b5b8..53c172fd00 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -7295,11 +7295,11 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_l2_norm(GGML_TYPE_F32, {64, 5, 4, 3}, 1e-12f)); - for (int64_t d_conv : {3, 4}) { + for (int64_t d_conv : {3, 4, 9}) { for (int64_t d_inner: {1024, 1536, 2048}) { - test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {4, d_inner, 1, 1}, {d_conv, d_inner, 1, 1})); - test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {8, d_inner, 1, 1}, {d_conv, d_inner, 1, 1})); - test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {4, d_inner, 4, 1}, {d_conv, d_inner, 1, 1})); + test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {d_conv, d_inner, 1, 1}, {d_conv, d_inner, 1, 1})); + test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {2 * d_conv, d_inner, 1, 1}, {d_conv, d_inner, 1, 1})); + test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {d_conv, d_inner, 4, 1}, {d_conv, d_inner, 1, 1})); } } diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt index 3ee42036fd..14a5bac07b 100644 --- a/tools/mtmd/CMakeLists.txt +++ b/tools/mtmd/CMakeLists.txt @@ -17,6 +17,7 @@ add_library(mtmd models/cogvlm.cpp models/internvl.cpp models/kimivl.cpp + models/lfm2-audio-enc.cpp models/llama4.cpp models/llava.cpp models/minicpmv.cpp diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index 9ef0d301e4..4411478459 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -133,6 +133,17 @@ #define TN_TOK_BOI "v.boi" #define TN_TOK_EOI "v.eoi" +// lfm2 +#define TN_PRE_ENCODE_OUT "a.pre_encode.out.%s" +#define TN_FFN_NORM "%s.blk.%d.ffn_norm.%s" +#define TN_FFN_NORM_1 "%s.blk.%d.ffn_norm_1.%s" +#define TN_FFN_UP_1 "%s.blk.%d.ffn_up_1.%s" +#define TN_FFN_DOWN_1 "%s.blk.%d.ffn_down_1.%s" +#define TN_POS_BIAS_U "%s.blk.%d.pos_bias_u" +#define TN_POS_BIAS_V "%s.blk.%d.pos_bias_v" +#define TN_NORM_CONV "%s.blk.%d.norm_conv.%s" +#define TN_LINEAR_POS "%s.blk.%d.linear_pos.%s" + // align x to upper multiple of n #define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n)) @@ -165,6 +176,7 @@ enum projector_type { PROJECTOR_TYPE_LIGHTONOCR, PROJECTOR_TYPE_COGVLM, PROJECTOR_TYPE_JANUS_PRO, + PROJECTOR_TYPE_LFM2A, 
     PROJECTOR_TYPE_UNKNOWN,
 };

@@ -192,6 +204,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"},
     { PROJECTOR_TYPE_COGVLM,    "cogvlm"},
     { PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
+    { PROJECTOR_TYPE_LFM2A,     "lfm2a"},
 };

 static projector_type clip_projector_type_from_string(const std::string & str) {
diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h
index 2f7dbb458e..94fbd5f47b 100644
--- a/tools/mtmd/clip-model.h
+++ b/tools/mtmd/clip-model.h
@@ -4,6 +4,7 @@
 #include "clip.h"
 #include "clip-impl.h"

+#include <array>
 #include <memory>
 #include <string>
 #include <vector>
@@ -142,6 +143,30 @@ struct clip_layer {
     ggml_tensor * deepstack_fc2_w = nullptr;
     ggml_tensor * deepstack_fc2_b = nullptr;

+    // lfm2
+    ggml_tensor * ff_norm_w   = nullptr;
+    ggml_tensor * ff_norm_b   = nullptr;
+    ggml_tensor * ff_norm_1_w = nullptr;
+    ggml_tensor * ff_norm_1_b = nullptr;
+    ggml_tensor * ff_up_1_w   = nullptr;
+    ggml_tensor * ff_up_1_b   = nullptr;
+    ggml_tensor * ff_down_1_w = nullptr;
+    ggml_tensor * ff_down_1_b = nullptr;
+    ggml_tensor * pos_bias_u  = nullptr;
+    ggml_tensor * pos_bias_v  = nullptr;
+    ggml_tensor * norm_conv_w = nullptr;
+    ggml_tensor * norm_conv_b = nullptr;
+    ggml_tensor * linear_pos_w = nullptr;
+
+    ggml_tensor * conv_norm_w = nullptr;
+    ggml_tensor * conv_norm_b = nullptr;
+    ggml_tensor * conv_dw_w   = nullptr;
+    ggml_tensor * conv_dw_b   = nullptr;
+    ggml_tensor * conv_pw1_w  = nullptr;
+    ggml_tensor * conv_pw1_b  = nullptr;
+    ggml_tensor * conv_pw2_w  = nullptr;
+    ggml_tensor * conv_pw2_b  = nullptr;
+
     bool has_deepstack() const {
         return deepstack_fc1_w != nullptr;
     }
@@ -275,6 +300,12 @@ struct clip_model {
     ggml_tensor * mm_boi = nullptr;
     ggml_tensor * mm_eoi = nullptr;

+    // lfm2
+    std::array<ggml_tensor *, 7> pre_encode_conv_X_w = {nullptr};
+    std::array<ggml_tensor *, 7> pre_encode_conv_X_b = {nullptr};
+    ggml_tensor * pre_encode_out_w = nullptr;
+    ggml_tensor * pre_encode_out_b = nullptr;
+
     bool audio_has_avgpool() const {
         return proj_type == PROJECTOR_TYPE_QWEN2A
             || proj_type == PROJECTOR_TYPE_VOXTRAL;
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index fee49e465c..bdc712bd45 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -842,6 +842,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             {
                 builder = std::make_unique<clip_graph_whisper_enc>(ctx, img);
             } break;
+        case PROJECTOR_TYPE_LFM2A:
+            {
+                builder = std::make_unique<clip_graph_lfm2_audio_enc>(ctx, img);
+            } break;
         default:
             GGML_ABORT("missing cgraph builder");
     }
@@ -1180,6 +1184,15 @@ struct clip_model_loader {
                     hparams.audio_window_len  = 400;
                     hparams.audio_hop_len     = 160;
                 } break;
+            case PROJECTOR_TYPE_LFM2A:
+                {
+                    // audio preprocessing params
+                    hparams.audio_chunk_len   = 1; // in seconds
+                    hparams.audio_sample_rate = 16000;
+                    hparams.audio_n_fft       = 512;
+                    hparams.audio_window_len  = 400;
+                    hparams.audio_hop_len     = 160;
+                } break;
             default:
                 break;
         }
@@ -1587,6 +1600,52 @@ struct clip_model_loader {
                         model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
                         model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
                     } break;
+                case PROJECTOR_TYPE_LFM2A:
+                    {
+                        for (int i : {0, 2, 3, 5, 6}) {
+                            model.pre_encode_conv_X_w[i] = get_tensor(string_format(TN_CONV1D, i, "weight"));
+                            model.pre_encode_conv_X_b[i] = get_tensor(string_format(TN_CONV1D, i, "bias"));
+                        }
+                        model.pre_encode_out_w = get_tensor(string_format(TN_PRE_ENCODE_OUT, "weight"));
+                        model.pre_encode_out_b = get_tensor(string_format(TN_PRE_ENCODE_OUT, "bias"));
+
+                        model.mm_0_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 0, "weight"));
+                        model.mm_0_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 0, "bias"));
"bias")); + model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight")); + model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias")); + model.mm_3_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 3, "weight")); + model.mm_3_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 3, "bias")); + + for (int il = 0; il < hparams.n_layer; ++il) { + auto & layer = model.layers[il]; + + layer.ff_norm_w = get_tensor(string_format(TN_FFN_NORM, prefix, il, "weight")); + layer.ff_norm_b = get_tensor(string_format(TN_FFN_NORM, prefix, il, "bias")); + layer.ff_norm_1_w = get_tensor(string_format(TN_FFN_NORM_1, prefix, il, "weight")); + layer.ff_norm_1_b = get_tensor(string_format(TN_FFN_NORM_1, prefix, il, "bias")); + layer.ff_up_1_w = get_tensor(string_format(TN_FFN_UP_1, prefix, il, "weight")); + layer.ff_up_1_b = get_tensor(string_format(TN_FFN_UP_1, prefix, il, "bias")); + layer.ff_down_1_w = get_tensor(string_format(TN_FFN_DOWN_1, prefix, il, "weight")); + layer.ff_down_1_b = get_tensor(string_format(TN_FFN_DOWN_1, prefix, il, "bias")); + + layer.pos_bias_u = get_tensor(string_format(TN_POS_BIAS_U, prefix, il)); + layer.pos_bias_v = get_tensor(string_format(TN_POS_BIAS_V, prefix, il)); + + layer.norm_conv_w = get_tensor(string_format(TN_NORM_CONV, prefix, il, "weight")); + layer.norm_conv_b = get_tensor(string_format(TN_NORM_CONV, prefix, il, "bias")); + + layer.linear_pos_w = get_tensor(string_format(TN_LINEAR_POS, prefix, il, "weight")); + + layer.conv_norm_w = get_tensor(string_format("convnext.%d.norm.%s", il, "weight")); + layer.conv_norm_b = get_tensor(string_format("convnext.%d.norm.%s", il, "bias")); + layer.conv_dw_w = get_tensor(string_format("convnext.%d.dw.%s", il, "weight")); + layer.conv_dw_b = get_tensor(string_format("convnext.%d.dw.%s", il, "bias")); + layer.conv_pw1_w = get_tensor(string_format("convnext.%d.pw1.%s", il, "weight")); + layer.conv_pw1_b = get_tensor(string_format("convnext.%d.pw1.%s", il, "bias")); + layer.conv_pw2_w = get_tensor(string_format("convnext.%d.pw2.%s", il, "weight")); + layer.conv_pw2_b = get_tensor(string_format("convnext.%d.pw2.%s", il, "bias")); + } + } break; default: GGML_ASSERT(false && "unknown projector type"); } @@ -2962,6 +3021,10 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im { n_patches += 2; // for BOI and EOI token embeddings } break; + case PROJECTOR_TYPE_LFM2A: + { + n_patches = ((((img->nx + 1) / 2) + 1) / 2 + 1) / 2; + } break; default: GGML_ABORT("unsupported projector type"); } @@ -3319,6 +3382,27 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } set_input_i32("pos_w", pos_data); } break; + case PROJECTOR_TYPE_LFM2A: + { + GGML_ASSERT(imgs.entries.size() == 1); + const auto n_frames = clip_n_output_tokens(ctx, imgs.entries.front().get()); + + auto d_model = 512; + auto seq_len = n_frames * 2 - 1; + std::vector pos_emb(d_model*seq_len); + std::vector inv_freq(d_model / 2); + for (size_t i = 0; i < inv_freq.size(); ++i) { + inv_freq[i] = std::exp(-(std::log(10000.0) / (float)d_model) * (2.0f * (float)(i))); + } + for (int64_t pos = 0; pos < seq_len; ++pos) { + for (size_t i = 0; i < inv_freq.size(); ++i) { + const float ang = (n_frames - pos - 1) * inv_freq[i]; + pos_emb[pos*d_model + 2*i + 0] = sinf(ang); // even + pos_emb[pos*d_model + 2*i + 1] = cosf(ang); // odd + } + } + set_input_f32("pos_emb", pos_emb); + } break; default: GGML_ABORT("Unknown projector type"); } @@ -3411,6 +3495,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { return 
diff --git a/tools/mtmd/models/lfm2-audio-enc.cpp b/tools/mtmd/models/lfm2-audio-enc.cpp
new file mode 100644
index 0000000000..c3869c52e2
--- /dev/null
+++ b/tools/mtmd/models/lfm2-audio-enc.cpp
@@ -0,0 +1,267 @@
+#include "models.h"
+
+ggml_cgraph * clip_graph_lfm2_audio_enc::build() {
+    const int n_frames = img.nx;
+    const int n_pos = n_frames / 2;
+    const int n_pos_embd = (((((n_frames + 1) / 2) + 1) / 2 + 1) / 2) * 2 - 1;
+    GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos);
+
+    ggml_tensor * pos_emb = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 512, n_pos_embd);
+    ggml_set_name(pos_emb, "pos_emb");
+    ggml_set_input(pos_emb);
+    ggml_build_forward_expand(gf, pos_emb);
+
+    ggml_tensor * inp = build_inp_raw(1);
+    cb(inp, "input", -1);
+
+    auto * cur = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
+
+    // pre encode, conv subsampling
+    {
+        // layer.0 - conv2d
+        cur = ggml_conv_2d(ctx0, model.pre_encode_conv_X_w[0], cur, 2, 2, 1, 1, 1, 1);
+        cur = ggml_add(ctx0, cur, ggml_reshape_4d(ctx0, model.pre_encode_conv_X_b[0], 1, 1, cur->ne[2], 1));
+        cb(cur, "conformer.pre_encode.conv.{}", 0);
+
+        // layer.1 - relu
+        cur = ggml_relu_inplace(ctx0, cur);
+
+        // layer.2 conv2d dw
+        cur = ggml_conv_2d_dw_direct(ctx0, model.pre_encode_conv_X_w[2], cur, 2, 2, 1, 1, 1, 1);
+        cur = ggml_add(ctx0, cur, ggml_reshape_4d(ctx0, model.pre_encode_conv_X_b[2], 1, 1, cur->ne[2], 1));
+        cb(cur, "conformer.pre_encode.conv.{}", 2);
+
+        // layer.3 conv2d
+        cur = ggml_conv_2d_direct(ctx0, model.pre_encode_conv_X_w[3], cur, 1, 1, 0, 0, 1, 1);
+        cur = ggml_add(ctx0, cur, ggml_reshape_4d(ctx0, model.pre_encode_conv_X_b[3], 1, 1, cur->ne[2], 1));
+        cb(cur, "conformer.pre_encode.conv.{}", 3);
+
+        // layer.4 - relu
+        cur = ggml_relu_inplace(ctx0, cur);
+
+        // layer.5 conv2d dw
+        cur = ggml_conv_2d_dw_direct(ctx0, model.pre_encode_conv_X_w[5], cur, 2, 2, 1, 1, 1, 1);
+        cur = ggml_add(ctx0, cur, ggml_reshape_4d(ctx0, model.pre_encode_conv_X_b[5], 1, 1, cur->ne[2], 1));
+        cb(cur, "conformer.pre_encode.conv.{}", 5);
+
+        // layer.6 conv2d
+        cur = ggml_conv_2d_direct(ctx0, model.pre_encode_conv_X_w[6], cur, 1, 1, 0, 0, 1, 1);
+        cur = ggml_add(ctx0, cur, ggml_reshape_4d(ctx0, model.pre_encode_conv_X_b[6], 1, 1, cur->ne[2], 1));
+        cb(cur, "conformer.pre_encode.conv.{}", 6);
+
+        // layer.7 - relu
+        cur = ggml_relu_inplace(ctx0, cur);
+
+        // flatten channel and frequency axis
+        cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 0, 2, 1, 3));
+        cur = ggml_reshape_2d(ctx0, cur, cur->ne[0] * cur->ne[1], cur->ne[2]);
+
+        // calculate out
+        cur = ggml_mul_mat(ctx0, model.pre_encode_out_w, cur);
+        cur = ggml_add(ctx0, cur, model.pre_encode_out_b);
+        cb(cur, "conformer.pre_encode.out", -1);
+    }
+
+    // pos_emb
+    cb(pos_emb, "pos_emb", -1);
+
+    for (int il = 0; il < hparams.n_layer; il++) {
+        const auto & layer = model.layers[il];
+
+        auto * residual = cur;
+
+        cb(cur, "layer.in", il);
+
+        // feed_forward1
+        cur = build_norm(cur, layer.ff_norm_w, layer.ff_norm_b, NORM_TYPE_NORMAL, 1e-5, il);
+        cb(cur, "conformer.layers.{}.norm_feed_forward1", il);
+
+        cur = build_ffn(cur,
+                layer.ff_up_w, layer.ff_up_b,
+                nullptr, nullptr,
+                layer.ff_down_w, layer.ff_down_b,
+                FFN_SILU, il);
+        cb(cur, "conformer.layers.{}.feed_forward1.linear2", il);
+
+        const auto fc_factor = 0.5f;
+        residual = ggml_add(ctx0, residual, ggml_scale(ctx0, cur, fc_factor));
+
+        // self-attention
+        {
+            cur = build_norm(residual, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, 1e-5, il);
+            cb(cur, "conformer.layers.{}.norm_self_att", il);
+
+            cb(cur, "conformer.layers.{}.self_attn.id", il);
+            ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
+            Qcur = ggml_add(ctx0, Qcur, layer.q_b);
+            cb(Qcur, "conformer.layers.{}.self_attn.linear_q", il);
+
+            ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
+            Kcur = ggml_add(ctx0, Kcur, layer.k_b);
+            cb(Kcur, "conformer.layers.{}.self_attn.linear_k", il);
+
+            ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
+            Vcur = ggml_add(ctx0, Vcur, layer.v_b);
+            cb(Vcur, "conformer.layers.{}.self_attn.linear_v", il);
+
+            Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, Qcur->ne[1]);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, Kcur->ne[1]);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, Vcur->ne[1]);
+
+            ggml_tensor * Q_bias_u = ggml_add(ctx0, Qcur, layer.pos_bias_u);
+            ggml_tensor * Q_bias_v = ggml_add(ctx0, Qcur, layer.pos_bias_v);
+
+            Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
+            Q_bias_u = ggml_cont(ctx0, ggml_permute(ctx0, Q_bias_u, 0, 2, 1, 3));
+            ggml_tensor * matrix_ac = ggml_mul_mat(ctx0, Q_bias_u, Kcur);
+            matrix_ac = ggml_cont(ctx0, ggml_permute(ctx0, matrix_ac, 1, 0, 2, 3));
+            cb(matrix_ac, "conformer.layers.{}.self_attn.id3", il);
+
+            auto * p = ggml_mul_mat(ctx0, layer.linear_pos_w, pos_emb);
+            cb(p, "conformer.layers.{}.self_attn.linear_pos", il);
+            p = ggml_reshape_3d(ctx0, p, d_head, n_head, p->ne[1]);
+
+            Q_bias_v = ggml_cont(ctx0, ggml_permute(ctx0, Q_bias_v, 0, 2, 1, 3));
+            cb(Q_bias_v, "conformer.layers.{}.self_attn.id0", il);
+            p = ggml_cont(ctx0, ggml_permute(ctx0, p, 1, 2, 0, 3));
+            cb(p, "conformer.layers.{}.self_attn.id1", il);
+
+            p = ggml_cont(ctx0, ggml_permute(ctx0, p, 1, 0, 2, 3));
+            auto * matrix_bd = ggml_mul_mat(ctx0, Q_bias_v, p);
+            matrix_bd = ggml_cont(ctx0, ggml_permute(ctx0, matrix_bd, 1, 0, 2, 3));
+
+
+            // rel shift
+            {
+                const auto pos_len = matrix_bd->ne[0];
+                const auto q_len = matrix_bd->ne[1];
+                const auto h = matrix_bd->ne[2];
+                matrix_bd = ggml_pad(ctx0, matrix_bd, 1, 0, 0, 0);
+                matrix_bd = ggml_roll(ctx0, matrix_bd, 1, 0, 0, 0);
+                matrix_bd = ggml_reshape_3d(ctx0, matrix_bd, q_len, pos_len + 1, h);
+                matrix_bd = ggml_cont(ctx0, ggml_view_3d(ctx0, matrix_bd,
+                            q_len, pos_len, h,
+                            matrix_bd->nb[1], matrix_bd->nb[2], matrix_bd->nb[0] * q_len));
+                matrix_bd = ggml_reshape_3d(ctx0, matrix_bd, pos_len, q_len, h);
+            }
+
+            matrix_bd = ggml_cont(ctx0, ggml_view_3d(ctx0, matrix_bd,
+                        matrix_ac->ne[0], matrix_bd->ne[1], matrix_bd->ne[2],
+                        matrix_bd->nb[1], matrix_bd->nb[2], 0));
+            auto * scores = ggml_add(ctx0, matrix_ac, matrix_bd);
+            scores = ggml_scale(ctx0, scores, 1.0f / std::sqrt(d_head));
+            cb(scores, "conformer.layers.{}.self_attn.id0", il);
+
+
+            ggml_tensor * attn = ggml_soft_max(ctx0, scores);
+            // TODO(tarek): combine permutes
+            Vcur = ggml_cont(ctx0, ggml_permute(ctx0, Vcur, 0, 2, 1, 3));
+            Vcur = ggml_cont(ctx0, ggml_permute(ctx0, Vcur, 1, 0, 2, 3));
+            ggml_tensor * x = ggml_mul_mat(ctx0, attn, Vcur);
+            // TODO(tarek): combine permutes
+            x = ggml_cont(ctx0, ggml_permute(ctx0, x, 1, 0, 2, 3));
+            x = ggml_cont(ctx0, ggml_permute(ctx0, x, 0, 2, 1, 3));
+            x = ggml_reshape_2d(ctx0, x, x->ne[0] * x->ne[1], x->ne[2]);
+
+            x = ggml_mul_mat(ctx0, layer.o_w, x);
+            ggml_tensor * out = ggml_add(ctx0, x, layer.o_b);
+            cb(out, "conformer.layers.{}.self_attn.linear_out", il);
+
+            cur = out;
+        }
+
+        residual = ggml_add(ctx0, residual, cur);
+        cur = build_norm(residual, layer.norm_conv_w, layer.norm_conv_b, NORM_TYPE_NORMAL, 1e-5, il);
+        cb(cur, "conformer.layers.{}.norm_conv", il);
+
+        // conv
+        {
+            auto * x = cur;
+            auto * conv_pw1_w = ggml_reshape_2d(ctx0, layer.conv_pw1_w, layer.conv_pw1_w->ne[1], layer.conv_pw1_w->ne[2]);
+            x = ggml_mul_mat(ctx0, conv_pw1_w, x);
+            x = ggml_add(ctx0, x, layer.conv_pw1_b);
+            x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
+            cb(x, "conformer.layers.{}.conv.pointwise_conv1", il);
+
+            x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
+
+            // TODO: add support for torch.nn.functional.glu
+            {
+                int64_t d = x->ne[0] / 2;
+                ggml_tensor * gate = ggml_sigmoid(ctx0, ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], d * x->nb[0]));
+                x = ggml_mul(ctx0, ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], 0), gate);
+                x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
+            }
+
+            // use ggml_ssm_conv for f32 precision
+            x = ggml_pad(ctx0, x, 4, 0, 0, 0);
+            x = ggml_roll(ctx0, x, 4, 0, 0, 0);
+            x = ggml_pad(ctx0, x, 4, 0, 0, 0);
+            x = ggml_cont(ctx0, x);
+            auto * conv_dw_w = ggml_reshape_2d(ctx0, layer.conv_dw_w, layer.conv_dw_w->ne[0], layer.conv_dw_w->ne[2]);
+            x = ggml_ssm_conv(ctx0, x, conv_dw_w);
+            x = ggml_add(ctx0, x, ggml_reshape_1d(ctx0, layer.conv_dw_b, layer.conv_dw_b->ne[0]));
+            x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
+
+            cb(x, "conformer.layers.{}.conv.depthwise_conv", il);
+
+            {
+                x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
+                x = ggml_add(ctx0, ggml_mul(ctx0, x, layer.conv_norm_w), layer.conv_norm_b);
+                x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
+                cb(x, "conformer.layers.{}.conv.batch_norm", il);
+            }
+            x = ggml_silu(ctx0, x);
+
+            // pointwise_conv2
+            x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
+            auto * conv_pw2_w = ggml_reshape_2d(ctx0, layer.conv_pw2_w, layer.conv_pw2_w->ne[1], layer.conv_pw2_w->ne[2]);
+            x = ggml_mul_mat(ctx0, conv_pw2_w, x);
+            x = ggml_add(ctx0, x, layer.conv_pw2_b);
+            x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
+            cb(x, "conformer.layers.{}.conv.pointwise_conv2", il);
+
+            x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
+            cur = x;
+        }
+
+        residual = ggml_add(ctx0, residual, cur);
+
+        cur = build_norm(residual, layer.ff_norm_1_w, layer.ff_norm_1_b, NORM_TYPE_NORMAL, 1e-5, il);
+        cb(cur, "conformer.layers.{}.norm_feed_forward2", il);
+
+        cur = build_ffn(cur,
+                layer.ff_up_1_w, layer.ff_up_1_b,
+                nullptr, nullptr,
+                layer.ff_down_1_w, layer.ff_down_1_b,
+                FFN_SILU, il); // TODO(tarek): read activation for ffn from hparams
+        cb(cur, "conformer.layers.{}.feed_forward2.linear2", il);
+
+        residual = ggml_add(ctx0, residual, ggml_scale(ctx0, cur, fc_factor));
+        cb(residual, "conformer.layers.{}.conv.id", il);
+
+        cur = build_norm(residual, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, 1e-5, il);
+        cb(cur, "conformer.layers.{}.norm_out", il);
+    }
+
+    // audio adapter
+    {
+        cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
+        cb(cur, "audio_adapter.model.{}", 0);
+        cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
+        cur = ggml_add(ctx0, cur, model.mm_1_b);
+        cb(cur, "audio_adapter.model.{}", 1);
+        cur = ggml_gelu_erf(ctx0, cur);
+        cb(cur, "audio_adapter.model.{}", 2);
+        cur = ggml_mul_mat(ctx0, model.mm_3_w, cur);
+        cur = ggml_add(ctx0, cur, model.mm_3_b);
+        cb(cur, "audio_adapter.model.{}", 3);
+    }
+
+    cb(cur, "projected", -1);
+
+
+    ggml_build_forward_expand(gf, cur);
+
+    return gf;
+}
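The "rel shift" block in the new file converts scores indexed by relative offset into scores indexed by absolute key position, via the Transformer-XL pad/reshape trick that the ggml_pad/ggml_roll/ggml_view sequence mirrors. A NumPy sketch of the same index manipulation (names illustrative):

    import numpy as np

    # Query i, key j ends up reading relative-position column (q-1) - i + j
    # of the unshifted score matrix.
    def rel_shift(x: np.ndarray) -> np.ndarray:
        q, p = x.shape                      # p == 2 * q - 1 relative offsets
        x = np.pad(x, ((0, 0), (1, 0)))     # prepend one zero column
        x = x.reshape(p + 1, q)[1:]         # drop the first row after reshape
        return x.reshape(q, p)[:, :q]       # keep one column per key

    q = 3
    rel = np.arange(q * (2 * q - 1), dtype=np.float32).reshape(q, 2 * q - 1)
    out = rel_shift(rel)
    for i in range(q):
        for j in range(q):
            assert out[i, j] == rel[i, (q - 1) - i + j]
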
diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h
index 4b35da259c..46cf5ac8f8 100644
--- a/tools/mtmd/models/models.h
+++ b/tools/mtmd/models/models.h
@@ -56,3 +56,8 @@ struct clip_graph_whisper_enc : clip_graph {
     clip_graph_whisper_enc(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
     ggml_cgraph * build() override;
 };
+
+struct clip_graph_lfm2_audio_enc : clip_graph {
+    clip_graph_lfm2_audio_enc(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+    ggml_cgraph * build() override;
+};
diff --git a/tools/mtmd/mtmd-audio.cpp b/tools/mtmd/mtmd-audio.cpp
index f68829a61a..53c0d6ab9d 100644
--- a/tools/mtmd/mtmd-audio.cpp
+++ b/tools/mtmd/mtmd-audio.cpp
@@ -535,3 +535,55 @@ bool mtmd_audio_preprocessor_whisper::preprocess(

     return true;
 }
+
+//
+// mtmd_audio_preprocessor_lfm2
+//
+void mtmd_audio_preprocessor_lfm2::initialize() {
+    g_cache.fill_sin_cos_table(hparams.audio_n_fft);
+    g_cache.fill_hann_window(hparams.audio_window_len, true);
+    g_cache.fill_mel_filterbank_matrix(
+            hparams.n_mel_bins,
+            hparams.audio_n_fft,
+            hparams.audio_sample_rate);
+}
+
+bool mtmd_audio_preprocessor_lfm2::preprocess(
+        const float * samples,
+        size_t n_samples,
+        std::vector<mtmd_audio_mel> & output) {
+    // empty audio
+    if (n_samples == 0) {
+        return false;
+    }
+
+    filter_params params;
+    params.n_mel            = hparams.n_mel_bins;
+    params.n_fft_bins       = 1 + (hparams.audio_n_fft / 2);
+    params.hann_window_size = hparams.audio_window_len;
+    params.hop_length       = hparams.audio_hop_len;
+    params.sample_rate      = hparams.audio_sample_rate;
+    params.center_padding   = true;
+    params.preemph          = 0.97f; // pre-emphasis coefficient
+    params.use_natural_log  = true;
+    params.norm_per_feature = true;
+
+    // make sure the global cache is initialized
+    GGML_ASSERT(!g_cache.sin_vals.empty());
+    GGML_ASSERT(!g_cache.cos_vals.empty());
+    GGML_ASSERT(!g_cache.filters.data.empty());
+
+    mtmd_audio_mel out_full;
+    bool ok = log_mel_spectrogram(
+            samples,
+            n_samples,
+            4, // n_threads
+            params,
+            out_full);
+    if (!ok) {
+        return false;
+    }
+
+    output.push_back(std::move(out_full));
+    return true;
+}
diff --git a/tools/mtmd/mtmd-audio.h b/tools/mtmd/mtmd-audio.h
index 1b454337cb..ded0a30513 100644
--- a/tools/mtmd/mtmd-audio.h
+++ b/tools/mtmd/mtmd-audio.h
@@ -32,3 +32,9 @@ struct mtmd_audio_preprocessor_whisper : mtmd_audio_preprocessor {
     void initialize() override;
     bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
 };
+
+struct mtmd_audio_preprocessor_lfm2: mtmd_audio_preprocessor {
+    mtmd_audio_preprocessor_lfm2(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
+    void initialize() override;
+    bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
+};
diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp
index 332d2049e5..775d232c4a 100644
--- a/tools/mtmd/mtmd-cli.cpp
+++ b/tools/mtmd/mtmd-cli.cpp
@@ -311,9 +311,22 @@ int main(int argc, char ** argv) {

     if (g_is_interrupted) return 130;

+    auto eval_system_prompt_if_present = [&] {
+        if (params.system_prompt.empty()) {
+            return 0;
+        }
+
+        common_chat_msg msg;
+        msg.role    = "system";
+        msg.content = params.system_prompt;
+        return eval_message(ctx, msg);
+    };
+
     LOG_WRN("WARN: This is an experimental CLI for testing multimodal capability.\n");
     LOG_WRN("      For normal use cases, please use the standard llama-cli\n");

+    eval_system_prompt_if_present();
+
     if (is_single_turn) {
         g_is_generating = true;
         if (params.prompt.find(mtmd_default_marker()) == std::string::npos) {
@@ -323,6 +336,7 @@ int main(int argc, char ** argv) {
             params.prompt = mtmd_default_marker() + params.prompt;
         }
     }
+
     common_chat_msg msg;
     msg.role    = "user";
     msg.content = params.prompt;
@@ -372,6 +386,7 @@ int main(int argc, char ** argv) {
             ctx.chat_history.clear();
             llama_memory_clear(llama_get_memory(ctx.lctx), true);
             LOG("Chat history cleared\n\n");
+            eval_system_prompt_if_present();
             continue;
         }
         g_is_generating = true;
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index c63f299cd9..15d3b67917 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -327,6 +327,9 @@ struct mtmd_context {
             case PROJECTOR_TYPE_VOXTRAL:
                 audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
                 break;
+            case PROJECTOR_TYPE_LFM2A:
+                audio_preproc = std::make_unique<mtmd_audio_preprocessor_lfm2>(ctx_a);
+                break;
             default:
                 GGML_ABORT("unsupported audio projector type");
         }

From 4f5d5212b80583165cdbb651fec96ebaadec5339 Mon Sep 17 00:00:00 2001
From: Tarek Dakhran
Date: Mon, 15 Dec 2025 14:43:03 +0100
Subject: [PATCH 2/8] Set rope_theta

---
 convert_hf_to_gguf.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index a21ffc5da3..910d18dce8 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -9567,6 +9567,7 @@ class LFM2Model(TextModel):
         self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
         self.gguf_writer.add_shortconv_l_cache(self.hparams["conv_L_cache"])
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams["norm_eps"])
+        self.gguf_writer.add_rope_freq_base(self.hparams.get("rope_theta", 1000000))
         self._add_feed_forward_length()

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:

From 0e8779a54caad0fa1dbb652a31d92f9bf44005c7 Mon Sep 17 00:00:00 2001
From: Tarek Dakhran
Date: Mon, 15 Dec 2025 16:30:04 +0100
Subject: [PATCH 3/8] Fix comment

---
 gguf-py/gguf/tensor_mapping.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 1ae7f97260..03daa85799 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -1131,7 +1131,7 @@ class TensorNameMap:

         MODEL_TENSOR.CONVNEXT_NORM: (
             "backbone.convnext.{bid}.norm", # wavtokenizer
-            "conformer.layers.{bid}.conv.batch_norm", #lfm2
+            "conformer.layers.{bid}.conv.batch_norm", # lfm2
         ),

         MODEL_TENSOR.CONVNEXT_PW1: (

From f5b132a68c822270659efa1b72bdaea41c093800 Mon Sep 17 00:00:00 2001
From: Tarek Dakhran
Date: Mon, 15 Dec 2025 20:03:00 +0100
Subject: [PATCH 4/8] Remove rope_theta setting

---
 convert_hf_to_gguf.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 910d18dce8..a21ffc5da3 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -9567,7 +9567,6 @@ class LFM2Model(TextModel):
         self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
         self.gguf_writer.add_shortconv_l_cache(self.hparams["conv_L_cache"])
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams["norm_eps"])
-        self.gguf_writer.add_rope_freq_base(self.hparams.get("rope_theta", 1000000))
         self._add_feed_forward_length()

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:

From ba9e59739ccb88ce62dd96b7df07f35f88bab36c Mon Sep 17 00:00:00 2001
From: Tarek Dakhran
Date: Mon, 15 Dec 2025 21:24:32 +0100
Subject: [PATCH 5/8] Address PR feedback

---
 convert_hf_to_gguf.py                |   5 +
 tools/mtmd/models/lfm2-audio-enc.cpp | 172 ++++++++++-----------------
 2 files changed, 68 insertions(+), 109 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index a21ffc5da3..223d615a6d 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -9706,6 +9706,7 @@ class LFM2AudioModel(MmprojModel):
         return self.global_config.get("encoder")

     def set_gguf_parameters(self):
+        assert self.hparams_audio is not None
         self.hparams_audio["hidden_size"] = self.hparams_audio["d_model"]
         self.hparams_audio["intermediate_size"] = self.hparams_audio["d_model"]
         self.hparams_audio["num_attention_heads"] = self.hparams_audio["n_heads"]
@@ -9755,6 +9756,10 @@ class LFM2AudioModel(MmprojModel):
                 (self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.bias"), b),
             ]

+        # reshape conv weights
+        if name.startswith("conformer.pre_encode.conv.") and name.endswith(".bias"):
+            data_torch = data_torch[:, None, None]
+
         return [(self.map_tensor_name(name), data_torch)]

diff --git a/tools/mtmd/models/lfm2-audio-enc.cpp b/tools/mtmd/models/lfm2-audio-enc.cpp
index c3869c52e2..831099f8eb 100644
--- a/tools/mtmd/models/lfm2-audio-enc.cpp
+++ b/tools/mtmd/models/lfm2-audio-enc.cpp
@@ -1,8 +1,8 @@
 #include "models.h"

 ggml_cgraph * clip_graph_lfm2_audio_enc::build() {
-    const int n_frames = img.nx;
-    const int n_pos = n_frames / 2;
+    const int n_frames   = img.nx;
+    const int n_pos      = n_frames / 2;
     const int n_pos_embd = (((((n_frames + 1) / 2) + 1) / 2 + 1) / 2) * 2 - 1;
     GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos);

@@ -20,7 +20,7 @@ ggml_cgraph * clip_graph_lfm2_audio_enc::build() {
     {
         // layer.0 - conv2d
         cur = ggml_conv_2d(ctx0, model.pre_encode_conv_X_w[0], cur, 2, 2, 1, 1, 1, 1);
-        cur = ggml_add(ctx0, cur, ggml_reshape_4d(ctx0, model.pre_encode_conv_X_b[0], 1, 1, cur->ne[2], 1));
+        cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[0]);
         cb(cur, "conformer.pre_encode.conv.{}", 0);

         // layer.1 - relu
@@ -28,12 +28,12 @@ ggml_cgraph * clip_graph_lfm2_audio_enc::build() {

         // layer.2 conv2d dw
         cur = ggml_conv_2d_dw_direct(ctx0, model.pre_encode_conv_X_w[2], cur, 2, 2, 1, 1, 1, 1);
-        cur = ggml_add(ctx0, cur, ggml_reshape_4d(ctx0, model.pre_encode_conv_X_b[2], 1, 1, cur->ne[2], 1));
+        cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[2]);
         cb(cur, "conformer.pre_encode.conv.{}", 2);

         // layer.3 conv2d
         cur = ggml_conv_2d_direct(ctx0, model.pre_encode_conv_X_w[3], cur, 1, 1, 0, 0, 1, 1);
-        cur = ggml_add(ctx0, cur, ggml_reshape_4d(ctx0, model.pre_encode_conv_X_b[3], 1, 1, cur->ne[2], 1));
+        cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[3]);
         cb(cur, "conformer.pre_encode.conv.{}", 3);

         // layer.4 - relu
@@ -41,12 +41,12 @@ ggml_cgraph * clip_graph_lfm2_audio_enc::build() {

         // layer.5 conv2d dw
         cur = ggml_conv_2d_dw_direct(ctx0, model.pre_encode_conv_X_w[5], cur, 2, 2, 1, 1, 1, 1);
-        cur = ggml_add(ctx0, cur, ggml_reshape_4d(ctx0, model.pre_encode_conv_X_b[5], 1, 1, cur->ne[2], 1));
+        cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[5]);
         cb(cur, "conformer.pre_encode.conv.{}", 5);

         // layer.6 conv2d
         cur = ggml_conv_2d_direct(ctx0, model.pre_encode_conv_X_w[6], cur, 1, 1, 0, 0, 1, 1);
-        cur = ggml_add(ctx0, cur, ggml_reshape_4d(ctx0, model.pre_encode_conv_X_b[6], 1, 1, cur->ne[2], 1));
+        cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[6]);
         cb(cur, "conformer.pre_encode.conv.{}", 6);

         // layer.7 - relu
@@ -76,94 +76,74 @@ ggml_cgraph * clip_graph_lfm2_audio_enc::build() {
         cur = build_norm(cur, layer.ff_norm_w, layer.ff_norm_b, NORM_TYPE_NORMAL, 1e-5, il);
         cb(cur, "conformer.layers.{}.norm_feed_forward1", il);

-        cur = build_ffn(cur,
-                layer.ff_up_w, layer.ff_up_b,
-                nullptr, nullptr,
-                layer.ff_down_w, layer.ff_down_b,
-                FFN_SILU, il);
+        cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, nullptr, nullptr, layer.ff_down_w, layer.ff_down_b, FFN_SILU,
+                        il);
         cb(cur, "conformer.layers.{}.feed_forward1.linear2", il);

         const auto fc_factor = 0.5f;
-        residual = ggml_add(ctx0, residual, ggml_scale(ctx0, cur, fc_factor));
+        residual             = ggml_add(ctx0, residual, ggml_scale(ctx0, cur, fc_factor));

         // self-attention
         {
             cur = build_norm(residual, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, 1e-5, il);
             cb(cur, "conformer.layers.{}.norm_self_att", il);

-            cb(cur, "conformer.layers.{}.self_attn.id", il);
-            ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
-            Qcur = ggml_add(ctx0, Qcur, layer.q_b);
-            cb(Qcur, "conformer.layers.{}.self_attn.linear_q", il);
+            ggml_tensor * Qcur     = ggml_mul_mat(ctx0, layer.q_w, cur);
+            Qcur                   = ggml_add(ctx0, Qcur, layer.q_b);
+            Qcur                   = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, Qcur->ne[1]);
+            ggml_tensor * Q_bias_u = ggml_add(ctx0, Qcur, layer.pos_bias_u);
+            Q_bias_u               = ggml_cont(ctx0, ggml_permute(ctx0, Q_bias_u, 0, 2, 1, 3));
+            ggml_tensor * Q_bias_v = ggml_add(ctx0, Qcur, layer.pos_bias_v);
+            Q_bias_v               = ggml_cont(ctx0, ggml_permute(ctx0, Q_bias_v, 0, 2, 1, 3));

             ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
-            Kcur = ggml_add(ctx0, Kcur, layer.k_b);
-            cb(Kcur, "conformer.layers.{}.self_attn.linear_k", il);
+            Kcur               = ggml_add(ctx0, Kcur, layer.k_b);
+            Kcur               = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, Kcur->ne[1]);
+            Kcur               = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));

             ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
-            Vcur = ggml_add(ctx0, Vcur, layer.v_b);
-            cb(Vcur, "conformer.layers.{}.self_attn.linear_v", il);
+            Vcur               = ggml_add(ctx0, Vcur, layer.v_b);
+            Vcur               = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, Vcur->ne[1]);
+            Vcur               = ggml_cont(ctx0, ggml_permute(ctx0, Vcur, 1, 2, 0, 3));

-            Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, Qcur->ne[1]);
-            Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, Kcur->ne[1]);
-            Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, Vcur->ne[1]);
-
-            ggml_tensor * Q_bias_u = ggml_add(ctx0, Qcur, layer.pos_bias_u);
-            ggml_tensor * Q_bias_v = ggml_add(ctx0, Qcur, layer.pos_bias_v);
-
-            Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
-            Q_bias_u = ggml_cont(ctx0, ggml_permute(ctx0, Q_bias_u, 0, 2, 1, 3));
+            // build_attn won't fit due to matrix_ac and matrix_bd separation
             ggml_tensor * matrix_ac = ggml_mul_mat(ctx0, Q_bias_u, Kcur);
-            matrix_ac = ggml_cont(ctx0, ggml_permute(ctx0, matrix_ac, 1, 0, 2, 3));
+            matrix_ac               = ggml_cont(ctx0, ggml_permute(ctx0, matrix_ac, 1, 0, 2, 3));
             cb(matrix_ac, "conformer.layers.{}.self_attn.id3", il);

             auto * p = ggml_mul_mat(ctx0, layer.linear_pos_w, pos_emb);
             cb(p, "conformer.layers.{}.self_attn.linear_pos", il);
             p = ggml_reshape_3d(ctx0, p, d_head, n_head, p->ne[1]);
+            p = ggml_cont(ctx0, ggml_permute(ctx0, p, 0, 2, 1, 3));

-            Q_bias_v = ggml_cont(ctx0, ggml_permute(ctx0, Q_bias_v, 0, 2, 1, 3));
-            cb(Q_bias_v, "conformer.layers.{}.self_attn.id0", il);
-            p = ggml_cont(ctx0, ggml_permute(ctx0, p, 1, 2, 0, 3));
-            cb(p, "conformer.layers.{}.self_attn.id1", il);
-
-            p = ggml_cont(ctx0, ggml_permute(ctx0, p, 1, 0, 2, 3));
             auto * matrix_bd = ggml_mul_mat(ctx0, Q_bias_v, p);
-            matrix_bd = ggml_cont(ctx0, ggml_permute(ctx0, matrix_bd, 1, 0, 2, 3));
-
+            matrix_bd        = ggml_cont(ctx0, ggml_permute(ctx0, matrix_bd, 1, 0, 2, 3));

             // rel shift
             {
                 const auto pos_len = matrix_bd->ne[0];
-                const auto q_len = matrix_bd->ne[1];
+                const auto q_len   = matrix_bd->ne[1];
                 const auto h = matrix_bd->ne[2];
-                matrix_bd = ggml_pad(ctx0, matrix_bd, 1, 0, 0, 0);
-                matrix_bd = ggml_roll(ctx0, matrix_bd, 1, 0, 0, 0);
-                matrix_bd = ggml_reshape_3d(ctx0, matrix_bd, q_len, pos_len + 1, h);
-                matrix_bd = ggml_cont(ctx0, ggml_view_3d(ctx0, matrix_bd,
-                            q_len, pos_len, h,
-                            matrix_bd->nb[1], matrix_bd->nb[2], matrix_bd->nb[0] * q_len));
-                matrix_bd = ggml_reshape_3d(ctx0, matrix_bd, pos_len, q_len, h);
+                matrix_bd = ggml_pad(ctx0, matrix_bd, 1, 0, 0, 0);
+                matrix_bd = ggml_roll(ctx0, matrix_bd, 1, 0, 0, 0);
+                matrix_bd = ggml_reshape_3d(ctx0, matrix_bd, q_len, pos_len + 1, h);
+                matrix_bd = ggml_cont(ctx0, ggml_view_3d(ctx0, matrix_bd, q_len, pos_len, h, matrix_bd->nb[1],
+                                                         matrix_bd->nb[2], matrix_bd->nb[0] * q_len));
+                matrix_bd = ggml_reshape_3d(ctx0, matrix_bd, pos_len, q_len, h);
             }
-            matrix_bd = ggml_cont(ctx0, ggml_view_3d(ctx0, matrix_bd,
-                        matrix_ac->ne[0], matrix_bd->ne[1], matrix_bd->ne[2],
-                        matrix_bd->nb[1], matrix_bd->nb[2], 0));
+            matrix_bd = ggml_cont(ctx0, ggml_view_3d(ctx0, matrix_bd, matrix_ac->ne[0], matrix_bd->ne[1],
+                                                     matrix_bd->ne[2], matrix_bd->nb[1], matrix_bd->nb[2], 0));
             auto * scores = ggml_add(ctx0, matrix_ac, matrix_bd);
-            scores = ggml_scale(ctx0, scores, 1.0f / std::sqrt(d_head));
+            scores        = ggml_scale(ctx0, scores, 1.0f / std::sqrt(d_head));
             cb(scores, "conformer.layers.{}.self_attn.id0", il);

-
             ggml_tensor * attn = ggml_soft_max(ctx0, scores);
-            // TODO(tarek): combine permutes
-            Vcur = ggml_cont(ctx0, ggml_permute(ctx0, Vcur, 0, 2, 1, 3));
-            Vcur = ggml_cont(ctx0, ggml_permute(ctx0, Vcur, 1, 0, 2, 3));
-            ggml_tensor * x = ggml_mul_mat(ctx0, attn, Vcur);
-            // TODO(tarek): combine permutes
-            x = ggml_cont(ctx0, ggml_permute(ctx0, x, 1, 0, 2, 3));
-            x = ggml_cont(ctx0, ggml_permute(ctx0, x, 0, 2, 1, 3));
-            x = ggml_reshape_2d(ctx0, x, x->ne[0] * x->ne[1], x->ne[2]);
+            ggml_tensor * x    = ggml_mul_mat(ctx0, attn, Vcur);
+            x                  = ggml_cont(ctx0, ggml_permute(ctx0, x, 2, 0, 1, 3));
+            x                  = ggml_reshape_2d(ctx0, x, x->ne[0] * x->ne[1], x->ne[2]);

-            x = ggml_mul_mat(ctx0, layer.o_w, x);
+            x                 = ggml_mul_mat(ctx0, layer.o_w, x);
             ggml_tensor * out = ggml_add(ctx0, x, layer.o_b);
             cb(out, "conformer.layers.{}.self_attn.linear_out", il);
@@ -171,57 +151,44 @@ ggml_cgraph * clip_graph_lfm2_audio_enc::build() {
         }

         residual = ggml_add(ctx0, residual, cur);
-        cur = build_norm(residual, layer.norm_conv_w, layer.norm_conv_b, NORM_TYPE_NORMAL, 1e-5, il);
+        cur      = build_norm(residual, layer.norm_conv_w, layer.norm_conv_b, NORM_TYPE_NORMAL, 1e-5, il);
         cb(cur, "conformer.layers.{}.norm_conv", il);

         // conv
         {
             auto * x = cur;
-            auto * conv_pw1_w = ggml_reshape_2d(ctx0, layer.conv_pw1_w, layer.conv_pw1_w->ne[1], layer.conv_pw1_w->ne[2]);
+            auto * conv_pw1_w =
+                ggml_reshape_2d(ctx0, layer.conv_pw1_w, layer.conv_pw1_w->ne[1], layer.conv_pw1_w->ne[2]);
             x = ggml_mul_mat(ctx0, conv_pw1_w, x);
             x = ggml_add(ctx0, x, layer.conv_pw1_b);
-            x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
             cb(x, "conformer.layers.{}.conv.pointwise_conv1", il);

-            x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
-
-            // TODO: add support for torch.nn.functional.glu
+            // ggml_glu doesn't support sigmoid
             {
-                int64_t d = x->ne[0] / 2;
-                ggml_tensor * gate = ggml_sigmoid(ctx0, ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], d * x->nb[0]));
-                x = ggml_mul(ctx0, ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], 0), gate);
-                x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
+                int64_t d           = x->ne[0] / 2;
+                ggml_tensor * gate  = ggml_sigmoid(ctx0, ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], d * x->nb[0]));
+                x                   = ggml_mul(ctx0, ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], 0), gate);
+                x                   = ggml_cont(ctx0, ggml_transpose(ctx0, x));
             }

             // use ggml_ssm_conv for f32 precision
-            x = ggml_pad(ctx0, x, 4, 0, 0, 0);
-            x = ggml_roll(ctx0, x, 4, 0, 0, 0);
-            x = ggml_pad(ctx0, x, 4, 0, 0, 0);
-            x = ggml_cont(ctx0, x);
+            x                = ggml_pad(ctx0, x, 4, 0, 0, 0);
+            x                = ggml_roll(ctx0, x, 4, 0, 0, 0);
+            x                = ggml_pad(ctx0, x, 4, 0, 0, 0);
+            x                = ggml_cont(ctx0, x);
             auto * conv_dw_w = ggml_reshape_2d(ctx0, layer.conv_dw_w, layer.conv_dw_w->ne[0], layer.conv_dw_w->ne[2]);
-            x = ggml_ssm_conv(ctx0, x, conv_dw_w);
-            x = ggml_add(ctx0, x, ggml_reshape_1d(ctx0, layer.conv_dw_b, layer.conv_dw_b->ne[0]));
-            x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
+            x                = ggml_ssm_conv(ctx0, x, conv_dw_w);
+            x                = ggml_add(ctx0, x, ggml_reshape_1d(ctx0, layer.conv_dw_b, layer.conv_dw_b->ne[0]));

-            cb(x, "conformer.layers.{}.conv.depthwise_conv", il);
-
-            {
-                x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
-                x = ggml_add(ctx0, ggml_mul(ctx0, x, layer.conv_norm_w), layer.conv_norm_b);
-                x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
-                cb(x, "conformer.layers.{}.conv.batch_norm", il);
-            }
+            x = ggml_add(ctx0, ggml_mul(ctx0, x, layer.conv_norm_w), layer.conv_norm_b);
             x = ggml_silu(ctx0, x);

             // pointwise_conv2
-            x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
-            auto * conv_pw2_w = ggml_reshape_2d(ctx0, layer.conv_pw2_w, layer.conv_pw2_w->ne[1], layer.conv_pw2_w->ne[2]);
+            auto * conv_pw2_w =
+                ggml_reshape_2d(ctx0, layer.conv_pw2_w, layer.conv_pw2_w->ne[1], layer.conv_pw2_w->ne[2]);
             x = ggml_mul_mat(ctx0, conv_pw2_w, x);
             x = ggml_add(ctx0, x, layer.conv_pw2_b);
-            x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
-            cb(x, "conformer.layers.{}.conv.pointwise_conv2", il);

-            x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
             cur = x;
         }

@@ -230,11 +197,8 @@ ggml_cgraph * clip_graph_lfm2_audio_enc::build() {
         cur = build_norm(residual, layer.ff_norm_1_w, layer.ff_norm_1_b, NORM_TYPE_NORMAL, 1e-5, il);
         cb(cur, "conformer.layers.{}.norm_feed_forward2", il);

-        cur = build_ffn(cur,
-                layer.ff_up_1_w, layer.ff_up_1_b,
-                nullptr, nullptr,
-                layer.ff_down_1_w, layer.ff_down_1_b,
-                FFN_SILU, il); // TODO(tarek): read activation for ffn from hparams
+        cur = build_ffn(cur, layer.ff_up_1_w, layer.ff_up_1_b, nullptr, nullptr, layer.ff_down_1_w, layer.ff_down_1_b,
+                        FFN_SILU, il); // TODO(tarek): read activation for ffn from hparams
         cb(cur, "conformer.layers.{}.feed_forward2.linear2", il);

         residual = ggml_add(ctx0, residual, ggml_scale(ctx0, cur, fc_factor));
@@ -245,22 +209,12 @@ ggml_cgraph * clip_graph_lfm2_audio_enc::build() {
     }

     // audio adapter
-    {
-        cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
-        cb(cur, "audio_adapter.model.{}", 0);
-        cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-        cur = ggml_add(ctx0, cur, model.mm_1_b);
-        cb(cur, "audio_adapter.model.{}", 1);
-        cur = ggml_gelu_erf(ctx0, cur);
-        cb(cur, "audio_adapter.model.{}", 2);
-        cur = ggml_mul_mat(ctx0, model.mm_3_w, cur);
-        cur = ggml_add(ctx0, cur, model.mm_3_b);
-        cb(cur, "audio_adapter.model.{}", 3);
-    }
+    cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
+    cb(cur, "audio_adapter.model.{}", 0);
+    cur = build_ffn(cur, model.mm_1_w, model.mm_1_b, nullptr, nullptr, model.mm_3_w, model.mm_3_b, FFN_GELU_ERF, -1);

     cb(cur, "projected", -1);

-
     ggml_build_forward_expand(gf, cur);

     return gf;
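The sigmoid-gated block kept above implements GLU by hand with two views and a multiply, since ggml's fused GLU ops don't cover a plain sigmoid gate here. A PyTorch sketch of the equivalence being relied on (test values illustrative):

    import torch
    import torch.nn.functional as F

    # GLU splits the channel dim in half and gates the first half with the
    # sigmoid of the second half - the same math as the manual ggml views.
    x = torch.randn(6, 16)
    a, b = x.chunk(2, dim=-1)
    manual = a * torch.sigmoid(b)
    assert torch.allclose(manual, F.glu(x, dim=-1))
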

From cea578bc8ceed474ef335cbe6d142724a282b186 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Tue, 16 Dec 2025 16:58:00 +0100
Subject: [PATCH 6/8] rename functions to conformer

---
 tools/mtmd/CMakeLists.txt                               | 2 +-
 tools/mtmd/clip-impl.h                                  | 2 +-
 tools/mtmd/clip.cpp                                     | 2 +-
 tools/mtmd/models/{lfm2-audio-enc.cpp => conformer.cpp} | 2 +-
 tools/mtmd/models/models.h                              | 4 ++--
 tools/mtmd/mtmd-audio.cpp                               | 7 ++++---
 tools/mtmd/mtmd-audio.h                                 | 4 ++--
 tools/mtmd/mtmd.cpp                                     | 2 +-
 8 files changed, 13 insertions(+), 12 deletions(-)
 rename tools/mtmd/models/{lfm2-audio-enc.cpp => conformer.cpp} (99%)
diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt
index 14a5bac07b..dd7590086e 100644
--- a/tools/mtmd/CMakeLists.txt
+++ b/tools/mtmd/CMakeLists.txt
@@ -15,9 +15,9 @@ add_library(mtmd
     clip-graph.h
     models/models.h
     models/cogvlm.cpp
+    models/conformer.cpp
     models/internvl.cpp
     models/kimivl.cpp
-    models/lfm2-audio-enc.cpp
     models/llama4.cpp
     models/llava.cpp
     models/minicpmv.cpp
diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h
index 4411478459..25c3abbf9c 100644
--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@@ -133,7 +133,7 @@
 #define TN_TOK_BOI          "v.boi"
 #define TN_TOK_EOI          "v.eoi"

-// lfm2
+// (conformer) lfm2
 #define TN_PRE_ENCODE_OUT   "a.pre_encode.out.%s"
 #define TN_FFN_NORM         "%s.blk.%d.ffn_norm.%s"
 #define TN_FFN_NORM_1       "%s.blk.%d.ffn_norm_1.%s"
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index bdc712bd45..ea6f3ff06d 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -844,7 +844,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             } break;
         case PROJECTOR_TYPE_LFM2A:
             {
-                builder = std::make_unique<clip_graph_lfm2_audio_enc>(ctx, img);
+                builder = std::make_unique<clip_graph_conformer>(ctx, img);
             } break;
         default:
             GGML_ABORT("missing cgraph builder");
diff --git a/tools/mtmd/models/lfm2-audio-enc.cpp b/tools/mtmd/models/conformer.cpp
similarity index 99%
rename from tools/mtmd/models/lfm2-audio-enc.cpp
rename to tools/mtmd/models/conformer.cpp
index 831099f8eb..aeaeb79fac 100644
--- a/tools/mtmd/models/lfm2-audio-enc.cpp
+++ b/tools/mtmd/models/conformer.cpp
@@ -1,6 +1,6 @@
 #include "models.h"

-ggml_cgraph * clip_graph_lfm2_audio_enc::build() {
+ggml_cgraph * clip_graph_conformer::build() {
     const int n_frames   = img.nx;
     const int n_pos      = n_frames / 2;
     const int n_pos_embd = (((((n_frames + 1) / 2) + 1) / 2 + 1) / 2) * 2 - 1;
diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h
index 46cf5ac8f8..4935e92f15 100644
--- a/tools/mtmd/models/models.h
+++ b/tools/mtmd/models/models.h
@@ -57,7 +57,7 @@ struct clip_graph_whisper_enc : clip_graph {
     ggml_cgraph * build() override;
 };

-struct clip_graph_lfm2_audio_enc : clip_graph {
-    clip_graph_lfm2_audio_enc(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+struct clip_graph_conformer : clip_graph {
+    clip_graph_conformer(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
     ggml_cgraph * build() override;
 };
diff --git a/tools/mtmd/mtmd-audio.cpp b/tools/mtmd/mtmd-audio.cpp
index 53c0d6ab9d..bf68847da4 100644
--- a/tools/mtmd/mtmd-audio.cpp
+++ b/tools/mtmd/mtmd-audio.cpp
@@ -537,9 +537,10 @@ bool mtmd_audio_preprocessor_whisper::preprocess(
 }

 //
-// mtmd_audio_preprocessor_lfm2
+// mtmd_audio_preprocessor_conformer
 //
-void mtmd_audio_preprocessor_lfm2::initialize() {
+
+void mtmd_audio_preprocessor_conformer::initialize() {
     g_cache.fill_sin_cos_table(hparams.audio_n_fft);
     g_cache.fill_hann_window(hparams.audio_window_len, true);
     g_cache.fill_mel_filterbank_matrix(
@@ -548,7 +549,7 @@ void mtmd_audio_preprocessor_lfm2::initialize() {
             hparams.audio_sample_rate);
 }

-bool mtmd_audio_preprocessor_lfm2::preprocess(
+bool mtmd_audio_preprocessor_conformer::preprocess(
         const float * samples,
         size_t n_samples,
         std::vector<mtmd_audio_mel> & output) {
     bool preprocess(const float * samples, size_t n_samples, std::vector<float> & output) override;
 };
 
-struct mtmd_audio_preprocessor_lfm2: mtmd_audio_preprocessor {
-    mtmd_audio_preprocessor_lfm2(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
+struct mtmd_audio_preprocessor_conformer : mtmd_audio_preprocessor {
+    mtmd_audio_preprocessor_conformer(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
     void initialize() override;
     bool preprocess(const float * samples, size_t n_samples, std::vector<float> & output) override;
 };
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index 15d3b67917..1a829ed4e5 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -328,7 +328,7 @@ struct mtmd_context {
             audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
             break;
         case PROJECTOR_TYPE_LFM2A:
-            audio_preproc = std::make_unique<mtmd_audio_preprocessor_lfm2>(ctx_a);
+            audio_preproc = std::make_unique<mtmd_audio_preprocessor_conformer>(ctx_a);
             break;
         default:
             GGML_ABORT("unsupported audio projector type");

From a3ebc93d7193dc72e73b9602047fc34e3065cc3b Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Tue, 16 Dec 2025 17:20:13 +0100
Subject: [PATCH 7/8] remove some redundant ggml_cont

---
 tools/mtmd/models/conformer.cpp | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/tools/mtmd/models/conformer.cpp b/tools/mtmd/models/conformer.cpp
index aeaeb79fac..abed3bbf43 100644
--- a/tools/mtmd/models/conformer.cpp
+++ b/tools/mtmd/models/conformer.cpp
@@ -92,10 +92,11 @@ ggml_cgraph * clip_graph_conformer::build() {
             Qcur = ggml_add(ctx0, Qcur, layer.q_b);
             Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, Qcur->ne[1]);
             ggml_tensor * Q_bias_u = ggml_add(ctx0, Qcur, layer.pos_bias_u);
-            Q_bias_u = ggml_cont(ctx0, ggml_permute(ctx0, Q_bias_u, 0, 2, 1, 3));
+            Q_bias_u = ggml_permute(ctx0, Q_bias_u, 0, 2, 1, 3);
             ggml_tensor * Q_bias_v = ggml_add(ctx0, Qcur, layer.pos_bias_v);
-            Q_bias_v = ggml_cont(ctx0, ggml_permute(ctx0, Q_bias_v, 0, 2, 1, 3));
+            Q_bias_v = ggml_permute(ctx0, Q_bias_v, 0, 2, 1, 3);
 
+            // TODO @ngxson : some cont can/should be removed when ggml_mul_mat supports these cases
             ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
             Kcur = ggml_add(ctx0, Kcur, layer.k_b);
             Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, Kcur->ne[1]);
@@ -114,7 +115,7 @@ ggml_cgraph * clip_graph_conformer::build() {
             auto * p = ggml_mul_mat(ctx0, layer.linear_pos_w, pos_emb);
             cb(p, "conformer.layers.{}.self_attn.linear_pos", il);
             p = ggml_reshape_3d(ctx0, p, d_head, n_head, p->ne[1]);
-            p = ggml_cont(ctx0, ggml_permute(ctx0, p, 0, 2, 1, 3));
+            p = ggml_permute(ctx0, p, 0, 2, 1, 3);
 
             auto * matrix_bd = ggml_mul_mat(ctx0, Q_bias_v, p);
             matrix_bd = ggml_cont(ctx0, ggml_permute(ctx0, matrix_bd, 1, 0, 2, 3));
@@ -127,24 +128,24 @@ ggml_cgraph * clip_graph_conformer::build() {
                 matrix_bd = ggml_pad(ctx0, matrix_bd, 1, 0, 0, 0);
                 matrix_bd = ggml_roll(ctx0, matrix_bd, 1, 0, 0, 0);
                 matrix_bd = ggml_reshape_3d(ctx0, matrix_bd, q_len, pos_len + 1, h);
-                matrix_bd = ggml_cont(ctx0, ggml_view_3d(ctx0, matrix_bd, q_len, pos_len, h, matrix_bd->nb[1],
-                                                         matrix_bd->nb[2], matrix_bd->nb[0] * q_len));
-                matrix_bd = ggml_reshape_3d(ctx0, matrix_bd, pos_len, q_len, h);
+                matrix_bd = ggml_view_3d(ctx0, matrix_bd, q_len, pos_len, h, matrix_bd->nb[1],
+                                         matrix_bd->nb[2], matrix_bd->nb[0] * q_len);
+                matrix_bd = ggml_cont_3d(ctx0, matrix_bd, pos_len, q_len, h);
             }
 
-            matrix_bd = ggml_cont(ctx0, ggml_view_3d(ctx0, matrix_bd, matrix_ac->ne[0], matrix_bd->ne[1],
-                                                     matrix_bd->ne[2], matrix_bd->nb[1], matrix_bd->nb[2], 0));
+            matrix_bd = ggml_view_3d(ctx0, matrix_bd, matrix_ac->ne[0], matrix_bd->ne[1],
+                                     matrix_bd->ne[2], matrix_bd->nb[1], matrix_bd->nb[2], 0);
 
             auto * scores = ggml_add(ctx0, matrix_ac, matrix_bd);
             scores = ggml_scale(ctx0, scores, 1.0f / std::sqrt(d_head));
             cb(scores, "conformer.layers.{}.self_attn.id0", il);
 
             ggml_tensor * attn = ggml_soft_max(ctx0, scores);
 
             ggml_tensor * x = ggml_mul_mat(ctx0, attn, Vcur);
-            x = ggml_cont(ctx0, ggml_permute(ctx0, x, 2, 0, 1, 3));
-            x = ggml_reshape_2d(ctx0, x, x->ne[0] * x->ne[1], x->ne[2]);
+            x = ggml_permute(ctx0, x, 2, 0, 1, 3);
+            x = ggml_cont_2d(ctx0, x, x->ne[0] * x->ne[1], x->ne[2]);
 
-            x = ggml_mul_mat(ctx0, layer.o_w, x);
-            ggml_tensor * out = ggml_add(ctx0, x, layer.o_b);
+            ggml_tensor * out = ggml_mul_mat(ctx0, layer.o_w, x);
+            out = ggml_add(ctx0, out, layer.o_b);
             cb(out, "conformer.layers.{}.self_attn.linear_out", il);
 
             cur = out;
@@ -164,6 +165,7 @@ ggml_cgraph * clip_graph_conformer::build() {
             cb(x, "conformer.layers.{}.conv.pointwise_conv1", il);
 
             // ggml_glu doesn't support sigmoid
+            // TODO @ngxson : support this op in ggml
             {
                 int64_t d = x->ne[0] / 2;
                 ggml_tensor * gate = ggml_sigmoid(ctx0, ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], d * x->nb[0]));
@@ -175,7 +177,6 @@ ggml_cgraph * clip_graph_conformer::build() {
             x = ggml_pad(ctx0, x, 4, 0, 0, 0);
             x = ggml_roll(ctx0, x, 4, 0, 0, 0);
             x = ggml_pad(ctx0, x, 4, 0, 0, 0);
-            x = ggml_cont(ctx0, x);
             auto * conv_dw_w = ggml_reshape_2d(ctx0, layer.conv_dw_w, layer.conv_dw_w->ne[0], layer.conv_dw_w->ne[2]);
             x = ggml_ssm_conv(ctx0, x, conv_dw_w);
             x = ggml_add(ctx0, x, ggml_reshape_1d(ctx0, layer.conv_dw_b, layer.conv_dw_b->ne[0]));

From 72a41fd96093f9b9f6645108e4a519e036577f86 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Tue, 16 Dec 2025 17:34:20 +0100
Subject: [PATCH 8/8] fix missing tensor

---
 src/llama-model.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index ae8207ee1a..c9a3c5dfa2 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -6236,8 +6236,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 {
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM_LFM2, "weight"), {n_embd}, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
 
                     if (output == NULL) {
                         output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
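
Reviewer note on the matrix_bd handling that PATCH 7 simplifies: the ggml_pad / ggml_roll / ggml_view_3d sequence is the Transformer-XL-style relative shift. ggml_pad appends one zero along dim 0, ggml_roll rotates it to the front of each row, and the flat view then skips the first q_len elements so every query row lands on its own relative-position offsets. Below is a minimal NumPy sketch of the same computation (illustration only, not part of the patch; the axis order is reversed relative to ggml, where ne[0] is the fastest-varying dimension):

    import numpy as np

    def rel_shift(scores):
        # scores: (n_head, q_len, pos_len) -- query-vs-relative-position terms
        h, q, p = scores.shape
        x = np.pad(scores, ((0, 0), (0, 0), (0, 1)))  # ggml_pad: one zero at the end of each row
        x = np.roll(x, 1, axis=-1)                    # ggml_roll: rotate that zero to the front
        x = x.reshape(h, -1)[:, q:]                   # ggml_view_3d: skip the first q elements flat
        return x.reshape(h, q, p)                     # row i is now shifted to its own offsets

Dropping the surrounding ggml_cont calls appears safe here because the following ggml_cont_3d materializes the strided view exactly once instead of copying twice.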
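
A similar pattern appears in the depthwise-conv block: the pad/roll/pad triple in front of ggml_ssm_conv emulates "same" padding, presumably for a kernel of width 9 given the 4 zeros added on each side. The first ggml_pad appends 4 zeros, ggml_roll rotates them to the front, and the second ggml_pad appends 4 more, so the causal sliding window of ggml_ssm_conv sees a centered window per output. A NumPy sketch under the assumption that ggml_ssm_conv computes a per-channel sliding dot product:

    import numpy as np

    def same_conv_1d(x, w):
        # x: (n,) one channel of the sequence; w: (k,) depthwise kernel, k odd
        n, k = x.shape[0], w.shape[0]
        pad = (k - 1) // 2                      # 4 for a kernel of width 9
        x = np.concatenate([x, np.zeros(pad)])  # ggml_pad: zeros appended at the end
        x = np.roll(x, pad)                     # ggml_roll: rotate them to the front
        x = np.concatenate([x, np.zeros(pad)])  # ggml_pad: zeros appended again
        return np.array([x[i:i + k] @ w for i in range(n)])

Since ggml_pad already yields a contiguous tensor, the ggml_cont removed in PATCH 7 was a redundant copy.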