diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt index 14a5bac07b..dd7590086e 100644 --- a/tools/mtmd/CMakeLists.txt +++ b/tools/mtmd/CMakeLists.txt @@ -15,9 +15,9 @@ add_library(mtmd clip-graph.h models/models.h models/cogvlm.cpp + models/conformer.cpp models/internvl.cpp models/kimivl.cpp - models/lfm2-audio-enc.cpp models/llama4.cpp models/llava.cpp models/minicpmv.cpp diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index 4411478459..25c3abbf9c 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -133,7 +133,7 @@ #define TN_TOK_BOI "v.boi" #define TN_TOK_EOI "v.eoi" -// lfm2 +// (conformer) lfm2 #define TN_PRE_ENCODE_OUT "a.pre_encode.out.%s" #define TN_FFN_NORM "%s.blk.%d.ffn_norm.%s" #define TN_FFN_NORM_1 "%s.blk.%d.ffn_norm_1.%s" diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index bdc712bd45..ea6f3ff06d 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -844,7 +844,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 } break; case PROJECTOR_TYPE_LFM2A: { - builder = std::make_unique<clip_graph_lfm2_audio_enc>(ctx, img); + builder = std::make_unique<clip_graph_conformer>(ctx, img); } break; default: GGML_ABORT("missing cgraph builder"); diff --git a/tools/mtmd/models/lfm2-audio-enc.cpp b/tools/mtmd/models/conformer.cpp similarity index 99% rename from tools/mtmd/models/lfm2-audio-enc.cpp rename to tools/mtmd/models/conformer.cpp index 831099f8eb..aeaeb79fac 100644 --- a/tools/mtmd/models/lfm2-audio-enc.cpp +++ b/tools/mtmd/models/conformer.cpp @@ -1,6 +1,6 @@ #include "models.h" -ggml_cgraph * clip_graph_lfm2_audio_enc::build() { +ggml_cgraph * clip_graph_conformer::build() { const int n_frames = img.nx; const int n_pos = n_frames / 2; const int n_pos_embd = (((((n_frames + 1) / 2) + 1) / 2 + 1) / 2) * 2 - 1; diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h index 46cf5ac8f8..4935e92f15 100644 --- a/tools/mtmd/models/models.h +++ b/tools/mtmd/models/models.h @@ -57,7 +57,7 @@ struct 
clip_graph_whisper_enc : clip_graph { ggml_cgraph * build() override; }; -struct clip_graph_lfm2_audio_enc : clip_graph { - clip_graph_lfm2_audio_enc(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} +struct clip_graph_conformer : clip_graph { + clip_graph_conformer(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} ggml_cgraph * build() override; }; diff --git a/tools/mtmd/mtmd-audio.cpp b/tools/mtmd/mtmd-audio.cpp index 53c0d6ab9d..bf68847da4 100644 --- a/tools/mtmd/mtmd-audio.cpp +++ b/tools/mtmd/mtmd-audio.cpp @@ -537,9 +537,10 @@ bool mtmd_audio_preprocessor_whisper::preprocess( } // -// mtmd_audio_preprocessor_lfm2 +// mtmd_audio_preprocessor_conformer // -void mtmd_audio_preprocessor_lfm2::initialize() { + +void mtmd_audio_preprocessor_conformer::initialize() { g_cache.fill_sin_cos_table(hparams.audio_n_fft); g_cache.fill_hann_window(hparams.audio_window_len, true); g_cache.fill_mel_filterbank_matrix( @@ -548,7 +549,7 @@ void mtmd_audio_preprocessor_lfm2::initialize() { hparams.audio_sample_rate); } -bool mtmd_audio_preprocessor_lfm2::preprocess( +bool mtmd_audio_preprocessor_conformer::preprocess( const float * samples, size_t n_samples, std::vector & output) { diff --git a/tools/mtmd/mtmd-audio.h b/tools/mtmd/mtmd-audio.h index ded0a30513..d484c9d030 100644 --- a/tools/mtmd/mtmd-audio.h +++ b/tools/mtmd/mtmd-audio.h @@ -33,8 +33,8 @@ struct mtmd_audio_preprocessor_whisper : mtmd_audio_preprocessor { bool preprocess(const float * samples, size_t n_samples, std::vector & output) override; }; -struct mtmd_audio_preprocessor_lfm2: mtmd_audio_preprocessor { - mtmd_audio_preprocessor_lfm2(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {} +struct mtmd_audio_preprocessor_conformer : mtmd_audio_preprocessor { + mtmd_audio_preprocessor_conformer(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {} void initialize() override; bool preprocess(const float * samples, size_t n_samples, std::vector & output) override; }; 
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 15d3b67917..1a829ed4e5 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -328,7 +328,7 @@ struct mtmd_context { audio_preproc = std::make_unique(ctx_a); break; case PROJECTOR_TYPE_LFM2A: - audio_preproc = std::make_unique<mtmd_audio_preprocessor_lfm2>(ctx_a); + audio_preproc = std::make_unique<mtmd_audio_preprocessor_conformer>(ctx_a); break; default: GGML_ABORT("unsupported audio projector type");