rename functions to conformer
This commit is contained in:
parent
ba9e59739c
commit
cea578bc8c
|
|
@ -15,9 +15,9 @@ add_library(mtmd
|
||||||
clip-graph.h
|
clip-graph.h
|
||||||
models/models.h
|
models/models.h
|
||||||
models/cogvlm.cpp
|
models/cogvlm.cpp
|
||||||
|
models/conformer.cpp
|
||||||
models/internvl.cpp
|
models/internvl.cpp
|
||||||
models/kimivl.cpp
|
models/kimivl.cpp
|
||||||
models/lfm2-audio-enc.cpp
|
|
||||||
models/llama4.cpp
|
models/llama4.cpp
|
||||||
models/llava.cpp
|
models/llava.cpp
|
||||||
models/minicpmv.cpp
|
models/minicpmv.cpp
|
||||||
|
|
|
||||||
|
|
@ -133,7 +133,7 @@
|
||||||
#define TN_TOK_BOI "v.boi"
|
#define TN_TOK_BOI "v.boi"
|
||||||
#define TN_TOK_EOI "v.eoi"
|
#define TN_TOK_EOI "v.eoi"
|
||||||
|
|
||||||
// lfm2
|
// (conformer) lfm2
|
||||||
#define TN_PRE_ENCODE_OUT "a.pre_encode.out.%s"
|
#define TN_PRE_ENCODE_OUT "a.pre_encode.out.%s"
|
||||||
#define TN_FFN_NORM "%s.blk.%d.ffn_norm.%s"
|
#define TN_FFN_NORM "%s.blk.%d.ffn_norm.%s"
|
||||||
#define TN_FFN_NORM_1 "%s.blk.%d.ffn_norm_1.%s"
|
#define TN_FFN_NORM_1 "%s.blk.%d.ffn_norm_1.%s"
|
||||||
|
|
|
||||||
|
|
@ -844,7 +844,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||||
} break;
|
} break;
|
||||||
case PROJECTOR_TYPE_LFM2A:
|
case PROJECTOR_TYPE_LFM2A:
|
||||||
{
|
{
|
||||||
builder = std::make_unique<clip_graph_lfm2_audio_enc>(ctx, img);
|
builder = std::make_unique<clip_graph_conformer>(ctx, img);
|
||||||
} break;
|
} break;
|
||||||
default:
|
default:
|
||||||
GGML_ABORT("missing cgraph builder");
|
GGML_ABORT("missing cgraph builder");
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
#include "models.h"
|
#include "models.h"
|
||||||
|
|
||||||
ggml_cgraph * clip_graph_lfm2_audio_enc::build() {
|
ggml_cgraph * clip_graph_conformer::build() {
|
||||||
const int n_frames = img.nx;
|
const int n_frames = img.nx;
|
||||||
const int n_pos = n_frames / 2;
|
const int n_pos = n_frames / 2;
|
||||||
const int n_pos_embd = (((((n_frames + 1) / 2) + 1) / 2 + 1) / 2) * 2 - 1;
|
const int n_pos_embd = (((((n_frames + 1) / 2) + 1) / 2 + 1) / 2) * 2 - 1;
|
||||||
|
|
@ -57,7 +57,7 @@ struct clip_graph_whisper_enc : clip_graph {
|
||||||
ggml_cgraph * build() override;
|
ggml_cgraph * build() override;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct clip_graph_lfm2_audio_enc : clip_graph {
|
struct clip_graph_conformer : clip_graph {
|
||||||
clip_graph_lfm2_audio_enc(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
clip_graph_conformer(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||||
ggml_cgraph * build() override;
|
ggml_cgraph * build() override;
|
||||||
};
|
};
|
||||||
|
|
|
||||||
|
|
@ -537,9 +537,10 @@ bool mtmd_audio_preprocessor_whisper::preprocess(
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
// mtmd_audio_preprocessor_lfm2
|
// mtmd_audio_preprocessor_conformer
|
||||||
//
|
//
|
||||||
void mtmd_audio_preprocessor_lfm2::initialize() {
|
|
||||||
|
void mtmd_audio_preprocessor_conformer::initialize() {
|
||||||
g_cache.fill_sin_cos_table(hparams.audio_n_fft);
|
g_cache.fill_sin_cos_table(hparams.audio_n_fft);
|
||||||
g_cache.fill_hann_window(hparams.audio_window_len, true);
|
g_cache.fill_hann_window(hparams.audio_window_len, true);
|
||||||
g_cache.fill_mel_filterbank_matrix(
|
g_cache.fill_mel_filterbank_matrix(
|
||||||
|
|
@ -548,7 +549,7 @@ void mtmd_audio_preprocessor_lfm2::initialize() {
|
||||||
hparams.audio_sample_rate);
|
hparams.audio_sample_rate);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool mtmd_audio_preprocessor_lfm2::preprocess(
|
bool mtmd_audio_preprocessor_conformer::preprocess(
|
||||||
const float * samples,
|
const float * samples,
|
||||||
size_t n_samples,
|
size_t n_samples,
|
||||||
std::vector<mtmd_audio_mel> & output) {
|
std::vector<mtmd_audio_mel> & output) {
|
||||||
|
|
|
||||||
|
|
@ -33,8 +33,8 @@ struct mtmd_audio_preprocessor_whisper : mtmd_audio_preprocessor {
|
||||||
bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
|
bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct mtmd_audio_preprocessor_lfm2: mtmd_audio_preprocessor {
|
struct mtmd_audio_preprocessor_conformer : mtmd_audio_preprocessor {
|
||||||
mtmd_audio_preprocessor_lfm2(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
|
mtmd_audio_preprocessor_conformer(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
|
||||||
void initialize() override;
|
void initialize() override;
|
||||||
bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
|
bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
|
||||||
};
|
};
|
||||||
|
|
|
||||||
|
|
@ -328,7 +328,7 @@ struct mtmd_context {
|
||||||
audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
|
audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
|
||||||
break;
|
break;
|
||||||
case PROJECTOR_TYPE_LFM2A:
|
case PROJECTOR_TYPE_LFM2A:
|
||||||
audio_preproc = std::make_unique<mtmd_audio_preprocessor_lfm2>(ctx_a);
|
audio_preproc = std::make_unique<mtmd_audio_preprocessor_conformer>(ctx_a);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
GGML_ABORT("unsupported audio projector type");
|
GGML_ABORT("unsupported audio projector type");
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue