mtmd: refactor audio preprocessing (#17978)
* mtmd: refactor audio preprocessing
* refactor
* wip
* wip (2)
* improve constructor
* fix use_natural_log
* fix padding for short input
* clean up
* remove need_chunking
---------
Co-authored-by: Tarek <tdakhran@users.noreply.github.com>
parent 4a4f7e6550
commit 96a181a933
@@ -65,6 +65,13 @@ struct clip_hparams {
     int32_t n_mel_bins = 0; // whisper preprocessor
     int32_t proj_stack_factor = 0; // ultravox
 
+    // audio-to-mel preprocessor params
+    int32_t audio_chunk_len = -1; // in seconds
+    int32_t audio_sample_rate = -1;
+    int32_t audio_n_fft = -1;
+    int32_t audio_window_len = -1;
+    int32_t audio_hop_len = -1;
+
     // legacy
     bool has_llava_projector = false;
     int minicpmv_version = 0;
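
Editor's note: these fields mirror the Whisper-style mel front-end parameters. A minimal sketch, not part of the diff, of the per-chunk sizes implied by the defaults the loader sets further down, assuming one STFT frame per hop:

    #include <cstdint>
    #include <cstdio>

    int main() {
        // defaults set by clip_model_loader for whisper-style encoders
        const int32_t audio_chunk_len   = 30;    // seconds
        const int32_t audio_sample_rate = 16000; // Hz
        const int32_t audio_hop_len     = 160;   // samples (10 ms at 16 kHz)

        const int32_t samples_per_chunk = audio_chunk_len * audio_sample_rate; // 480000
        const int32_t frames_per_chunk  = samples_per_chunk / audio_hop_len;   // 3000 mel frames

        printf("%d samples -> %d mel frames per chunk\n",
               (int) samples_per_chunk, (int) frames_per_chunk);
        return 0;
    }
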
@@ -278,3 +285,5 @@ struct clip_model {
             || proj_type == PROJECTOR_TYPE_VOXTRAL;
     }
 };
+
+const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx);
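
The new clip_get_hparams() accessor exposes these parameters outside of clip.cpp. A hedged usage sketch (ctx_a stands for any loaded audio clip_ctx; this caller code is an assumption, not taken from the diff):

    const clip_hparams * hp = clip_get_hparams(ctx_a);
    if (hp->audio_sample_rate > 0) {
        // the caller is expected to feed PCM at this rate, e.g. resample first
    }
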
@@ -1170,11 +1170,15 @@ struct clip_model_loader {
                     model.proj_type == PROJECTOR_TYPE_VOXTRAL ||
                     model.proj_type == PROJECTOR_TYPE_GLMA;
                 get_u32(KEY_A_PROJ_STACK_FACTOR, hparams.proj_stack_factor, require_stack);
-                if (hparams.n_mel_bins != 128) {
-                    throw std::runtime_error(string_format("%s: only 128 mel bins are supported for ultravox\n", __func__));
-                }
                 hparams.ffn_op = FFN_GELU_ERF;
                 log_ffn_op = "gelu_erf"; // temporary solution for logging
+
+                // audio preprocessing params
+                hparams.audio_chunk_len = 30; // in seconds
+                hparams.audio_sample_rate = 16000;
+                hparams.audio_n_fft = 400;
+                hparams.audio_window_len = 400;
+                hparams.audio_hop_len = 160;
             } break;
         default:
             break;
@@ -1212,6 +1216,11 @@ struct clip_model_loader {
             LOG_INF("\n--- audio hparams ---\n");
             LOG_INF("%s: n_mel_bins: %d\n", __func__, hparams.n_mel_bins);
             LOG_INF("%s: proj_stack_factor: %d\n", __func__, hparams.proj_stack_factor);
+            LOG_INF("%s: audio_chunk_len: %d\n", __func__, hparams.audio_chunk_len);
+            LOG_INF("%s: audio_sample_rate: %d\n", __func__, hparams.audio_sample_rate);
+            LOG_INF("%s: audio_n_fft: %d\n", __func__, hparams.audio_n_fft);
+            LOG_INF("%s: audio_window_len: %d\n", __func__, hparams.audio_window_len);
+            LOG_INF("%s: audio_hop_len: %d\n", __func__, hparams.audio_hop_len);
         }
 
         LOG_INF("\n");
         LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0);
@@ -3478,3 +3487,7 @@ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel
     batch->entries.push_back(clip_image_f32_ptr(audio));
     batch->is_audio = true;
 }
+
+const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx) {
+    return &ctx->model.hparams;
+}
(File diff suppressed because it is too large.)
@@ -1,6 +1,7 @@
 #pragma once
 
 #include "ggml.h"
+#include "clip-model.h"
 
 #include <cstdint>
 #include <vector>
@@ -8,18 +9,7 @@
 
 #define MTMD_INTERNAL_HEADER
 
-#define WHISPER_ASSERT GGML_ASSERT
-
-#define WHISPER_SAMPLE_RATE 16000
-#define WHISPER_N_FFT 400
-#define WHISPER_HOP_LENGTH 160
-#define WHISPER_CHUNK_SIZE 30
-
-#define COMMON_SAMPLE_RATE 16000
-
-namespace whisper_preprocessor {
-
-struct whisper_mel {
+struct mtmd_audio_mel {
     int n_len;
     int n_len_org;
     int n_mel;
@@ -27,23 +17,18 @@ struct whisper_mel {
     std::vector<float> data;
 };
 
-struct whisper_filters {
-    int32_t n_mel;
-    int32_t n_fft;
+struct mtmd_audio_preprocessor {
+    const clip_hparams & hparams;
 
-    std::vector<float> data;
+    mtmd_audio_preprocessor(const clip_ctx * ctx): hparams(*clip_get_hparams(ctx)) {}
+
+    virtual ~mtmd_audio_preprocessor() = default;
+    virtual void initialize() = 0; // NOT thread-safe
+    virtual bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) = 0;
 };
 
-bool preprocess_audio(
-    const float * samples,
-    size_t n_samples,
-    const whisper_filters & filters,
-    std::vector<whisper_mel> & output);
-
-} // namespace whisper_preprocessor
-
-namespace whisper_precalc_filters {
-
-whisper_preprocessor::whisper_filters get_128_bins();
-
-} // namespace whisper_precalc_filters
+struct mtmd_audio_preprocessor_whisper : mtmd_audio_preprocessor {
+    mtmd_audio_preprocessor_whisper(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
+    void initialize() override;
+    bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
+};
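
The header now defines an abstract preprocessor interface in place of free functions in a whisper namespace. A sketch of what a hypothetical additional backend could look like against this interface (mtmd_audio_preprocessor_custom and its internals are illustrative assumptions, not part of the commit):

    struct mtmd_audio_preprocessor_custom : mtmd_audio_preprocessor {
        mtmd_audio_preprocessor_custom(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}

        void initialize() override {
            // one-time setup, e.g. building a filter bank sized from
            // hparams.audio_n_fft and hparams.n_mel_bins; NOT thread-safe
        }

        bool preprocess(const float * samples, size_t n_samples,
                        std::vector<mtmd_audio_mel> & output) override {
            // turn raw PCM into one mel spectrogram per audio_chunk_len window
            (void) samples; (void) n_samples; (void) output;
            return true;
        }
    };
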
@@ -151,8 +151,7 @@ struct mtmd_context {
     // string template for slice image delimiters with row/col (idefics3)
     std::string sli_img_start_tmpl;
 
-    // for whisper, we pre-calculate the mel filter bank
-    whisper_preprocessor::whisper_filters w_filters;
+    std::unique_ptr<mtmd_audio_preprocessor> audio_preproc;
 
     // TODO @ngxson : add timings
@@ -317,14 +316,25 @@ struct mtmd_context {
         GGML_ASSERT(ctx_a != nullptr);
         projector_type proj = clip_get_projector_type(ctx_a);
 
-        if (clip_has_whisper_encoder(ctx_a)) {
-            // TODO @ngxson : check if model n_mel is 128 or 80
-            w_filters = whisper_precalc_filters::get_128_bins();
-        }
-
         LOG_WRN("%s: audio input is in experimental stage and may have reduced quality:\n"
                 "    https://github.com/ggml-org/llama.cpp/discussions/13759\n", __func__);
 
+        // set preprocessor
+        switch (proj) {
+            case PROJECTOR_TYPE_QWEN2A:
+            case PROJECTOR_TYPE_QWEN25O:
+            case PROJECTOR_TYPE_ULTRAVOX:
+            case PROJECTOR_TYPE_VOXTRAL:
+                audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
+                break;
+            default:
+                GGML_ABORT("unsupported audio projector type");
+        }
+
+        // initialize audio preprocessor
+        audio_preproc->initialize();
+
+        // set special tokens
         if (proj == PROJECTOR_TYPE_QWEN2A) {
             // <|audio_bos|> ... (embeddings) ... <|audio_eos|>
             aud_beg = "<|audio_bos|>";
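
The constructor now dispatches on projector type rather than on clip_has_whisper_encoder(). A condensed sketch of the same dispatch as a helper function (make_audio_preproc is a hypothetical name, not in the diff; all other identifiers are from the project):

    #include <memory>

    static std::unique_ptr<mtmd_audio_preprocessor> make_audio_preproc(projector_type proj, const clip_ctx * ctx_a) {
        switch (proj) {
            case PROJECTOR_TYPE_QWEN2A:
            case PROJECTOR_TYPE_QWEN25O:
            case PROJECTOR_TYPE_ULTRAVOX:
            case PROJECTOR_TYPE_VOXTRAL:
                // all current audio models share the whisper-style mel front-end
                return std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
            default:
                GGML_ABORT("unsupported audio projector type");
        }
    }
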
@@ -653,11 +663,10 @@ struct mtmd_tokenizer {
         }
 
         // preprocess audio
-        GGML_ASSERT(ctx->w_filters.n_mel); // make sure we have filter preloaded
-        std::vector<whisper_preprocessor::whisper_mel> mel_spec_chunks;
+        std::vector<mtmd_audio_mel> mel_spec_chunks;
         const float * samples = (const float *)bitmap->data.data();
         size_t n_samples = bitmap->data.size() / sizeof(float);
-        bool ok = whisper_preprocessor::preprocess_audio(samples, n_samples, ctx->w_filters, mel_spec_chunks);
+        bool ok = ctx->audio_preproc->preprocess(samples, n_samples, mel_spec_chunks);
         if (!ok) {
             LOG_ERR("Unable to preprocess audio\n");
             return 2;
@@ -863,8 +872,7 @@ int mtmd_get_audio_bitrate(mtmd_context * ctx) {
     if (!ctx->ctx_a) {
         return -1;
     }
-    // for now, we assume that all audio models have the same bitrate
-    return 16000; // 16kHz
+    return clip_get_hparams(ctx->ctx_a)->audio_sample_rate;
 }
 
 //
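
The bitrate query now reflects the model's own hparams instead of a hard-coded 16 kHz. A hedged caller-side sketch (mtmd_get_audio_bitrate is the public API shown above; the resampling step is an assumption about typical usage):

    int sr = mtmd_get_audio_bitrate(ctx); // -1 when the model has no audio encoder
    if (sr > 0) {
        // resample the input PCM to `sr` before building an audio bitmap
    }
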