#pragma once #include "ggml.h" #include "clip-model.h" #include #include #include #define MTMD_INTERNAL_HEADER struct mtmd_audio_mel { int n_len; int n_len_org; int n_mel; std::vector data; }; struct mtmd_audio_mel_filters { int32_t n_mel; int32_t n_fft; std::vector data; }; // cache for audio processing, each processor instance owns its own cache struct mtmd_audio_cache { std::vector sin_vals; std::vector cos_vals; std::vector hann_window; mtmd_audio_mel_filters filters; void fill_sin_cos_table(int n); void fill_hann_window(int length, bool periodic); // Build mel filterbank matrix [n_mel × n_fft_bins] at runtime. // n_fft_bins must be (N_fft / 2 + 1). Example: if N_fft=512 -> n_fft_bins=257. void fill_mel_filterbank_matrix(int n_mel, int n_fft, int sample_rate, // e.g. 16000 float fmin = 0.0f, // e.g. 0.0 float fmax = -1.0f, // e.g. sr/2; pass -1 for auto bool slaney_area_norm = true, float scale = 1.0f // optional extra scaling ); }; struct mtmd_audio_preprocessor { const clip_hparams & hparams; mtmd_audio_preprocessor(const clip_ctx * ctx): hparams(*clip_get_hparams(ctx)) {} virtual ~mtmd_audio_preprocessor() = default; virtual void initialize() = 0; // NOT thread-safe virtual bool preprocess(const float * samples, size_t n_samples, std::vector & output) = 0; }; struct mtmd_audio_preprocessor_whisper : mtmd_audio_preprocessor { mtmd_audio_preprocessor_whisper(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {} void initialize() override; bool preprocess(const float * samples, size_t n_samples, std::vector & output) override; private: mtmd_audio_cache cache; }; struct mtmd_audio_preprocessor_conformer : mtmd_audio_preprocessor { mtmd_audio_preprocessor_conformer(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {} void initialize() override; bool preprocess(const float * samples, size_t n_samples, std::vector & output) override; private: mtmd_audio_cache cache; }; // // streaming ISTFT - converts spectrogram frames back to audio one frame at a time // struct mtmd_audio_streaming_istft { mtmd_audio_streaming_istft(int n_fft, int hop_length); // reset streaming state void reset(); // process a single STFT frame (streaming) // frame_spectrum: [n_fft_bins x 2] interleaved real/imag // returns: up to hop_length samples std::vector process_frame(const float * frame_spectrum); // flush remaining samples at end of stream std::vector flush(); private: int n_fft; int hop_length; int n_fft_bins; // Own cache for output processing mtmd_audio_cache cache; // Streaming state std::vector overlap_buffer; std::vector window_sum_buffer; int padding_to_remove; // Working buffers for IFFT std::vector ifft_in; std::vector ifft_out; };