#include "mtmd-audio.h"

#define _USE_MATH_DEFINES // for M_PI
#include <cmath>

#include <algorithm>
#include <cstdint>
#include <cstring>
#include <fstream>
#include <thread>
#include <vector>

// some of the code here is copied from whisper.cpp

// compile-time switch for the debug dumps below (filterbank print, mel JSON dump)
constexpr bool DEBUG = false;

// Precompute one full period of sin/cos sampled at n points.
// The DFT/FFT routines below index into these tables instead of calling sin/cos per sample.
void mtmd_audio_cache::fill_sin_cos_table(int n) {
    sin_vals.resize(n);
    cos_vals.resize(n);
    for (int i = 0; i < n; i++) {
        double theta = (2 * M_PI * i) / n;
        // evaluate at double precision and round once on assignment
        // (sinf/cosf would truncate theta to float *before* evaluating)
        sin_vals[i] = (float) sin(theta);
        cos_vals[i] = (float) cos(theta);
    }
}

// Hann window of the given length.
// periodic == true  -> denominator is `length`     (matches torch.hann_window(periodic=True))
// periodic == false -> denominator is `length - 1` (symmetric window)
void mtmd_audio_cache::fill_hann_window(int length, bool periodic) {
    hann_window.resize(length);
    int offset = -1;
    if (periodic) {
        offset = 0;
    }
    for (int i = 0; i < length; i++) {
        // double-precision argument; rounded to float on assignment
        hann_window[i] = 0.5 * (1.0 - cos((2.0 * M_PI * i) / (length + offset)));
    }
}

// Build a Slaney-scale mel filterbank matrix, n_mel rows x (n_fft/2 + 1) columns,
// stored row-major in filters.data (matches the librosa.filters.mel default scale).
//   fmax <= 0 means "use Nyquist".
//   slaney_area_norm divides each triangle by its bandwidth (librosa norm="slaney").
//   scale is an extra per-coefficient multiplier applied on top.
void mtmd_audio_cache::fill_mel_filterbank_matrix(int n_mel, int n_fft, int sample_rate,
                                                  float fmin, float fmax,
                                                  bool slaney_area_norm, float scale) {
    GGML_ASSERT(n_mel > 0 && n_fft > 1);

    if (fmax <= 0.0f) {
        fmax = 0.5f * sample_rate; // default to Nyquist
    }

    // Slaney scale (matches librosa default): linear below 1 kHz, logarithmic above
    const double min_log_hz   = 1000.0;
    const double lin_slope    = 3 / 200.;
    const double min_log_mel  = min_log_hz * lin_slope;
    const double log_step     = log(6.4) / 27.0;

    auto hz_to_mel = [min_log_hz, lin_slope, log_step, min_log_mel](const double f_hz) -> double {
        return (f_hz < min_log_hz)
            ? f_hz * lin_slope
            : min_log_mel + log(f_hz / min_log_hz) / log_step;
    };

    auto mel_to_hz = [min_log_hz, lin_slope, log_step, min_log_mel](const double m) -> double {
        return (m < min_log_mel)
            ? m / lin_slope
            : min_log_hz * exp((m - min_log_mel) * log_step);
    };

    // FFT bin spacing in Hz
    const double bin_hz_step = double(sample_rate) / double(n_fft);

    // mel grid: n_mel + 2 edges (left edge, n_mel centers, right edge)
    const double m_lo = hz_to_mel(fmin);
    const double m_hi = hz_to_mel(fmax);

    std::vector<double> mel_pts(n_mel + 2);
    for (int i = 0; i < n_mel + 2; ++i) {
        mel_pts[i] = m_lo + (m_hi - m_lo) * (double(i) / (n_mel + 1));
    }

    // convert the grid back to Hz
    std::vector<double> hz_pts(n_mel + 2);
    for (int i = 0; i < n_mel + 2; ++i) {
        hz_pts[i] = mel_to_hz(mel_pts[i]);
    }

    const int n_fft_bins = n_fft / 2 + 1;

    // triangular filterbank: filter m rises from hz_pts[m] to hz_pts[m+1], falls to hz_pts[m+2]
    std::vector<float> out(n_mel * n_fft_bins, 0);
    for (int m = 0; m < n_mel; ++m) {
        const double f_left   = hz_pts[m];
        const double f_center = hz_pts[m + 1];
        const double f_right  = hz_pts[m + 2];

        // clamp denominators to avoid division by zero on degenerate triangles
        const double denom_l = std::max(1e-30, f_center - f_left);
        const double denom_r = std::max(1e-30, f_right - f_center);
        const double enorm   = slaney_area_norm ? (2.0 / std::max(1e-30, f_right - f_left)) : 1.0;

        for (int k = 0; k < n_fft_bins; ++k) {
            const double f = k * bin_hz_step;
            double w = 0.0;
            if (f >= f_left && f <= f_center) {
                w = (f - f_left) / denom_l;
            } else if (f > f_center && f <= f_right) {
                w = (f_right - f) / denom_r;
            }
            out[size_t(m) * size_t(n_fft_bins) + size_t(k)] = float(w * enorm * scale);
        }
    }

    filters.n_mel = n_mel;
    filters.n_fft = n_fft;
    filters.data  = std::move(out);

    if (DEBUG) {
        // debug
        for (size_t i = 0; i < filters.data.size(); ++i) {
            if (filters.data[i] != 0.0f) {
                printf("filters[%zu] = %f\n", i, filters.data[i] * 1000.0f);
            }
        }
    }
}

// Unified DFT implementation for both forward and inverse transforms
// Template parameters:
//   Inverse:   false = DFT with exp(-2πi·k·n/N), no scaling
//              true  = IDFT with exp(+2πi·k·n/N), scales by 1/N
//   RealInput: true  = input is real-valued (stride 1), avoids imaginary computations
//              false = input is complex-valued (interleaved real/imag, stride 2)
// `out` is interleaved complex, 2*N floats.
template <bool Inverse, bool RealInput>
static void dft_impl(const mtmd_audio_cache & cache, const float * in, int N, float * out) {
    const int n_sin_cos_vals = cache.sin_vals.size();
    // the sin/cos tables hold one full period; stride through them instead of calling sin/cos
    const int sin_cos_step = n_sin_cos_vals / N;

    constexpr float sign  = Inverse ? 1.0f : -1.0f;
    const     float scale = Inverse ? (1.0f / N) : 1.0f;

    for (int k = 0; k < N; k++) {
        float re = 0;
        float im = 0;

        for (int n = 0; n < N; n++) {
            int idx = (k * n * sin_cos_step) % n_sin_cos_vals; // t = 2*M_PI*k*n/N
            float cos_val = cache.cos_vals[idx];
            float sin_val = cache.sin_vals[idx];

            if constexpr (RealInput) {
                // Real input: in_im = 0, simplifies to:
                //   re += in_re * cos_val
                //   im += sign * in_re * sin_val
                float in_re = in[n];
                re += in_re * cos_val;
                im += sign * in_re * sin_val;
            } else {
                float in_re = in[n * 2 + 0];
                float in_im = in[n * 2 + 1];
                // (a + bi) * (cos + sign*i*sin) = (a*cos - sign*b*sin) + (sign*a*sin + b*cos)i
                re += in_re * cos_val - sign * in_im * sin_val;
                im += sign * in_re * sin_val + in_im * cos_val;
            }
        }

        out[k * 2 + 0] = re * scale;
        out[k * 2 + 1] = im * scale;
    }
}

// Cooley-Tukey FFT/IFFT unified implementation
// Template parameters:
//   Inverse:   false = FFT with exp(-2πi·k/N), no scaling
//              true  = IFFT with exp(+2πi·k/N), scales by 0.5 at each level
//   RealInput: true  = input is real-valued (stride 1)
//              false = input is complex-valued (interleaved real/imag, stride 2)
// NOTE: `in` and `out` are also used as scratch space for the recursion —
// callers must size them generously (see the fft_in/fft_out allocations below).
template <bool Inverse, bool RealInput>
static void fft_impl(const mtmd_audio_cache & cache, float * in, int N, float * out) {
    const int n_sin_cos_vals = cache.sin_vals.size();

    if (N == 1) {
        out[0] = in[0];
        if constexpr (RealInput) {
            out[1] = 0.0f;
        } else {
            out[1] = in[1];
        }
        return;
    }

    const int half_N = N / 2;
    if (N - half_N * 2 == 1) {
        // Odd N: fall back to DFT
        dft_impl<Inverse, RealInput>(cache, in, N, out);
        return;
    }

    // Split into even and odd halves; the tail of `in` and `out` is used as scratch
    if constexpr (RealInput) {
        // Real input: stride is 1, copy only real values
        float * even = in + N;
        for (int i = 0; i < half_N; ++i) {
            even[i] = in[2 * i];
        }
        float * even_fft = out + 2 * N;
        fft_impl<Inverse, RealInput>(cache, even, half_N, even_fft);

        float * odd = even; // reuse the scratch region
        for (int i = 0; i < half_N; ++i) {
            odd[i] = in[2 * i + 1];
        }
        float * odd_fft = even_fft + N;
        fft_impl<Inverse, RealInput>(cache, odd, half_N, odd_fft);
    } else {
        // Complex input: stride is 2, copy complex pairs
        float * even = in + N * 2;
        for (int i = 0; i < half_N; ++i) {
            even[i * 2 + 0] = in[2 * i * 2 + 0];
            even[i * 2 + 1] = in[2 * i * 2 + 1];
        }
        float * even_fft = out + 2 * N;
        fft_impl<Inverse, RealInput>(cache, even, half_N, even_fft);

        float * odd = even; // reuse the scratch region
        for (int i = 0; i < half_N; ++i) {
            odd[i * 2 + 0] = in[(2 * i + 1) * 2 + 0];
            odd[i * 2 + 1] = in[(2 * i + 1) * 2 + 1];
        }
        float * odd_fft = even_fft + N;
        fft_impl<Inverse, RealInput>(cache, odd, half_N, odd_fft);
    }

    float * even_fft = out + 2 * N;
    float * odd_fft  = even_fft + N;

    const int sin_cos_step = n_sin_cos_vals / N;
    constexpr float sign  = Inverse ? 1.0f : -1.0f;
    constexpr float scale = Inverse ? 0.5f  : 1.0f; // 0.5 per level accumulates to 1/N overall

    // butterfly: combine even/odd halves with the twiddle factors
    for (int k = 0; k < half_N; k++) {
        int idx = k * sin_cos_step; // t = 2*M_PI*k/N
        float re = cache.cos_vals[idx];
        float im = sign * cache.sin_vals[idx];

        float re_odd = odd_fft[2 * k + 0];
        float im_odd = odd_fft[2 * k + 1];

        out[2 * k + 0] = scale * (even_fft[2 * k + 0] + re * re_odd - im * im_odd);
        out[2 * k + 1] = scale * (even_fft[2 * k + 1] + re * im_odd + im * re_odd);

        out[2 * (k + half_N) + 0] = scale * (even_fft[2 * k + 0] - re * re_odd + im * im_odd);
        out[2 * (k + half_N) + 1] = scale * (even_fft[2 * k + 1] - re * im_odd - im * re_odd);
    }
}

// Forward FFT for real input (used by mel spectrogram)
static void fft(const mtmd_audio_cache & cache, float * in, int N, float * out) {
    fft_impl<false, true>(cache, in, N, out);
}

// Inverse FFT for complex input
static void ifft(const mtmd_audio_cache & cache, float * in, int N, float * out) {
    fft_impl<true, false>(cache, in, N, out);
}

// Per-model knobs for log_mel_spectrogram
struct filter_params {
    int32_t n_mel;
    int32_t n_fft_bins;       // 1 + n_fft/2
    int32_t hann_window_size;
    int32_t hop_length;
    int32_t sample_rate;
    bool  center_padding   = false; // reflect-pad n_fft/2 on both sides (torch.stft center=True style)
    float preemph          = 0.f;   // pre-emphasis coefficient; 0 disables
    bool  use_natural_log  = false; // ln(x + eps) instead of log10(max(x, 1e-10))
    bool  norm_per_feature = false; // per-mel-band mean/std normalization afterwards
};

// Compute mel rows for frames i = ith, ith + n_threads, ... (strided work split across threads).
// Writes into out.data[j * out.n_len + i] for every mel band j.
static void log_mel_spectrogram_worker_thread(int ith, const float * hann,
                                              const std::vector<float> & samples,
                                              int n_samples, int frame_size, int frame_step,
                                              int n_threads, const filter_params & params,
                                              const mtmd_audio_cache & cache,
                                              mtmd_audio_mel & out) {
    std::vector<float> fft_in(frame_size * 2, 0.0);    // extra tail = scratch for fft_impl recursion
    std::vector<float> fft_out(frame_size * 2 * 2 * 2);
    int n_fft_bins = params.n_fft_bins;
    int i = ith;

    const auto & filters = cache.filters;

    // make sure n_fft == 1 + (WHISPER_N_FFT / 2), bin_0 to bin_nyquist
    GGML_ASSERT(n_fft_bins == 1 + (frame_size / 2));
    GGML_ASSERT(cache.sin_vals.size() == cache.cos_vals.size());

    // calculate FFT only when fft_in are not all zero
    for (; i < std::min(n_samples / frame_step + 1, out.n_len); i += n_threads) {
        const int offset = i * frame_step;

        // apply Hann window (~10% faster)
        for (int j = 0; j < std::min(frame_size, n_samples - offset); j++) {
            fft_in[j] = hann[j] * samples[offset + j];
        }

        // fill the rest with zeros
        if (n_samples - offset < frame_size) {
            std::fill(fft_in.begin() + (n_samples - offset), fft_in.end(), 0.0);
        }

        // FFT
        fft(cache, fft_in.data(), frame_size, fft_out.data());

        // Calculate modulus^2 of complex numbers
        // Use pow(fft_out[2 * j + 0], 2) + pow(fft_out[2 * j + 1], 2) causes inference quality problem? Interesting.
        for (int j = 0; j < n_fft_bins; j++) {
            fft_out[j] = (fft_out[2 * j + 0] * fft_out[2 * j + 0] + fft_out[2 * j + 1] * fft_out[2 * j + 1]);
        }

        // mel spectrogram
        for (int j = 0; j < out.n_mel; j++) {
            double sum = 0.0;
            // unroll loop (suggested by GH user @lunixbochs)
            int k = 0;
            for (k = 0; k < n_fft_bins - 3; k += 4) {
                size_t idx = size_t(j) * size_t(n_fft_bins) + size_t(k);
                sum += fft_out[k + 0] * filters.data[idx + 0] +
                       fft_out[k + 1] * filters.data[idx + 1] +
                       fft_out[k + 2] * filters.data[idx + 2] +
                       fft_out[k + 3] * filters.data[idx + 3];
            }
            // handle n_fft remainder
            for (; k < n_fft_bins; k++) {
                sum += fft_out[k] * filters.data[j * n_fft_bins + k];
            }
            // 5.960464477539063e-08 == 2^-24, a small epsilon to avoid log(0)
            sum = params.use_natural_log
                ? log(sum + 5.960464477539063e-08)
                : log10(std::max(sum, 1e-10));
            out.data[j * out.n_len + i] = sum;
        }
    }

    // Otherwise fft_out are all zero
    double sum = params.use_natural_log ? log(1e-10) : log10(1e-10);
    for (; i < out.n_len; i += n_threads) {
        for (int j = 0; j < out.n_mel; j++) {
            out.data[j * out.n_len + i] = sum;
        }
    }
}

// ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L110-L157
// Computes a (n_mel x n_len) log-mel spectrogram of `samples` into `out`.
// Returns false on invalid input (audio too short for the requested padding, size overflow).
static bool log_mel_spectrogram(
        const float * samples,
        const int     n_samples_in,
        const int     n_threads,
        const filter_params & params,
        const mtmd_audio_cache & cache,
        mtmd_audio_mel & out) {
    //const int64_t t_start_us = ggml_time_us();

    out.n_len_org = n_samples_in;
    int n_samples = n_samples_in;

    // Hann window
    const float * hann = cache.hann_window.data();

    const int frame_size = (params.n_fft_bins - 1) * 2;
    const int frame_step = params.hop_length;

    // Padding
    std::vector<float> samples_padded;
    if (params.center_padding) {
        // zero-pad n_fft/2 samples on each side (centered frames)
        const auto pad_amount = frame_size / 2;
        samples_padded = std::vector<float>(n_samples + 2 * pad_amount, 0);
        std::copy(samples, samples + n_samples, samples_padded.data() + pad_amount);
        samples   = samples_padded.data();
        n_samples = samples_padded.size();
    } else {
        // existing padding logic
        int64_t stage_1_pad = params.sample_rate * 30;
        int64_t stage_2_pad = frame_size / 2;
        samples_padded.resize(n_samples + stage_1_pad + stage_2_pad * 2);
        std::copy(samples, samples + n_samples, samples_padded.begin() + stage_2_pad);
        // pad 30 seconds of zeros at the end of audio (480,000 samples) + reflective pad 200 samples at the end of audio
        std::fill(samples_padded.begin() + n_samples + stage_2_pad,
                  samples_padded.begin() + n_samples + stage_1_pad + 2 * stage_2_pad, 0);
        // reflective pad 200 samples at the beginning of audio
        if (n_samples < stage_2_pad + 1) {
            // TODO: Handle short audio differently or return error
            return false;
        }
        std::reverse_copy(samples + 1, samples + 1 + stage_2_pad, samples_padded.begin());
    }

    // preemphasis: y[i] = x[i] - 0.97 * x[i-1], applied in-place over the unpadded span
    if (params.preemph) {
        const int pad_amount = frame_size / 2;
        const float preemph = 0.97f;
        float prev = samples_padded[pad_amount];
        for (int i = pad_amount + 1; i + pad_amount < n_samples; ++i) {
            float cur = samples_padded[i];
            samples_padded[i] = cur - preemph * prev;
            prev = cur;
        }
    }

    // pad hann window if it's smaller than frame_size
    // TODO: probably unnecessary here? (or better doing it in g_cache?)
    std::vector<float> hann_window_padded;
    if (params.hann_window_size < frame_size) {
        hann_window_padded.resize(frame_size);
        const int padding = (frame_size - params.hann_window_size) / 2;
        std::copy(hann, hann + params.hann_window_size, &hann_window_padded[padding]);
        hann = hann_window_padded.data();
    }

    out.n_mel = params.n_mel;
    out.n_len = (n_samples - frame_size) / frame_step + 1;

    // TODO: handle these checks better
    if (out.n_mel > 0 && (unsigned long) out.n_len > SIZE_MAX / out.n_mel) {
        LOG_ERR("%s: size overflow\n", __func__);
        return false;
    }
    if (n_samples < frame_size) {
        LOG_ERR("%s: not enough samples after padding\n", __func__);
        return false;
    }
    out.data.resize(out.n_mel * out.n_len);

    {
        std::vector<std::thread> workers(n_threads - 1);
        for (int iw = 0; iw < n_threads - 1; ++iw) {
            workers[iw] = std::thread(
                    log_mel_spectrogram_worker_thread, iw + 1, hann, std::cref(samples_padded),
                    n_samples, frame_size, frame_step, n_threads,
                    std::cref(params), std::cref(cache), std::ref(out));
        }
        // main thread
        log_mel_spectrogram_worker_thread(0, hann, samples_padded, n_samples,
                                          frame_size, frame_step, n_threads, params, cache, out);
        for (int iw = 0; iw < n_threads - 1; ++iw) {
            workers[iw].join();
        }
    }

    // number of frames covering real (unpadded) input
    const int effective_n_len = n_samples_in / frame_step;
    if (params.norm_per_feature) {
        // per-mel-band normalization: (x - mean) / std over the real frames, zero-pad the rest
        for (int i = 0; i < out.n_mel; i++) {
            double mean = 0;
            for (int j = 0; j < effective_n_len; ++j) {
                mean += out.data[i * out.n_len + j];
            }
            mean /= effective_n_len;
            double var = 0.0;
            for (int j = 0; j < effective_n_len; ++j) {
                const double value = out.data[i * out.n_len + j] - mean;
                var += value * value;
            }
            var /= effective_n_len - 1; // unbiased
            const double mstd = std::sqrt(var + 1e-5);
            for (int j = 0; j < effective_n_len; ++j) {
                auto & value = out.data[i * out.n_len + j];
                value = (value - mean) / mstd;
            }
            // pad the rest with zeros
            for (int j = effective_n_len; j < out.n_len; ++j) {
                out.data[i * out.n_len + j] = 0.0;
            }
        }
    } else {
        // clamping and normalization
        double mmax = -1e20;
        for (int i = 0; i < out.n_mel * out.n_len; i++) {
            if (out.data[i] > mmax) {
                mmax = out.data[i];
            }
        }
        mmax -= 8.0; // clamp to within 80 dB of the peak
        for (int i = 0; i < out.n_mel * out.n_len; i++) {
            if (out.data[i] < mmax) {
                out.data[i] = mmax;
            }
            out.data[i] = (out.data[i] + 4.0) / 4.0;
        }
    }

    // Dump log_mel_spectrogram
    if (DEBUG) {
        std::ofstream outFile("log_mel_spectrogram.json");
        outFile << "[";
        for (uint64_t i = 0; i < out.data.size() - 1; i++) {
            outFile << out.data[i] << ", ";
        }
        outFile << out.data[out.data.size() - 1] << "]";
        outFile.close();
    }
    return true;
}

//
// mtmd_audio_preprocessor_whisper
//

void mtmd_audio_preprocessor_whisper::initialize() {
    cache.fill_sin_cos_table(hparams.audio_n_fft);
    cache.fill_hann_window(hparams.audio_window_len, true);
    cache.fill_mel_filterbank_matrix(hparams.n_mel_bins, hparams.audio_n_fft, hparams.audio_sample_rate);
}

// Whisper-style preprocessing: log10 mel spectrogram, split into 3000-frame chunks.
bool mtmd_audio_preprocessor_whisper::preprocess(const float * samples, size_t n_samples,
                                                 std::vector<mtmd_audio_mel> & output) {
    if (n_samples == 0) {
        // empty audio
        return false;
    }

    std::vector<float> smpl;
    // if input is too short, pad with zeros
    // this is to avoid potential issues with stage1/2 padding in log_mel_spectrogram
    // TODO: maybe handle this better
    size_t min_samples = (size_t) hparams.audio_sample_rate * (hparams.audio_chunk_len + 1); // +1 second margin
    if (n_samples < min_samples) {
        smpl.resize(min_samples, 0.0f);
        std::memcpy(smpl.data(), samples, n_samples * sizeof(float));
        samples   = smpl.data();
        n_samples = smpl.size();
    }

    filter_params params;
    params.n_mel            = hparams.n_mel_bins;
    params.n_fft_bins       = 1 + (hparams.audio_n_fft / 2);
    params.hann_window_size = hparams.audio_window_len;
    params.hop_length       = hparams.audio_hop_len;
    params.sample_rate      = hparams.audio_sample_rate;
    params.center_padding   = false;
    params.preemph          = 0.0f; // disabled
    params.use_natural_log  = false;
    params.norm_per_feature = false;

    // make sure the cache is initialized
    GGML_ASSERT(!cache.sin_vals.empty());
    GGML_ASSERT(!cache.cos_vals.empty());
    GGML_ASSERT(!cache.filters.data.empty());

    mtmd_audio_mel out_full;
    bool ok = log_mel_spectrogram(samples, n_samples,
                                  4, // n_threads
                                  params, cache, out_full);
    if (!ok) {
        return false;
    }

    // because the cgraph in clip.cpp only accepts 3000 frames each, we need to split the mel
    // we always expect the mel to have 3000 silent frames at the end
    if (DEBUG) {
        printf("output: n_mel = %d, n_len = %d\n", out_full.n_mel, out_full.n_len);
    }
    const size_t frames_per_chunk = 3000;
    GGML_ASSERT((size_t) out_full.n_len > frames_per_chunk);
    for (size_t off = 0; off < (size_t) out_full.n_len; off += frames_per_chunk) {
        int n_len = std::min(frames_per_chunk, (size_t) out_full.n_len - off);
        if ((size_t) n_len < frames_per_chunk) {
            break; // last uncomplete chunk will always be a padded chunk, safe to ignore
        }
        mtmd_audio_mel out_chunk;
        out_chunk.n_len     = n_len;
        out_chunk.n_mel     = out_full.n_mel;
        out_chunk.n_len_org = out_full.n_mel; // unused — NOTE(review): looks like a typo for n_len; confirm before relying on it
        out_chunk.data.reserve(out_chunk.n_mel * out_chunk.n_len);
        for (int i = 0; i < out_full.n_mel; i++) {
            auto src = out_full.data.begin() + i * out_full.n_len + off;
            out_chunk.data.insert(out_chunk.data.end(), src, src + frames_per_chunk);
        }
        output.push_back(std::move(out_chunk));
    }

    return true;
}

//
// mtmd_audio_preprocessor_conformer
//

void mtmd_audio_preprocessor_conformer::initialize() {
    cache.fill_sin_cos_table(hparams.audio_n_fft);
    cache.fill_hann_window(hparams.audio_window_len, true);
    cache.fill_mel_filterbank_matrix(hparams.n_mel_bins, hparams.audio_n_fft, hparams.audio_sample_rate);
}

// Conformer-style preprocessing: centered frames, pre-emphasis, natural log,
// per-feature normalization; emits a single (unchunked) mel.
bool mtmd_audio_preprocessor_conformer::preprocess(const float * samples, size_t n_samples,
                                                   std::vector<mtmd_audio_mel> & output) {
    // empty audio
    if (n_samples == 0) {
        return false;
    }

    filter_params params;
    params.n_mel            = hparams.n_mel_bins;
    params.n_fft_bins       = 1 + (hparams.audio_n_fft / 2);
    params.hann_window_size = hparams.audio_window_len;
    params.hop_length       = hparams.audio_hop_len;
    params.sample_rate      = hparams.audio_sample_rate;
    params.center_padding   = true;
    params.preemph          = 0.97f;
    params.use_natural_log  = true;
    params.norm_per_feature = true;

    // make sure the cache is initialized
    GGML_ASSERT(!cache.sin_vals.empty());
    GGML_ASSERT(!cache.cos_vals.empty());
    GGML_ASSERT(!cache.filters.data.empty());

    mtmd_audio_mel out_full;
    bool ok = log_mel_spectrogram(samples, n_samples,
                                  4, // n_threads
                                  params, cache, out_full);
    if (!ok) {
        return false;
    }

    output.push_back(std::move(out_full));
    return true;
}

//
// mtmd_audio_streaming_istft implementation
//

mtmd_audio_streaming_istft::mtmd_audio_streaming_istft(int n_fft, int hop_length)
        : n_fft(n_fft),
          hop_length(hop_length),
          n_fft_bins(n_fft / 2 + 1),
          overlap_buffer(n_fft, 0.0f),
          window_sum_buffer(n_fft, 0.0f),
          padding_to_remove((n_fft - hop_length) / 2),
          ifft_in(n_fft * 2 * 4, 0.0f), // extra space for recursive IFFT
          ifft_out(n_fft * 2 * 4, 0.0f) {
    cache.fill_sin_cos_table(n_fft);
    cache.fill_hann_window(n_fft, true);
}

// Clear overlap-add state so a new stream can be processed.
void mtmd_audio_streaming_istft::reset() {
    std::fill(overlap_buffer.begin(), overlap_buffer.end(), 0.0f);
    std::fill(window_sum_buffer.begin(), window_sum_buffer.end(), 0.0f);
    padding_to_remove = (n_fft - hop_length) / 2;
}

// Overlap-add one spectral frame (n_fft_bins interleaved complex values) and
// return up to hop_length reconstructed time-domain samples.
std::vector<float> mtmd_audio_streaming_istft::process_frame(const float * frame_spectrum) {
    std::vector<float> output(hop_length);

    // copy frequencies
    for (int j = 0; j < n_fft_bins; j++) {
        ifft_in[j * 2 + 0] = frame_spectrum[j * 2 + 0];
        ifft_in[j * 2 + 1] = frame_spectrum[j * 2 + 1];
    }
    // mirror negative frequencies (real signal => Hermitian-symmetric spectrum)
    for (int j = 1; j < n_fft_bins - 1; j++) {
        int mirror_idx = n_fft - j;
        ifft_in[mirror_idx * 2 + 0] =  ifft_in[j * 2 + 0];
        ifft_in[mirror_idx * 2 + 1] = -ifft_in[j * 2 + 1]; // conjugate
    }

    ifft(cache, ifft_in.data(), n_fft, ifft_out.data());

    // update window sum and overlap buffer
    for (int j = 0; j < n_fft; j++) {
        window_sum_buffer[j] += cache.hann_window[j] * cache.hann_window[j];
        overlap_buffer[j]    += ifft_out[j * 2] * cache.hann_window[j];
    }

    // extract hop_length samples with normalization
    for (int i = 0; i < hop_length; i++) {
        if (window_sum_buffer[i] > 1e-8f) {
            output[i] = overlap_buffer[i] / window_sum_buffer[i];
        } else {
            output[i] = overlap_buffer[i];
        }
    }

    // shift buffers left by hop_length
    std::copy(overlap_buffer.begin() + hop_length, overlap_buffer.end(), overlap_buffer.begin());
    std::fill(overlap_buffer.end() - hop_length, overlap_buffer.end(), 0.0f);
    std::copy(window_sum_buffer.begin() + hop_length, window_sum_buffer.end(), window_sum_buffer.begin());
    std::fill(window_sum_buffer.end() - hop_length, window_sum_buffer.end(), 0.0f);

    // Remove padding if needed (drops the (n_fft - hop_length)/2 leading samples of the stream)
    int to_remove = std::min(padding_to_remove, (int) output.size());
    padding_to_remove -= to_remove;
    output.erase(output.begin(), output.begin() + to_remove);

    return output;
}

// Drain the remaining (n_fft - hop_length) tail samples left in the overlap buffer.
std::vector<float> mtmd_audio_streaming_istft::flush() {
    std::vector<float> output;

    // Extract remaining samples from overlap buffer
    // Continue until we've extracted all meaningful samples
    int remaining = n_fft - hop_length;
    while (remaining > 0) {
        int chunk_size = std::min(remaining, hop_length);
        for (int i = 0; i < chunk_size; i++) {
            float sample;
            if (window_sum_buffer[i] > 1e-8f) {
                sample = overlap_buffer[i] / window_sum_buffer[i];
            } else {
                sample = overlap_buffer[i];
            }
            output.push_back(sample);
        }
        // Shift buffers
        std::copy(overlap_buffer.begin() + chunk_size, overlap_buffer.end(), overlap_buffer.begin());
        std::fill(overlap_buffer.end() - chunk_size, overlap_buffer.end(), 0.0f);
        std::copy(window_sum_buffer.begin() + chunk_size, window_sum_buffer.end(), window_sum_buffer.begin());
        std::fill(window_sum_buffer.end() - chunk_size, window_sum_buffer.end(), 0.0f);
        remaining -= chunk_size;
    }
    return output;
}