From 27104fc7f32f7da8e458676877d76ec4e9dc8805 Mon Sep 17 00:00:00 2001
From: hung
Date: Fri, 13 Feb 2026 08:16:39 -0500
Subject: [PATCH] ggml: add GGML_OP_ISTFT with CPU and Vulkan backends

Adds an inverse Short-Time Fourier Transform as a native ggml op,
replacing the hand-rolled iSTFT in tools/tts/tts.cpp. The CPU backend
uses a direct inverse DFT (no external dependencies); the Vulkan backend
uses a parallel compute shader. Both exploit Hermitian symmetry to
produce real-valued output.

Co-Authored-By: Claude Opus 4.6
---
 ggml/include/ggml.h                           |  12 ++
 ggml/src/ggml-cpu/ggml-cpu.c                  |  13 ++
 ggml/src/ggml-cpu/ops.cpp                     | 140 +++++++++++++
 ggml/src/ggml-cpu/ops.h                       |   1 +
 ggml/src/ggml-vulkan/ggml-vulkan.cpp          |  60 ++++++
 .../src/ggml-vulkan/vulkan-shaders/istft.comp |  95 +++++++++
 .../vulkan-shaders/vulkan-shaders-gen.cpp     |   2 +
 ggml/src/ggml.c                               |  34 ++-
 tests/test-backend-ops.cpp                    |  43 ++++
 tools/tts/tts.cpp                             | 197 +++++------------
 10 files changed, 451 insertions(+), 146 deletions(-)
 create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/istft.comp

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index f759e2d588..fc2efc9049 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -529,6 +529,7 @@ extern "C" {
         GGML_OP_CONV_3D,
         GGML_OP_CONV_2D_DW,
         GGML_OP_CONV_TRANSPOSE_2D,
+        GGML_OP_ISTFT,
         GGML_OP_POOL_1D,
         GGML_OP_POOL_2D,
         GGML_OP_POOL_2D_BACK,
@@ -2110,6 +2111,17 @@ extern "C" {
             int                   n_batch,
             int                   n_channels_out);
 
+    // inverse short-time Fourier transform
+    // spectrogram: [2, n_fft/2+1, n_frames] complex (re/im interleaved in ne[0], frequency bins in ne[1], frames in ne[2])
+    // output:      [n_samples] real PCM audio
+    // includes irfft + Hann window + overlap-add
+    GGML_API struct ggml_tensor * ggml_istft(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * spectrogram,
+            int                   n_fft,
+            int                   hop_length,
+            int                   win_length);
+
     enum ggml_op_pool {
         GGML_OP_POOL_MAX,
         GGML_OP_POOL_AVG,
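For reference, a minimal caller-side sketch of the new API (not part of the patch; the helper name build_istft_node is hypothetical, the sizes follow the TTS defaults used later in this patch, and the context is assumed to be set up elsewhere):

    // sketch: build the ISTFT node and check the resulting output length
    static struct ggml_tensor * build_istft_node(struct ggml_context * ctx) {
        const int n_fft = 1280, hop_length = 320, win_length = 1280, n_frames = 87;
        const int n_freq = n_fft/2 + 1; // 641 frequency bins

        struct ggml_tensor * spec = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 2, n_freq, n_frames);
        struct ggml_tensor * pcm  = ggml_istft(ctx, spec, n_fft, hop_length, win_length);

        // expected length: (n_frames - 1)*hop + win - 2*((win - hop)/2)
        //                = 86*320 + 1280 - 960 = 27840 samples
        GGML_ASSERT(pcm->ne[0] == 27840);
        return pcm;
    }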
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index b003fe13fd..5650cb7313 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -1906,6 +1906,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_conv_transpose_2d(params, tensor);
             } break;
+        case GGML_OP_ISTFT:
+            {
+                ggml_compute_forward_istft(params, tensor);
+            } break;
         case GGML_OP_POOL_1D:
             {
                 ggml_compute_forward_pool_1d(params, tensor);
@@ -2318,6 +2322,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
         case GGML_OP_CONV_2D_DW:
         case GGML_OP_CONV_TRANSPOSE_1D:
         case GGML_OP_CONV_TRANSPOSE_2D:
+        case GGML_OP_ISTFT:
             {
                 n_tasks = n_threads;
             } break;
@@ -2863,6 +2868,14 @@ struct ggml_cplan ggml_graph_plan(
                     cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03;
                     cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12;
                 } break;
+            case GGML_OP_ISTFT:
+                {
+                    // Scratch for irfft results + hann² envelope:
+                    // two buffers of n_frames * n_fft floats each
+                    const int32_t n_fft    = node->op_params[0];
+                    const int64_t n_frames = node->src[0]->ne[2];
+                    cur = 2 * sizeof(float) * n_frames * n_fft;
+                } break;
             case GGML_OP_TOP_K:
                 {
                     cur += sizeof(int32_t)*node->src[0]->ne[0]*n_tasks;
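As a sanity check on the planner request above, the scratch stays small for the TTS-sized workload used elsewhere in this patch (sizes assumed for illustration):

    // n_frames = 87 frames, n_fft = 1280
    // cur = 2 * sizeof(float) * n_frames * n_fft
    //     = 2 * 4 * 87 * 1280
    //     = 890,880 bytes (about 0.85 MiB), split evenly between all_res and all_hann2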
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index ed45350207..dd062d1a14 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -10,6 +10,7 @@
 #include
 #include
 #include
+#include <vector>
 
 // ggml_compute_forward_dup
 
@@ -7109,6 +7110,145 @@ void ggml_compute_forward_conv_2d_dw(
     }
 }
 
+// ggml_compute_forward_istft
+//
+// iSTFT = inverse Short-Time Fourier Transform. Converts a complex spectrogram
+// back to real audio. Three steps per frame:
+//   1. irfft (complex→real via direct inverse DFT, O(n²) per frame)
+//   2. Hann window multiplication
+//   3. Overlap-add (fold): frames overlap by (win_length - hop_length) samples
+//
+// Threading: each thread irfft's its own frames into a shared scratch buffer
+// (n_frames × n_fft). Thread 0 then does the sequential fold + normalize.
+
+// Direct inverse real FFT using Hermitian symmetry.
+// For a real-valued signal of length n_fft with n_freq = n_fft/2+1 complex bins:
+//   out[j] = (1/n_freq) * (X[0].re
+//          + 2 * sum_{k=1}^{n_freq-2} (X[k].re*cos(2πkj/n_fft) - X[k].im*sin(2πkj/n_fft))
+//          + X[n_freq-1].re*cos(2π(n_freq-1)j/n_fft) - X[n_freq-1].im*sin(2π(n_freq-1)j/n_fft))
+// This matches the Vulkan istft.comp shader algorithm.
+static void ggml_irfft_direct(
+        const float * cplx_in,  // interleaved [re,im] × n_freq
+              float * out,      // real output, length n_fft
+                int   n_fft,
+                int   n_freq) {
+
+    const float inv_n_freq    = 1.0f / (float)n_freq;
+    const float two_pi_over_n = 6.28318530718f / (float)n_fft;
+
+    for (int j = 0; j < n_fft; j++) {
+        const float angle_base = two_pi_over_n * (float)j;
+        float val = 0.0f;
+
+        // DC component (k=0): cos(0)=1, sin(0)=0
+        val += cplx_in[0];
+
+        // Middle frequencies (k=1..n_freq-2): doubled due to Hermitian symmetry
+        for (int k = 1; k < n_freq - 1; k++) {
+            float re    = cplx_in[k * 2 + 0];
+            float im    = cplx_in[k * 2 + 1];
+            float angle = angle_base * (float)k;
+            val += 2.0f * (re * cosf(angle) - im * sinf(angle));
+        }
+
+        // Nyquist component (k=n_freq-1): appears once
+        {
+            int   k     = n_freq - 1;
+            float re    = cplx_in[k * 2 + 0];
+            float im    = cplx_in[k * 2 + 1];
+            float angle = angle_base * (float)k;
+            val += re * cosf(angle) - im * sinf(angle);
+        }
+
+        out[j] = val * inv_n_freq;
+    }
+}
+
+void ggml_compute_forward_istft(
+        const ggml_compute_params * params,
+              ggml_tensor * dst) {
+
+    const ggml_tensor * src = dst->src[0];
+
+    const int32_t n_fft      = dst->op_params[0];
+    const int32_t hop_length = dst->op_params[1];
+    const int32_t win_length = dst->op_params[2];
+    const int32_t n_frames   = (int32_t)src->ne[2];
+    const int32_t n_freq     = n_fft / 2 + 1;
+    const int32_t n_pad      = (win_length - hop_length) / 2;
+    const int64_t n_out_full = (int64_t)(n_frames - 1) * hop_length + win_length;
+    const int64_t n_out      = n_out_full - 2 * n_pad;
+
+    GGML_ASSERT(dst->ne[0] == n_out);
+    GGML_ASSERT(src->ne[0] == 2);       // interleaved re/im
+    GGML_ASSERT(src->ne[1] == n_freq);  // frequency bins
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    // Scratch space from the graph planner (requested in ggml-cpu.c ggml_graph_plan):
+    // Layout: [all_res: n_frames * n_fft floats][all_hann2: n_frames * n_fft floats]
+    const int64_t scratch_per_buf = (int64_t)n_frames * n_fft;
+    float * all_res   = (float *)params->wdata;
+    float * all_hann2 = all_res + scratch_per_buf;
+
+    // Thread 0 zeroes the scratch buffers
+    if (ith == 0) {
+        memset(all_res,   0, scratch_per_buf * sizeof(float));
+        memset(all_hann2, 0, scratch_per_buf * sizeof(float));
+    }
+
+    ggml_barrier(params->threadpool);
+
+    // Hann window (each thread computes its own copy: cheap, avoids shared state)
+    std::vector<float> hann(win_length);
+    for (int i = 0; i < win_length; i++) {
+        hann[i] = 0.5f * (1.0f - cosf(2.0f * M_PI * i / win_length));
+    }
+
+    const float * src_data = (const float *)src->data;
+          float * dst_data = (float *)dst->data;
+
+    // Phase 1: each thread irfft's + windows its assigned frames.
+    // Each thread writes to non-overlapping frame indices, so no data race.
+    std::vector<float> frame_real(n_fft);
+
+    for (int l = ith; l < n_frames; l += nth) {
+        const float * frame_cplx = src_data + (int64_t)l * n_freq * 2;
+
+        ggml_irfft_direct(frame_cplx, frame_real.data(), n_fft, n_freq);
+
+        for (int j = 0; j < n_fft; j++) {
+            all_res  [(int64_t)l * n_fft + j] = frame_real[j] * hann[j];
+            all_hann2[(int64_t)l * n_fft + j] = hann[j] * hann[j];
+        }
+    }
+
+    ggml_barrier(params->threadpool);
+
+    // Phase 2: thread 0 does the overlap-add (fold) + normalize, sequentially
+    if (ith == 0) {
+        std::vector<float> audio(n_out_full, 0.0f);
+        std::vector<float> env  (n_out_full, 0.0f);
+
+        for (int l = 0; l < n_frames; l++) {
+            int64_t offset = (int64_t)l * hop_length;
+            for (int j = 0; j < win_length; j++) {
+                int64_t idx = offset + j;
+                if (idx < n_out_full) {
+                    audio[idx] += all_res  [(int64_t)l * n_fft + j];
+                    env  [idx] += all_hann2[(int64_t)l * n_fft + j];
+                }
+            }
+        }
+
+        // normalize by windowed envelope, trim padding
+        for (int64_t i = 0; i < n_out; i++) {
+            dst_data[i] = (env[i + n_pad] > 0.0f) ? audio[i + n_pad] / env[i + n_pad] : 0.0f;
+        }
+    }
+}
+
 // ggml_compute_forward_pool_1d_ksp
 
 static void ggml_compute_forward_pool_1d_ksp(
         const ggml_compute_params * params,
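To put the O(n²) in the header comment into numbers, a rough cost estimate for the TTS shapes used elsewhere in this patch (assumed sizes, not measured):

    // per frame: n_fft * n_freq = 1280 * 641 ≈ 0.82M sin/cos pairs
    // per call:  87 frames      ≈ 71M sin/cos pairs, which is why phase 1 is threaded,
    //            while the cheap O(n_out) fold in phase 2 stays on thread 0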
diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h
index 0fdfee7976..bc66c70a7b 100644
--- a/ggml/src/ggml-cpu/ops.h
+++ b/ggml/src/ggml-cpu/ops.h
@@ -71,6 +71,7 @@ void ggml_compute_forward_conv_2d(const struct ggml_compute_params * params, str
 void ggml_compute_forward_conv_3d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_2d_dw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_istft(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_pool_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_pool_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 72097ffd0f..864b51ef46 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -804,6 +804,7 @@ struct vk_device_struct {
     vk_pipeline pipeline_im2col_3d_f32, pipeline_im2col_3d_f32_f16;
     vk_pipeline pipeline_timestep_embedding_f32;
     vk_pipeline pipeline_conv_transpose_1d_f32;
+    vk_pipeline pipeline_istft_f32;
     vk_pipeline pipeline_pool2d_f32;
     vk_pipeline pipeline_rwkv_wkv6_f32;
     vk_pipeline pipeline_rwkv_wkv7_f32;
@@ -1407,6 +1408,16 @@ struct vk_op_conv_transpose_1d_push_constants {
     int32_t s0;
 };
 
+struct vk_op_istft_push_constants {
+    uint32_t n_fft;
+    uint32_t n_freq;
+    uint32_t n_frames;
+    uint32_t hop_length;
+    uint32_t win_length;
+    uint32_t n_pad;
+    uint32_t n_out;
+};
+
 struct vk_op_pool2d_push_constants {
     uint32_t IW; uint32_t IH;
     uint32_t OW; uint32_t OH;
@@ -4402,6 +4413,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
 
     ggml_vk_create_pipeline(device, device->pipeline_conv_transpose_1d_f32, "conv_transpose_1d_f32", conv_transpose_1d_f32_len, conv_transpose_1d_f32_data, "main", 3, sizeof(vk_op_conv_transpose_1d_push_constants), {1, 1, 1}, {}, 1);
 
+    ggml_vk_create_pipeline(device, device->pipeline_istft_f32, "istft_f32", istft_f32_len, istft_f32_data, "main", 2, sizeof(vk_op_istft_push_constants), {256, 1, 1}, {}, 1);
+
     ggml_vk_create_pipeline(device, device->pipeline_pool2d_f32, "pool2d_f32", pool2d_f32_len, pool2d_f32_data, "main", 2, sizeof(vk_op_pool2d_push_constants), {512, 1, 1}, {}, 1);
 
     ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv6_f32, "rwkv_wkv6_f32", rwkv_wkv6_f32_len, rwkv_wkv6_f32_data, "main", 7, sizeof(vk_op_rwkv_wkv6_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
@@ -9230,6 +9243,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
             return ctx->device->pipeline_conv_transpose_1d_f32;
         }
         return nullptr;
+    case GGML_OP_ISTFT:
+        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_istft_f32;
+        }
+        return nullptr;
     case GGML_OP_POOL_2D:
         if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
             return ctx->device->pipeline_pool2d_f32;
@@ -9612,6 +9630,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
         {
             elements = {uint32_t(src0->ne[1]), 1, 1}; // parallelize in {Cout, 1, 1}
         } break;
+    case GGML_OP_ISTFT:
+        {
+            elements = {uint32_t(dst->ne[0]), 1, 1}; // one thread per output sample
+        } break;
     case GGML_OP_POOL_2D:
         {
             const uint32_t N = dst->ne[3];
@@ -11303,6 +11325,33 @@ static void ggml_vk_conv_transpose_1d(ggml_backend_vk_context * ctx, vk_context&
     ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_TRANSPOSE_1D, std::move(p));
 }
 
+static void ggml_vk_istft(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+    // src0: [2, n_freq, n_frames] - complex spectrogram (interleaved re/im)
+    // dst:  [n_out]               - real PCM audio
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const int32_t n_fft      = dst->op_params[0];
+    const int32_t hop_length = dst->op_params[1];
+    const int32_t win_length = dst->op_params[2];
+    const int32_t n_freq     = n_fft / 2 + 1;
+    const int32_t n_frames   = (int32_t)src0->ne[2];
+    const int32_t n_pad      = (win_length - hop_length) / 2;
+    const int32_t n_out      = (int32_t)dst->ne[0];
+
+    vk_op_istft_push_constants p{};
+    p.n_fft      = static_cast<uint32_t>(n_fft);
+    p.n_freq     = static_cast<uint32_t>(n_freq);
+    p.n_frames   = static_cast<uint32_t>(n_frames);
+    p.hop_length = static_cast<uint32_t>(hop_length);
+    p.win_length = static_cast<uint32_t>(win_length);
+    p.n_pad      = static_cast<uint32_t>(n_pad);
+    p.n_out      = static_cast<uint32_t>(n_out);
+
+    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_ISTFT, std::move(p));
+}
+
 static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
     uint32_t op = static_cast<uint32_t>(dst->op_params[0]);
     const int32_t k1 = dst->op_params[1];
@@ -12754,6 +12803,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
     case GGML_OP_CONV_TRANSPOSE_1D:
         ggml_vk_conv_transpose_1d(ctx, compute_ctx, src0, src1, node);
 
+        break;
+    case GGML_OP_ISTFT:
+        ggml_vk_istft(ctx, compute_ctx, src0, node);
+
         break;
     case GGML_OP_POOL_2D:
         ggml_vk_pool_2d(ctx, compute_ctx, src0, node);
@@ -15076,6 +15129,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
             return op->src[0]->type == GGML_TYPE_F32;
         case GGML_OP_CONV_TRANSPOSE_1D:
             return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
+        case GGML_OP_ISTFT:
+            return op->src[0]->type == GGML_TYPE_F32;
         case GGML_OP_CONV_2D:
         case GGML_OP_CONV_TRANSPOSE_2D:
             {
@@ -15836,6 +15891,11 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
     } else if (tensor->op == GGML_OP_CONV_TRANSPOSE_2D) {
         const int32_t s = tensor->op_params[0];
         tensor_clone = ggml_conv_transpose_2d_p0(ggml_ctx, src_clone[0], src_clone[1], s);
+    } else if (tensor->op == GGML_OP_ISTFT) {
+        const int32_t n_fft      = tensor->op_params[0];
+        const int32_t hop_length = tensor->op_params[1];
+        const int32_t win_length = tensor->op_params[2];
+        tensor_clone = ggml_istft(ggml_ctx, src_clone[0], n_fft, hop_length, win_length);
     } else if (tensor->op == GGML_OP_LEAKY_RELU) {
         const float * op_params = (const float *)tensor->op_params;
         tensor_clone = ggml_leaky_relu(ggml_ctx, src_clone[0], op_params[0], false);
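For a sense of the launch size this dispatch rule produces (sample count assumed, matching the workload described in the shader below):

    // elements = {n_out, 1, 1} with local_size_x = 256 in istft.comp
    // n_out ≈ 80,000 samples  ->  ceil(80000 / 256) = 313 workgroups of 256 threads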
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/istft.comp b/ggml/src/ggml-vulkan/vulkan-shaders/istft.comp
new file mode 100644
index 0000000000..fe90ced459
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/istft.comp
@@ -0,0 +1,95 @@
+#version 450
+
+// iSTFT single-pass compute shader: one thread per output sample
+//
+// Each thread computes one output PCM sample by gathering contributions from
+// all overlapping spectrogram frames. For each frame, it evaluates the
+// inverse DFT at the sample's position within that frame.
+//
+// Complexity per output sample: O(overlap_count * n_freq)
+//   where overlap_count = win_length / hop_length (typically 4)
+//   and   n_freq        = n_fft/2 + 1             (typically 641)
+//
+// This is O(n^2) per frame but fully parallelized across all output samples.
+// For the TTS workload (~80K output samples), the GPU fills easily.
+
+#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
+
+layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
+
+layout(binding = 0) readonly  buffer Src { float data_src[]; }; // complex spectrogram [2, n_freq, n_frames]
+layout(binding = 1) writeonly buffer Dst { float data_dst[]; }; // output audio [n_out]
+
+layout(push_constant) uniform PushConstants {
+    uint32_t n_fft;
+    uint32_t n_freq;      // n_fft/2 + 1
+    uint32_t n_frames;
+    uint32_t hop_length;
+    uint32_t win_length;
+    uint32_t n_pad;
+    uint32_t n_out;
+} p;
+
+void main() {
+    uint i = gl_GlobalInvocationID.x;
+    if (i >= p.n_out) return;
+
+    // Map output index to pre-trim position
+    uint i_full = i + p.n_pad;
+
+    float audio_sum = 0.0;
+    float env_sum   = 0.0;
+
+    // Find overlapping frames: frame l covers samples [l*hop .. l*hop + win - 1]
+    // This sample is at i_full, so l*hop <= i_full < l*hop + win
+    int l_min = max(0, int(i_full) - int(p.win_length) + 1);
+    l_min = (l_min + int(p.hop_length) - 1) / int(p.hop_length); // ceil division
+    int l_max = min(int(p.n_frames) - 1, int(i_full) / int(p.hop_length));
+
+    float inv_n_freq    = 1.0 / float(p.n_freq);
+    float two_pi_over_n = 6.28318530718 / float(p.n_fft);
+
+    for (int l = l_min; l <= l_max; l++) {
+        uint frame_start = uint(l) * p.hop_length;
+        uint j = i_full - frame_start; // position within this frame's window
+
+        // Hann window at position j
+        float hann = 0.5 * (1.0 - cos(6.28318530718 * float(j) / float(p.win_length)));
+
+        // Evaluate the inverse DFT at position j for this frame's spectrum.
+        // With Hermitian symmetry (real output), only the n_freq stored bins are needed:
+        //   out[j] = (1/n_freq) * (X[0].re
+        //          + 2*sum_{k=1}^{n_freq-2} (X[k].re*cos - X[k].im*sin)
+        //          + X[n_freq-1].re*cos - X[n_freq-1].im*sin)
+        uint src_offset = uint(l) * p.n_freq * 2;
+
+        float val = 0.0;
+        float angle_base = two_pi_over_n * float(j);
+
+        // DC component (k=0)
+        val += data_src[src_offset + 0]; // X[0].real, cos(0)=1
+
+        // Middle frequencies (k=1 to n_freq-2): count twice due to Hermitian symmetry
+        for (uint k = 1; k < p.n_freq - 1; k++) {
+            float re    = data_src[src_offset + k * 2 + 0];
+            float im    = data_src[src_offset + k * 2 + 1];
+            float angle = angle_base * float(k);
+            val += 2.0 * (re * cos(angle) - im * sin(angle));
+        }
+
+        // Nyquist component (k=n_freq-1)
+        {
+            uint k      = p.n_freq - 1;
+            float re    = data_src[src_offset + k * 2 + 0];
+            float im    = data_src[src_offset + k * 2 + 1];
+            float angle = angle_base * float(k);
+            val += re * cos(angle) - im * sin(angle);
+        }
+
+        val *= inv_n_freq;
+
+        audio_sum += val * hann;
+        env_sum   += hann * hann;
+    }
+
+    data_dst[i] = (env_sum > 0.0) ? audio_sum / env_sum : 0.0;
+}
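A quick worked check of the frame-range math in the shader above, using the TTS hop/window sizes (values assumed for illustration):

    // i_full = 1000, hop = 320, win = 1280
    // l_min  = ceil(max(0, 1000 - 1280 + 1) / 320) = ceil(0 / 320) = 0
    // l_max  = min(n_frames - 1, 1000 / 320)       = 3
    // -> frames 0..3 contribute, i.e. win/hop = 4 overlapping frames, as expected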
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
index 42ebc21e2a..069586cc5f 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
@@ -965,6 +965,8 @@ void process_shaders() {
 
     string_to_spv("conv_transpose_1d_f32", "conv_transpose_1d.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
 
+    string_to_spv("istft_f32", "istft.comp", {});
+
     string_to_spv("pool2d_f32", "pool2d.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
 
     string_to_spv("rwkv_wkv6_f32", "wkv6.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 500cb6b72f..89b51e2625 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -1003,6 +1003,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CONV_3D",
     "CONV_2D_DW",
     "CONV_TRANSPOSE_2D",
+    "ISTFT",
    "POOL_1D",
    "POOL_2D",
    "POOL_2D_BACK",
@@ -1047,7 +1048,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "GLU",
 };
 
-static_assert(GGML_OP_COUNT == 95, "GGML_OP_COUNT != 95");
+static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -1112,6 +1113,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "conv_3d(x)",
     "conv_2d_dw(x)",
     "conv_transpose_2d(x)",
+    "istft(x)",
     "pool_1d(x)",
     "pool_2d(x)",
     "pool_2d_back(x)",
@@ -1156,7 +1158,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "glu(x)",
 };
 
-static_assert(GGML_OP_COUNT == 95, "GGML_OP_COUNT != 95");
+static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
@@ -4818,6 +4820,34 @@ struct ggml_tensor * ggml_conv_transpose_2d_p0(
     return result;
 }
 
+// ggml_istft
+
+struct ggml_tensor * ggml_istft(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * spectrogram,
+        int                   n_fft,
+        int                   hop_length,
+        int                   win_length) {
+    // spectrogram: [2, n_fft/2+1, n_frames] - complex interleaved (re/im), freq bins, frames
+    GGML_ASSERT(spectrogram->ne[0] == 2);
+    GGML_ASSERT(spectrogram->ne[1] == n_fft / 2 + 1);
+
+    const int64_t n_frames = spectrogram->ne[2];
+    const int64_t n_pad    = (win_length - hop_length) / 2;
+    const int64_t n_out    = (n_frames - 1) * hop_length + win_length - 2 * n_pad;
+
+    const int64_t ne[4] = { n_out, 1, 1, 1 };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 1, ne);
+
+    int32_t params[] = { n_fft, hop_length, win_length };
+    ggml_set_op_params(result, params, sizeof(params));
+
+    result->op     = GGML_OP_ISTFT;
+    result->src[0] = spectrogram;
+
+    return result;
+}
+
 // ggml_pool_*
 
 static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, float p) {
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 56dadb9b36..2fa36659b2 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -4744,6 +4744,40 @@ struct test_conv_transpose_1d : public test_case {
     }
 };
 
+// GGML_OP_ISTFT
+struct test_istft : public test_case {
+    const int n_fft;
+    const int hop_length;
+    const int win_length;
+    const int n_frames;
+
+    std::string vars() override {
+        return VARS_TO_STR4(n_fft, hop_length, win_length, n_frames);
+    }
+
+    test_istft(int n_fft = 1280, int hop_length = 320,
+               int win_length = 1280, int n_frames = 87)
+        : n_fft(n_fft), hop_length(hop_length),
+          win_length(win_length), n_frames(n_frames) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        int n_freq = n_fft / 2 + 1;
+        // Input: complex spectrogram [2, n_freq, n_frames]
+        ggml_tensor * spec = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 2, n_freq, n_frames);
+        ggml_set_name(spec, "spectrogram");
+
+        ggml_tensor * out = ggml_istft(ctx, spec, n_fft, hop_length, win_length);
+        ggml_set_name(out, "audio");
+
+        return out;
+    }
+
+    // iSTFT involves trig functions, so use a looser tolerance than the default 1e-7
+    double max_nmse_err() override {
+        return 1e-5;
+    }
+};
+
 // GGML_OP_CONV_TRANSPOSE_2D
 struct test_conv_transpose_2d : public test_case {
     const std::array<int64_t, 4> ne_input;
@@ -7370,6 +7404,12 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,1,2,1}, 1, 0, 1));
     test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1));
 
+    // ISTFT - TTS spectral reconstruction
+    test_cases.emplace_back(new test_istft());                      // default: TTS params
+    test_cases.emplace_back(new test_istft(1280, 320, 1280, 10));   // few frames
+    test_cases.emplace_back(new test_istft( 512, 128,  512, 50));   // smaller FFT
+    test_cases.emplace_back(new test_istft(1024, 256, 1024, 30));   // power-of-2 FFT
+
     test_cases.emplace_back(new test_conv_transpose_2d({3, 2, 3, 1}, {2, 2, 1, 3}, 1));
     test_cases.emplace_back(new test_conv_transpose_2d({10, 10, 9, 1}, {3, 3, 1, 9}, 2));
     test_cases.emplace_back(new test_conv_transpose_2d({129, 63, 35, 1}, {3, 3, 48, 35}, 1));
@@ -8506,6 +8546,9 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
     test_cases.emplace_back(new test_conv_transpose_2d({16, 16, 16, 1}, {3, 3, 8, 16}, 1));
     test_cases.emplace_back(new test_conv_transpose_2d({10, 10, 9, 1}, {3, 3, 1, 9}, 2));
 
+    test_cases.emplace_back(new test_istft(1280, 320, 1280,  87));  // short TTS
+    test_cases.emplace_back(new test_istft(1280, 320, 1280, 712));  // long TTS
+
     test_cases.emplace_back(new test_mean(GGML_TYPE_F32, {256, 256, 3, 1}));
 
diff --git a/tools/tts/tts.cpp b/tools/tts/tts.cpp
index 8c39fce8ba..747fe9cd50 100644
--- a/tools/tts/tts.cpp
+++ b/tools/tts/tts.cpp
@@ -5,6 +5,9 @@
 #include "sampling.h"
 #include "log.h"
 #include "llama.h"
+#include "ggml.h"
+#include "ggml-backend.h"
+#include "ggml-alloc.h"
 
 #define JSON_ASSERT GGML_ASSERT
 #include
@@ -113,168 +116,74 @@ static bool save_wav16(const std::string & fname, const std::vector<float> & dat
     return file.good();
 }
 
-static void fill_hann_window(int length, bool periodic, float * output) {
-    int offset = -1;
-    if (periodic) {
-        offset = 0;
-    }
-    for (int i = 0; i < length; i++) {
-        output[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset)));
-    }
-}
-
-// very poor-man fft
-static void twiddle(float * real, float * imag, int k, int N) {
-    float angle = 2 * M_PI * k / N;
-    *real = cos(angle);
-    *imag = sin(angle);
-}
-
-static void irfft(int n, const float * inp_cplx, float * out_real) {
-    int N = n / 2 + 1;
-
-    std::vector<float> real_input(N);
-    std::vector<float> imag_input(N);
-    for (int i = 0; i < N; ++i) {
-        real_input[i] = inp_cplx[2 * i];
-        imag_input[i] = inp_cplx[2 * i + 1];
-    }
-
-    std::vector<float> real_output(n);
-    std::vector<float> imag_output(n);
-
-    for (int k = 0; k < n; ++k) {
-        real_output[k] = 0.0f;
-        imag_output[k] = 0.0f;
-        for (int m = 0; m < N; ++m) {
-            float twiddle_real;
-            float twiddle_imag;
-
-            twiddle(&twiddle_real, &twiddle_imag, k * m, n);
-
-            real_output[k] += real_input[m] * twiddle_real - imag_input[m] * twiddle_imag;
-            imag_output[k] += real_input[m] * twiddle_imag + imag_input[m] * twiddle_real;
-        }
-    }
-
-    for (int i = 0; i < n; ++i) {
-        out_real[i] = real_output[i] / N;
-    }
-}
-
-//
-//  y = torch.nn.functional.fold(
-//       data, output_size=(1, output_size), kernel_size=(1, self.win_length), stride=(1, self.hop_length),
-//  )[:, 0, 0, pad:-pad]
-//
-// data.shape =  torch.Size([1, 1280, 261])
-// output_size = 84480
-// win_length = 1280
-// hop_length = 320
-// pad = 480
-//
-static void fold(const std::vector<float> & data, int64_t n_out, int64_t n_win, int64_t n_hop, int64_t n_pad, std::vector<float> & output) {
-    int64_t output_height = n_out;
-    int64_t kernel_w      = n_win;
-    int64_t stride_w      = n_hop;
-    int64_t width         = n_out;
-
-    output.resize(width, 0.0f);
-
-    int64_t col_idx = 0;
-    for (int64_t w_col = 0; w_col < width; ++w_col) {
-        int64_t start = w_col * stride_w - n_pad;
-        int64_t end   = start + kernel_w;
-
-        for (int64_t w_im = start; w_im < end; ++w_im) {
-            if (w_im >= 0 && w_im < output_height && col_idx < (int64_t) data.size()) {
-                output[w_im] += data[col_idx];
-            }
-            col_idx++;
-        }
-    }
-
-    output.resize(n_out - 2 * n_pad);
-}
-
-// TODO: not optimized at all
+// Spectral reconstruction via ggml_istft: irfft + Hann window + overlap-add as one ggml op
+// (CPU backend here; the op also has a Vulkan compute shader implementation).
+// Preprocessing (exp/clamp and polar→cartesian conversion) stays in C++ since it is O(n) and fast.
 static std::vector<float> embd_to_audio(
         const float * embd,
         const int n_codes,
         const int n_embd,
         const int n_thread) {
-    const int n_fft = 1280;
-    const int n_hop = 320;
-    const int n_win = 1280;
-    const int n_pad = (n_win - n_hop)/2;
-    const int n_out = (n_codes - 1)*n_hop + n_win;
+    const int     n_fft  = 1280;
+    const int     n_hop  = 320;
+    const int     n_win  = 1280;
+    const int     n_freq = n_fft / 2 + 1; // 641
+    const int     n_pad  = (n_win - n_hop) / 2;
+    const int64_t n_out  = (int64_t)(n_codes - 1) * n_hop + n_win - 2 * n_pad;
 
-    std::vector<float> hann(n_fft);
+    GGML_ASSERT(n_embd == 2 * n_freq); // magnitude + phase
 
-    fill_hann_window(hann.size(), true, hann.data());
+    // --- Preprocessing: embeddings → complex spectrogram [2, n_freq, n_codes] ---
+    // Layout for ggml_istft: ne[0]=2 (re/im interleaved), ne[1]=n_freq, ne[2]=n_frames
+    std::vector<float> spec(2 * n_freq * n_codes);
 
-    int n_spec = n_embd*n_codes;
+    for (int l = 0; l < n_codes; l++) {
+        for (int k = 0; k < n_freq; k++) {
+            float mag = embd[l * n_embd + k];
+            float phi = embd[l * n_embd + k + n_freq];
 
-    std::vector<float> E (n_spec);
-    std::vector<float> S (n_spec);
-    std::vector<float> ST(n_spec);
-
-    for (int l = 0; l < n_codes; ++l) {
-        for (int k = 0; k < n_embd; ++k) {
-            E[k*n_codes + l] = embd[l*n_embd + k];
-        }
-    }
-
-    for (int k = 0; k < n_embd/2; ++k) {
-        for (int l = 0; l < n_codes; ++l) {
-            float mag = E[(k           )*n_codes + l];
-            float phi = E[(k + n_embd/2)*n_codes + l];
-
-            mag = exp(mag);
-
-            if (mag > 1e2) {
-                mag = 1e2;
+            mag = expf(mag);
+            if (mag > 1e2f) {
+                mag = 1e2f;
             }
-            S[2*(k*n_codes + l) + 0] = mag*cosf(phi);
-            S[2*(k*n_codes + l) + 1] = mag*sinf(phi);
+
+            // Store as [2, n_freq, n_codes]: index = (l * n_freq + k) * 2
+            spec[(l * n_freq + k) * 2 + 0] = mag * cosf(phi);
+            spec[(l * n_freq + k) * 2 + 1] = mag * sinf(phi);
         }
     }
 
-    for (int l = 0; l < n_codes; ++l) {
-        for (int k = 0; k < n_embd/2; ++k) {
-            ST[l*n_embd + 2*k + 0] = S[2*(k*n_codes + l) + 0];
-            ST[l*n_embd + 2*k + 1] = S[2*(k*n_codes + l) + 1];
-        }
-    }
+    // --- Build ggml graph with ggml_istft ---
+    struct ggml_init_params ggml_params = {
+        /*.mem_size   =*/ ggml_tensor_overhead() * 3 + ggml_graph_overhead(),
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ true,
+    };
+    struct ggml_context * ctx = ggml_init(ggml_params);
 
-    std::vector<float> res  (n_codes*n_fft);
-    std::vector<float> hann2(n_codes*n_fft);
+    struct ggml_tensor * spectrogram = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 2, n_freq, n_codes);
+    struct ggml_tensor * audio_out   = ggml_istft(ctx, spectrogram, n_fft, n_hop, n_win);
 
-    std::vector<std::thread> workers(n_thread);
-    for (int i = 0; i < n_thread; ++i) {
-        workers[i] = std::thread([&, i]() {
-            for (int l = i; l < n_codes; l += n_thread) {
-                irfft(n_fft, ST.data() + l*n_embd, res.data() + l*n_fft);
-                for (int j = 0; j < n_fft; ++j) {
-                    res  [l*n_fft + j] *= hann[j];
-                    hann2[l*n_fft + j]  = hann[j] * hann[j];
-                }
-            }
-        });
-    }
-    for (int i = 0; i < n_thread; ++i) {
-        workers[i].join();
-    }
+    struct ggml_cgraph * graph = ggml_new_graph(ctx);
+    ggml_build_forward_expand(graph, audio_out);
 
-    std::vector<float> audio;
-    std::vector<float> env;
+    // --- Allocate and compute on CPU backend ---
+    // When run standalone, uses CPU. When integrated into a full ggml pipeline
+    // with Vulkan, ggml_istft dispatches to the Vulkan compute shader.
+    ggml_backend_t backend = ggml_backend_cpu_init();
+    ggml_backend_cpu_set_n_threads(backend, n_thread);
 
-    fold(res,   n_out, n_win, n_hop, n_pad, audio);
-    fold(hann2, n_out, n_win, n_hop, n_pad, env); // TODO: can be done once
+    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);
+    ggml_backend_tensor_set(spectrogram, spec.data(), 0, spec.size() * sizeof(float));
+    ggml_backend_graph_compute(backend, graph);
 
-    for (size_t i = 0; i < audio.size(); ++i) {
-        audio[i] /= env[i];
-    }
+    // Read result
+    std::vector<float> audio(n_out);
+    ggml_backend_tensor_get(audio_out, audio.data(), 0, n_out * sizeof(float));
+
+    // Cleanup
+    ggml_backend_buffer_free(buf);
+    ggml_backend_free(backend);
+    ggml_free(ctx);
 
     return audio;
 }
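The comment in embd_to_audio leaves GPU execution to a larger pipeline. A possible variant that prefers a GPU device through the ggml backend registry when one is present is sketched below; the helper name init_audio_backend is hypothetical, and the registry calls (ggml_backend_dev_by_type, ggml_backend_dev_init) are assumed to be available from ggml-backend.h. The CPU path is unchanged.

    // hypothetical helper: prefer a GPU backend (e.g. Vulkan) if one is registered, else CPU
    static ggml_backend_t init_audio_backend(int n_thread) {
        ggml_backend_dev_t dev     = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
        ggml_backend_t     backend = dev ? ggml_backend_dev_init(dev, NULL) : NULL;
        if (backend == NULL) {
            backend = ggml_backend_cpu_init();
            ggml_backend_cpu_set_n_threads(backend, n_thread);
        }
        return backend;
    }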