ggml: add GGML_OP_ISTFT with CPU and Vulkan backends

Adds inverse Short-Time Fourier Transform as a native ggml op, replacing the hand-rolled iSTFT in tools/tts/tts.cpp. CPU backend uses a direct inverse DFT (no external dependencies), Vulkan backend uses a parallel compute shader. Both use Hermitian symmetry for real-valued output. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-13 08:16:39 -05:00 · 2026-02-13 08:16:39 -05:00 · 27104fc7f3
parent c03a5a46f0
commit 27104fc7f3
10 changed files with 451 additions and 146 deletions
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@ -529,6 +529,7 @@ extern "C" {
        GGML_OP_CONV_3D,
        GGML_OP_CONV_2D_DW,
        GGML_OP_CONV_TRANSPOSE_2D,
+        GGML_OP_ISTFT,
        GGML_OP_POOL_1D,
        GGML_OP_POOL_2D,
        GGML_OP_POOL_2D_BACK,
@ -2110,6 +2111,17 @@ extern "C" {
            int                   n_batch,
            int                   n_channels_out);

+    // inverse short-time Fourier transform
+    // spectrogram: [n_fft, n_frames, 2] complex (interleaved re/im in last dim)
+    // output: [n_samples] real PCM audio
+    // includes irfft + Hann window + overlap-add
+    GGML_API struct ggml_tensor * ggml_istft(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * spectrogram,
+            int                   n_fft,
+            int                   hop_length,
+            int                   win_length);
+
    enum ggml_op_pool {
        GGML_OP_POOL_MAX,
        GGML_OP_POOL_AVG,
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@ -1906,6 +1906,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
            {
                ggml_compute_forward_conv_transpose_2d(params, tensor);
            } break;
+        case GGML_OP_ISTFT:
+            {
+                ggml_compute_forward_istft(params, tensor);
+            } break;
        case GGML_OP_POOL_1D:
            {
                ggml_compute_forward_pool_1d(params, tensor);
@ -2318,6 +2322,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
        case GGML_OP_CONV_2D_DW:
        case GGML_OP_CONV_TRANSPOSE_1D:
        case GGML_OP_CONV_TRANSPOSE_2D:
+        case GGML_OP_ISTFT:
            {
                n_tasks = n_threads;
            } break;
@ -2863,6 +2868,14 @@ struct ggml_cplan ggml_graph_plan(
                        cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03;
                        cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12;
                    } break;
+                case GGML_OP_ISTFT:
+                    {
+                        // Scratch for irfft results + hann² envelope:
+                        // two buffers of n_frames * n_fft floats each
+                        const int32_t n_fft    = node->op_params[0];
+                        const int64_t n_frames = node->src[0]->ne[2];
+                        cur = 2 * sizeof(float) * n_frames * n_fft;
+                    } break;
                case GGML_OP_TOP_K:
                    {
                        cur += sizeof(int32_t)*node->src[0]->ne[0]*n_tasks;
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@ -10,6 +10,7 @@
 #include <algorithm>
 #include <cfloat>
 #include <cmath>
+#include <vector>

 // ggml_compute_forward_dup

@ -7109,6 +7110,145 @@ void ggml_compute_forward_conv_2d_dw(
    }
 }

+// ggml_compute_forward_istft
+//
+// iSTFT = inverse Short-Time Fourier Transform. Converts a complex spectrogram
+// back to real audio. Three steps per frame:
+//   1. irfft (complex→real via direct inverse DFT, O(n²) per frame)
+//   2. Hann window multiplication
+//   3. Overlap-add (fold) — frames overlap by (win_length - hop_length) samples
+//
+// Threading: each thread irfft's its own frames into a shared scratch buffer
+// (n_frames × n_fft). Thread 0 then does the sequential fold + normalize.
+
+// Direct inverse real FFT using Hermitian symmetry.
+// For a real-valued signal of length n_fft with n_freq = n_fft/2+1 complex bins:
+//   out[j] = (1/n_freq) * (X[0].re
+//            + 2 * sum_{k=1}^{n_freq-2} (X[k].re*cos(2πkj/n_fft) - X[k].im*sin(2πkj/n_fft))
+//            + X[n_freq-1].re*cos(2π(n_freq-1)j/n_fft) - X[n_freq-1].im*sin(2π(n_freq-1)j/n_fft))
+// This matches the Vulkan istft.comp shader algorithm.
+static void ggml_irfft_direct(
+        const float * cplx_in,   // interleaved [re,im] × n_freq
+        float       * out,       // real output, length n_fft
+        int           n_fft,
+        int           n_freq) {
+
+    const float inv_n_freq = 1.0f / (float)n_freq;
+    const float two_pi_over_n = 6.28318530718f / (float)n_fft;
+
+    for (int j = 0; j < n_fft; j++) {
+        const float angle_base = two_pi_over_n * (float)j;
+        float val = 0.0f;
+
+        // DC component (k=0): cos(0)=1, sin(0)=0
+        val += cplx_in[0];
+
+        // Middle frequencies (k=1..n_freq-2): doubled due to Hermitian symmetry
+        for (int k = 1; k < n_freq - 1; k++) {
+            float re = cplx_in[k * 2 + 0];
+            float im = cplx_in[k * 2 + 1];
+            float angle = angle_base * (float)k;
+            val += 2.0f * (re * cosf(angle) - im * sinf(angle));
+        }
+
+        // Nyquist component (k=n_freq-1): appears once
+        {
+            int k = n_freq - 1;
+            float re = cplx_in[k * 2 + 0];
+            float im = cplx_in[k * 2 + 1];
+            float angle = angle_base * (float)k;
+            val += re * cosf(angle) - im * sinf(angle);
+        }
+
+        out[j] = val * inv_n_freq;
+    }
+}
+
+void ggml_compute_forward_istft(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src = dst->src[0];
+
+    const int32_t n_fft       = dst->op_params[0];
+    const int32_t hop_length  = dst->op_params[1];
+    const int32_t win_length  = dst->op_params[2];
+    const int32_t n_frames    = (int32_t)src->ne[2];
+    const int32_t n_freq      = n_fft / 2 + 1;
+    const int32_t n_pad       = (win_length - hop_length) / 2;
+    const int64_t n_out_full  = (int64_t)(n_frames - 1) * hop_length + win_length;
+    const int64_t n_out       = n_out_full - 2 * n_pad;
+
+    GGML_ASSERT(dst->ne[0] == n_out);
+    GGML_ASSERT(src->ne[0] == 2);        // interleaved re/im
+    GGML_ASSERT(src->ne[1] == n_freq);   // frequency bins
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    // Scratch space from graph planner (requested in ggml-cpu.c extra_work_size):
+    // Layout: [all_res: n_frames * n_fft floats][all_hann2: n_frames * n_fft floats]
+    const int64_t scratch_per_buf = (int64_t)n_frames * n_fft;
+    float * all_res   = (float *)params->wdata;
+    float * all_hann2 = all_res + scratch_per_buf;
+
+    // Thread 0 zeroes the scratch buffers
+    if (ith == 0) {
+        memset(all_res,   0, scratch_per_buf * sizeof(float));
+        memset(all_hann2, 0, scratch_per_buf * sizeof(float));
+    }
+
+    ggml_barrier(params->threadpool);
+
+    // Hann window (each thread computes — cheap, avoids shared state)
+    std::vector<float> hann(win_length);
+    for (int i = 0; i < win_length; i++) {
+        hann[i] = 0.5f * (1.0f - cosf(2.0f * M_PI * i / win_length));
+    }
+
+    const float * src_data = (const float *)src->data;
+    float       * dst_data = (float *)dst->data;
+
+    // Phase 1: each thread irfft's + windows its assigned frames.
+    // Each thread writes to non-overlapping frame indices, so no data race.
+    std::vector<float> frame_real(n_fft);
+
+    for (int l = ith; l < n_frames; l += nth) {
+        const float * frame_cplx = src_data + (int64_t)l * n_freq * 2;
+
+        ggml_irfft_direct(frame_cplx, frame_real.data(), n_fft, n_freq);
+
+        for (int j = 0; j < n_fft; j++) {
+            all_res  [(int64_t)l * n_fft + j] = frame_real[j] * hann[j];
+            all_hann2[(int64_t)l * n_fft + j] = hann[j] * hann[j];
+        }
+    }
+
+    ggml_barrier(params->threadpool);
+
+    // Phase 2: thread 0 does overlap-add (fold) + normalize — sequential
+    if (ith == 0) {
+        std::vector<float> audio(n_out_full, 0.0f);
+        std::vector<float> env(n_out_full, 0.0f);
+
+        for (int l = 0; l < n_frames; l++) {
+            int64_t offset = (int64_t)l * hop_length;
+            for (int j = 0; j < win_length; j++) {
+                int64_t idx = offset + j;
+                if (idx < n_out_full) {
+                    audio[idx] += all_res  [(int64_t)l * n_fft + j];
+                    env[idx]   += all_hann2[(int64_t)l * n_fft + j];
+                }
+            }
+        }
+
+        // normalize by windowed envelope, trim padding
+        for (int64_t i = 0; i < n_out; i++) {
+            dst_data[i] = (env[i + n_pad] > 0.0f) ? audio[i + n_pad] / env[i + n_pad] : 0.0f;
+        }
+    }
+}
+
 // ggml_compute_forward_pool_1d_ksp
 static void ggml_compute_forward_pool_1d_ksp(
        const ggml_compute_params * params,
--- a/ggml/src/ggml-cpu/ops.h
+++ b/ggml/src/ggml-cpu/ops.h
@ -71,6 +71,7 @@ void ggml_compute_forward_conv_2d(const struct ggml_compute_params * params, str
 void ggml_compute_forward_conv_3d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_2d_dw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_istft(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_pool_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_pool_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@ -804,6 +804,7 @@ struct vk_device_struct {
    vk_pipeline pipeline_im2col_3d_f32, pipeline_im2col_3d_f32_f16;
    vk_pipeline pipeline_timestep_embedding_f32;
    vk_pipeline pipeline_conv_transpose_1d_f32;
+    vk_pipeline pipeline_istft_f32;
    vk_pipeline pipeline_pool2d_f32;
    vk_pipeline pipeline_rwkv_wkv6_f32;
    vk_pipeline pipeline_rwkv_wkv7_f32;
@ -1407,6 +1408,16 @@ struct vk_op_conv_transpose_1d_push_constants {
    int32_t s0;
 };

+struct vk_op_istft_push_constants {
+    uint32_t n_fft;
+    uint32_t n_freq;
+    uint32_t n_frames;
+    uint32_t hop_length;
+    uint32_t win_length;
+    uint32_t n_pad;
+    uint32_t n_out;
+};
+
 struct vk_op_pool2d_push_constants {
    uint32_t IW; uint32_t IH;
    uint32_t OW; uint32_t OH;
@ -4402,6 +4413,8 @@ static void ggml_vk_load_shaders(vk_device& device) {

    ggml_vk_create_pipeline(device, device->pipeline_conv_transpose_1d_f32, "conv_transpose_1d_f32", conv_transpose_1d_f32_len, conv_transpose_1d_f32_data, "main", 3, sizeof(vk_op_conv_transpose_1d_push_constants), {1, 1, 1}, {}, 1);

+    ggml_vk_create_pipeline(device, device->pipeline_istft_f32, "istft_f32", istft_f32_len, istft_f32_data, "main", 2, sizeof(vk_op_istft_push_constants), {256, 1, 1}, {}, 1);
+
    ggml_vk_create_pipeline(device, device->pipeline_pool2d_f32, "pool2d_f32", pool2d_f32_len, pool2d_f32_data, "main", 2, sizeof(vk_op_pool2d_push_constants), {512, 1, 1}, {}, 1);

    ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv6_f32, "rwkv_wkv6_f32", rwkv_wkv6_f32_len, rwkv_wkv6_f32_data, "main", 7, sizeof(vk_op_rwkv_wkv6_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
@ -9230,6 +9243,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
            return ctx->device->pipeline_conv_transpose_1d_f32;
        }
        return nullptr;
+    case GGML_OP_ISTFT:
+        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_istft_f32;
+        }
+        return nullptr;
    case GGML_OP_POOL_2D:
        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
            return ctx->device->pipeline_pool2d_f32;
@ -9612,6 +9630,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
        {
            elements = {uint32_t(src0->ne[1]), 1, 1}; // parallelize in {Cout, 1, 1}
        } break;
+    case GGML_OP_ISTFT:
+        {
+            elements = {uint32_t(dst->ne[0]), 1, 1};  // one thread per output sample
+        } break;
    case GGML_OP_POOL_2D:
        {
            const uint32_t N = dst->ne[3];
@ -11303,6 +11325,33 @@ static void ggml_vk_conv_transpose_1d(ggml_backend_vk_context * ctx, vk_context&
    ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_TRANSPOSE_1D, std::move(p));
 }

+static void ggml_vk_istft(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+    // src0: [2, n_freq, n_frames] — complex spectrogram (interleaved re/im)
+    // dst:  [n_out] — real PCM audio
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const int32_t n_fft       = dst->op_params[0];
+    const int32_t hop_length  = dst->op_params[1];
+    const int32_t win_length  = dst->op_params[2];
+    const int32_t n_freq      = n_fft / 2 + 1;
+    const int32_t n_frames    = (int32_t)src0->ne[2];
+    const int32_t n_pad       = (win_length - hop_length) / 2;
+    const int32_t n_out       = (int32_t)dst->ne[0];
+
+    vk_op_istft_push_constants p{};
+    p.n_fft       = static_cast<uint32_t>(n_fft);
+    p.n_freq      = static_cast<uint32_t>(n_freq);
+    p.n_frames    = static_cast<uint32_t>(n_frames);
+    p.hop_length  = static_cast<uint32_t>(hop_length);
+    p.win_length  = static_cast<uint32_t>(win_length);
+    p.n_pad       = static_cast<uint32_t>(n_pad);
+    p.n_out       = static_cast<uint32_t>(n_out);
+
+    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_ISTFT, std::move(p));
+}
+
 static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
    uint32_t op = static_cast<uint32_t>(dst->op_params[0]);
    const int32_t k1 = dst->op_params[1];
@ -12754,6 +12803,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
    case GGML_OP_CONV_TRANSPOSE_1D:
        ggml_vk_conv_transpose_1d(ctx, compute_ctx, src0, src1, node);

+        break;
+    case GGML_OP_ISTFT:
+        ggml_vk_istft(ctx, compute_ctx, src0, node);
+
        break;
    case GGML_OP_POOL_2D:
        ggml_vk_pool_2d(ctx, compute_ctx, src0, node);
@ -15076,6 +15129,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
            return op->src[0]->type == GGML_TYPE_F32;
        case GGML_OP_CONV_TRANSPOSE_1D:
            return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
+        case GGML_OP_ISTFT:
+            return op->src[0]->type == GGML_TYPE_F32;
        case GGML_OP_CONV_2D:
        case GGML_OP_CONV_TRANSPOSE_2D:
            {
@ -15836,6 +15891,11 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
        } else if (tensor->op == GGML_OP_CONV_TRANSPOSE_2D) {
            const int32_t s = tensor->op_params[0];
            tensor_clone = ggml_conv_transpose_2d_p0(ggml_ctx, src_clone[0], src_clone[1], s);
+        } else if (tensor->op == GGML_OP_ISTFT) {
+            const int32_t n_fft       = tensor->op_params[0];
+            const int32_t hop_length  = tensor->op_params[1];
+            const int32_t win_length  = tensor->op_params[2];
+            tensor_clone = ggml_istft(ggml_ctx, src_clone[0], n_fft, hop_length, win_length);
        } else if (tensor->op == GGML_OP_LEAKY_RELU) {
            const float * op_params = (const float *)tensor->op_params;
            tensor_clone = ggml_leaky_relu(ggml_ctx, src_clone[0], op_params[0], false);
--- a/ggml/src/ggml-vulkan/vulkan-shaders/istft.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/istft.comp
@ -0,0 +1,95 @@
+#version 450
+
+// iSTFT single-pass compute shader — one thread per output sample
+//
+// Each thread computes one output PCM sample by gathering contributions from
+// all overlapping spectrogram frames. For each frame, it evaluates the
+// inverse DFT at the sample's position within that frame.
+//
+// Complexity per output sample: O(overlap_count * n_freq)
+//   where overlap_count = win_length / hop_length (typically 4)
+//   and n_freq = n_fft/2 + 1 (typically 641)
+//
+// This is O(n^2) per frame but fully parallelized across all output samples.
+// For the TTS workload (~80K output samples), the GPU fills easily.
+
+#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
+
+layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
+
+layout(binding = 0) readonly  buffer Src { float data_src[]; };  // complex spectrogram [2, n_freq, n_frames]
+layout(binding = 1) writeonly buffer Dst { float data_dst[]; };  // output audio [n_out]
+
+layout(push_constant) uniform PushConstants {
+    uint32_t n_fft;
+    uint32_t n_freq;       // n_fft/2 + 1
+    uint32_t n_frames;
+    uint32_t hop_length;
+    uint32_t win_length;
+    uint32_t n_pad;
+    uint32_t n_out;
+} p;
+
+void main() {
+    uint i = gl_GlobalInvocationID.x;
+    if (i >= p.n_out) return;
+
+    // Map output index to pre-trim position
+    uint i_full = i + p.n_pad;
+
+    float audio_sum = 0.0;
+    float env_sum   = 0.0;
+
+    // Find overlapping frames: frame l covers samples [l*hop .. l*hop + win - 1]
+    // This sample is at i_full, so l*hop <= i_full < l*hop + win
+    int l_min = max(0, int(i_full) - int(p.win_length) + 1);
+    l_min = (l_min + int(p.hop_length) - 1) / int(p.hop_length);  // ceil division
+    int l_max = min(int(p.n_frames) - 1, int(i_full) / int(p.hop_length));
+
+    float inv_n_freq = 1.0 / float(p.n_freq);
+    float two_pi_over_n = 6.28318530718 / float(p.n_fft);
+
+    for (int l = l_min; l <= l_max; l++) {
+        uint frame_start = uint(l) * p.hop_length;
+        uint j = i_full - frame_start;  // position within this frame's window
+
+        // Hann window at position j
+        float hann = 0.5 * (1.0 - cos(6.28318530718 * float(j) / float(p.win_length)));
+
+        // Compute inverse DFT at position j for this frame's spectrum
+        // irfft: out[j] = (1/N) * sum_{k=0}^{N-1} X[k] * exp(i*2*pi*k*j/n_fft)
+        // For real signal, using Hermitian symmetry:
+        // out[j] = (1/n_freq) * (X[0].re + 2*sum_{k=1}^{n_freq-2} (X[k].re*cos - X[k].im*sin) + X[n_freq-1].re * cos)
+        uint src_offset = uint(l) * p.n_freq * 2;
+
+        float val = 0.0;
+        float angle_base = two_pi_over_n * float(j);
+
+        // DC component (k=0)
+        val += data_src[src_offset + 0];  // X[0].real, cos(0)=1
+
+        // Middle frequencies (k=1 to n_freq-2): count twice due to Hermitian symmetry
+        for (uint k = 1; k < p.n_freq - 1; k++) {
+            float re = data_src[src_offset + k * 2 + 0];
+            float im = data_src[src_offset + k * 2 + 1];
+            float angle = angle_base * float(k);
+            val += 2.0 * (re * cos(angle) - im * sin(angle));
+        }
+
+        // Nyquist component (k=n_freq-1)
+        {
+            uint k = p.n_freq - 1;
+            float re = data_src[src_offset + k * 2 + 0];
+            float im = data_src[src_offset + k * 2 + 1];
+            float angle = angle_base * float(k);
+            val += re * cos(angle) - im * sin(angle);
+        }
+
+        val *= inv_n_freq;
+
+        audio_sum += val * hann;
+        env_sum   += hann * hann;
+    }
+
+    data_dst[i] = (env_sum > 0.0) ? audio_sum / env_sum : 0.0;
+}
--- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
@ -965,6 +965,8 @@ void process_shaders() {

    string_to_spv("conv_transpose_1d_f32", "conv_transpose_1d.comp", {{"A_TYPE", "float"},  {"B_TYPE", "float"}, {"D_TYPE", "float"}});

+    string_to_spv("istft_f32", "istft.comp", {});
+
    string_to_spv("pool2d_f32", "pool2d.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));

    string_to_spv("rwkv_wkv6_f32", "wkv6.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@ -1003,6 +1003,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
    "CONV_3D",
    "CONV_2D_DW",
    "CONV_TRANSPOSE_2D",
+    "ISTFT",
    "POOL_1D",
    "POOL_2D",
    "POOL_2D_BACK",
@ -1047,7 +1048,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
    "GLU",
 };

-static_assert(GGML_OP_COUNT == 95, "GGML_OP_COUNT != 95");
+static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96");

 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "none",
@ -1112,6 +1113,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "conv_3d(x)",
    "conv_2d_dw(x)",
    "conv_transpose_2d(x)",
+    "istft(x)",
    "pool_1d(x)",
    "pool_2d(x)",
    "pool_2d_back(x)",
@ -1156,7 +1158,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "glu(x)",
 };

-static_assert(GGML_OP_COUNT == 95, "GGML_OP_COUNT != 95");
+static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96");

 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");

@ -4818,6 +4820,34 @@ struct ggml_tensor * ggml_conv_transpose_2d_p0(
    return result;
 }

+// ggml_istft
+
+struct ggml_tensor * ggml_istft(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * spectrogram,
+        int                   n_fft,
+        int                   hop_length,
+        int                   win_length) {
+    // spectrogram: [2, n_fft/2+1, n_frames] — complex interleaved (re/im), freq bins, frames
+    GGML_ASSERT(spectrogram->ne[0] == 2);
+    GGML_ASSERT(spectrogram->ne[1] == n_fft / 2 + 1);
+
+    const int64_t n_frames = spectrogram->ne[2];
+    const int64_t n_pad = (win_length - hop_length) / 2;
+    const int64_t n_out = (n_frames - 1) * hop_length + win_length - 2 * n_pad;
+
+    const int64_t ne[4] = { n_out, 1, 1, 1 };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 1, ne);
+
+    int32_t params[] = { n_fft, hop_length, win_length };
+    ggml_set_op_params(result, params, sizeof(params));
+
+    result->op     = GGML_OP_ISTFT;
+    result->src[0] = spectrogram;
+
+    return result;
+}
+
 // ggml_pool_*

 static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, float p) {
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@ -4744,6 +4744,40 @@ struct test_conv_transpose_1d : public test_case {
    }
 };

+// GGML_OP_ISTFT
+struct test_istft : public test_case {
+    const int n_fft;
+    const int hop_length;
+    const int win_length;
+    const int n_frames;
+
+    std::string vars() override {
+        return VARS_TO_STR4(n_fft, hop_length, win_length, n_frames);
+    }
+
+    test_istft(int n_fft = 1280, int hop_length = 320,
+               int win_length = 1280, int n_frames = 87)
+        : n_fft(n_fft), hop_length(hop_length),
+          win_length(win_length), n_frames(n_frames) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        int n_freq = n_fft / 2 + 1;
+        // Input: complex spectrogram [2, n_freq, n_frames]
+        ggml_tensor * spec = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 2, n_freq, n_frames);
+        ggml_set_name(spec, "spectrogram");
+
+        ggml_tensor * out = ggml_istft(ctx, spec, n_fft, hop_length, win_length);
+        ggml_set_name(out, "audio");
+
+        return out;
+    }
+
+    // iSTFT involves trig functions — need looser tolerance than default 1e-7
+    double max_nmse_err() override {
+        return 1e-5;
+    }
+};
+
 // GGML_OP_CONV_TRANSPOSE_2D
 struct test_conv_transpose_2d : public test_case {
    const std::array<int64_t, 4> ne_input;
@ -7370,6 +7404,12 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
    test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,1,2,1}, 1, 0, 1));
    test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1));

+    // ISTFT — TTS spectral reconstruction
+    test_cases.emplace_back(new test_istft());                              // default: TTS params
+    test_cases.emplace_back(new test_istft(1280, 320, 1280, 10));           // few frames
+    test_cases.emplace_back(new test_istft(512, 128, 512, 50));             // smaller FFT
+    test_cases.emplace_back(new test_istft(1024, 256, 1024, 30));           // power-of-2 FFT
+
    test_cases.emplace_back(new test_conv_transpose_2d({3, 2, 3, 1}, {2, 2, 1, 3}, 1));
    test_cases.emplace_back(new test_conv_transpose_2d({10, 10, 9, 1}, {3, 3, 1, 9}, 2));
    test_cases.emplace_back(new test_conv_transpose_2d({129, 63, 35, 1}, {3, 3, 48, 35}, 1));
@ -8506,6 +8546,9 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
    test_cases.emplace_back(new test_conv_transpose_2d({16, 16, 16, 1}, {3, 3, 8, 16}, 1));
    test_cases.emplace_back(new test_conv_transpose_2d({10, 10, 9, 1}, {3, 3, 1, 9}, 2));

+    test_cases.emplace_back(new test_istft(1280, 320, 1280, 87));   // short TTS
+    test_cases.emplace_back(new test_istft(1280, 320, 1280, 712));  // long TTS
+
    test_cases.emplace_back(new test_mean(GGML_TYPE_F32, {256, 256, 3, 1}));


--- a/tools/tts/tts.cpp
+++ b/tools/tts/tts.cpp
@ -5,6 +5,9 @@
 #include "sampling.h"
 #include "log.h"
 #include "llama.h"
+#include "ggml.h"
+#include "ggml-backend.h"
+#include "ggml-alloc.h"

 #define JSON_ASSERT GGML_ASSERT
 #include <nlohmann/json.hpp>
@ -113,168 +116,74 @@ static bool save_wav16(const std::string & fname, const std::vector<float> & dat
    return file.good();
 }

-static void fill_hann_window(int length, bool periodic, float * output) {
-    int offset = -1;
-    if (periodic) {
-        offset = 0;
-    }
-    for (int i = 0; i < length; i++) {
-        output[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset)));
-    }
-}
-
-// very poor-man fft
-static void twiddle(float * real, float * imag, int k, int N) {
-    float angle = 2 * M_PI * k / N;
-    *real = cos(angle);
-    *imag = sin(angle);
-}
-
-static void irfft(int n, const float * inp_cplx, float * out_real) {
-    int N = n / 2 + 1;
-
-    std::vector<float> real_input(N);
-    std::vector<float> imag_input(N);
-    for (int i = 0; i < N; ++i) {
-        real_input[i] = inp_cplx[2 * i];
-        imag_input[i] = inp_cplx[2 * i + 1];
-    }
-
-    std::vector<float> real_output(n);
-    std::vector<float> imag_output(n);
-
-    for (int k = 0; k < n; ++k) {
-        real_output[k] = 0.0f;
-        imag_output[k] = 0.0f;
-        for (int m = 0; m < N; ++m) {
-            float twiddle_real;
-            float twiddle_imag;
-
-            twiddle(&twiddle_real, &twiddle_imag, k * m, n);
-
-            real_output[k] += real_input[m] * twiddle_real - imag_input[m] * twiddle_imag;
-            imag_output[k] += real_input[m] * twiddle_imag + imag_input[m] * twiddle_real;
-        }
-    }
-
-    for (int i = 0; i < n; ++i) {
-        out_real[i] = real_output[i] / N;
-    }
-}
-
-//
-//  y = torch.nn.functional.fold(
-//       data, output_size=(1, output_size), kernel_size=(1, self.win_length), stride=(1, self.hop_length),
-//  )[:, 0, 0, pad:-pad]
-//
-// data.shape =  torch.Size([1, 1280, 261])
-// output_size =  84480
-// win_length =  1280
-// hop_length =  320
-// pad =  480
-//
-static void fold(const std::vector<float> & data, int64_t n_out, int64_t n_win, int64_t n_hop, int64_t n_pad, std::vector<float> & output) {
-    int64_t output_height = n_out;
-    int64_t kernel_w = n_win;
-    int64_t stride_w = n_hop;
-    int64_t width    = n_out;
-
-    output.resize(width, 0.0f);
-
-    int64_t col_idx = 0;
-    for (int64_t w_col = 0; w_col < width; ++w_col) {
-        int64_t start = w_col * stride_w - n_pad;
-        int64_t end   = start + kernel_w;
-
-        for (int64_t w_im = start; w_im < end; ++w_im) {
-            if (w_im >= 0 && w_im < output_height && col_idx < (int64_t) data.size()) {
-                output[w_im] += data[col_idx];
-            }
-            col_idx++;
-        }
-    }
-
-    output.resize(n_out - 2 * n_pad);
-}
-
-// TODO: not optimized at all
+// Spectral ops via ggml_istft — dispatches to Vulkan GPU when available, CPU fallback.
+// Preprocessing (transpose, exp, polar→cartesian) stays in C++ since it's O(n) and fast.
 static std::vector<float> embd_to_audio(
        const float * embd,
        const int n_codes,
        const int n_embd,
        const int n_thread) {
-    const int n_fft = 1280;
-    const int n_hop = 320;
-    const int n_win = 1280;
-    const int n_pad = (n_win - n_hop)/2;
-    const int n_out = (n_codes - 1)*n_hop + n_win;
+    const int n_fft  = 1280;
+    const int n_hop  = 320;
+    const int n_win  = 1280;
+    const int n_freq = n_fft / 2 + 1;  // 641
+    const int n_pad  = (n_win - n_hop) / 2;
+    const int64_t n_out = (int64_t)(n_codes - 1) * n_hop + n_win - 2 * n_pad;

-    std::vector<float> hann(n_fft);
+    GGML_ASSERT(n_embd == 2 * n_freq);  // magnitude + phase

-    fill_hann_window(hann.size(), true, hann.data());
+    // --- Preprocessing: embeddings → complex spectrogram [2, n_freq, n_codes] ---
+    // Layout for ggml_istft: ne[0]=2 (re/im interleaved), ne[1]=n_freq, ne[2]=n_frames
+    std::vector<float> spec(2 * n_freq * n_codes);

-    int n_spec = n_embd*n_codes;
+    for (int l = 0; l < n_codes; l++) {
+        for (int k = 0; k < n_freq; k++) {
+            float mag = embd[l * n_embd + k];
+            float phi = embd[l * n_embd + k + n_freq];

-    std::vector<float> E (n_spec);
-    std::vector<float> S (n_spec);
-    std::vector<float> ST(n_spec);
-
-    for (int l = 0; l < n_codes; ++l) {
-        for (int k = 0; k < n_embd; ++k) {
-            E[k*n_codes + l] = embd[l*n_embd + k];
-        }
-    }
-
-    for (int k = 0; k < n_embd/2; ++k) {
-        for (int l = 0; l < n_codes; ++l) {
-            float mag = E[(k           )*n_codes + l];
-            float phi = E[(k + n_embd/2)*n_codes + l];
-
-            mag = exp(mag);
-
-            if (mag > 1e2) {
-                mag = 1e2;
+            mag = expf(mag);
+            if (mag > 1e2f) {
+                mag = 1e2f;
            }
-            S[2*(k*n_codes + l) + 0] = mag*cosf(phi);
-            S[2*(k*n_codes + l) + 1] = mag*sinf(phi);
+
+            // Store as [2, n_freq, n_codes]: index = (l * n_freq + k) * 2
+            spec[(l * n_freq + k) * 2 + 0] = mag * cosf(phi);
+            spec[(l * n_freq + k) * 2 + 1] = mag * sinf(phi);
        }
    }

-    for (int l = 0; l < n_codes; ++l) {
-        for (int k = 0; k < n_embd/2; ++k) {
-            ST[l*n_embd + 2*k + 0] = S[2*(k*n_codes + l) + 0];
-            ST[l*n_embd + 2*k + 1] = S[2*(k*n_codes + l) + 1];
-        }
-    }
+    // --- Build ggml graph with ggml_istft ---
+    struct ggml_init_params ggml_params = {
+        /*.mem_size   =*/ ggml_tensor_overhead() * 3 + ggml_graph_overhead(),
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ true,
+    };
+    struct ggml_context * ctx = ggml_init(ggml_params);

-    std::vector<float> res  (n_codes*n_fft);
-    std::vector<float> hann2(n_codes*n_fft);
+    struct ggml_tensor * spectrogram = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 2, n_freq, n_codes);
+    struct ggml_tensor * audio_out   = ggml_istft(ctx, spectrogram, n_fft, n_hop, n_win);

-    std::vector<std::thread> workers(n_thread);
-    for (int i = 0; i < n_thread; ++i) {
-        workers[i] = std::thread([&, i]() {
-            for (int l = i; l < n_codes; l += n_thread) {
-                irfft(n_fft, ST.data() + l*n_embd, res.data() + l*n_fft);
-                for (int j = 0; j < n_fft; ++j) {
-                    res  [l*n_fft + j] *= hann[j];
-                    hann2[l*n_fft + j]  = hann[j] * hann[j];
-                }
-            }
-        });
-    }
-    for (int i = 0; i < n_thread; ++i) {
-        workers[i].join();
-    }
+    struct ggml_cgraph * graph = ggml_new_graph(ctx);
+    ggml_build_forward_expand(graph, audio_out);

-    std::vector<float> audio;
-    std::vector<float> env;
+    // --- Allocate and compute on CPU backend ---
+    // When run standalone, uses CPU. When integrated into a full ggml pipeline
+    // with Vulkan, ggml_istft dispatches to the Vulkan compute shader.
+    ggml_backend_t backend = ggml_backend_cpu_init();
+    ggml_backend_cpu_set_n_threads(backend, n_thread);

-    fold(res,   n_out, n_win, n_hop, n_pad, audio);
-    fold(hann2, n_out, n_win, n_hop, n_pad, env); // TODO: can be done once
+    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);
+    ggml_backend_tensor_set(spectrogram, spec.data(), 0, spec.size() * sizeof(float));
+    ggml_backend_graph_compute(backend, graph);

-    for (size_t i = 0; i < audio.size(); ++i) {
-        audio[i] /= env[i];
-    }
+    // Read result
+    std::vector<float> audio(n_out);
+    ggml_backend_tensor_get(audio_out, audio.data(), 0, n_out * sizeof(float));
+
+    // Cleanup
+    ggml_backend_buffer_free(buf);
+    ggml_backend_free(backend);
+    ggml_free(ctx);

    return audio;
 }