ggml: add GGML_OP_ISTFT with CPU and Vulkan backends
Adds inverse Short-Time Fourier Transform as a native ggml op, replacing the hand-rolled iSTFT in tools/tts/tts.cpp. CPU backend uses a direct inverse DFT (no external dependencies), Vulkan backend uses a parallel compute shader. Both use Hermitian symmetry for real-valued output. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
c03a5a46f0
commit
27104fc7f3
|
|
@ -529,6 +529,7 @@ extern "C" {
|
|||
GGML_OP_CONV_3D,
|
||||
GGML_OP_CONV_2D_DW,
|
||||
GGML_OP_CONV_TRANSPOSE_2D,
|
||||
GGML_OP_ISTFT,
|
||||
GGML_OP_POOL_1D,
|
||||
GGML_OP_POOL_2D,
|
||||
GGML_OP_POOL_2D_BACK,
|
||||
|
|
@ -2110,6 +2111,17 @@ extern "C" {
|
|||
int n_batch,
|
||||
int n_channels_out);
|
||||
|
||||
// inverse short-time Fourier transform
|
||||
// spectrogram: [n_fft, n_frames, 2] complex (interleaved re/im in last dim)
|
||||
// output: [n_samples] real PCM audio
|
||||
// includes irfft + Hann window + overlap-add
|
||||
GGML_API struct ggml_tensor * ggml_istft(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * spectrogram,
|
||||
int n_fft,
|
||||
int hop_length,
|
||||
int win_length);
|
||||
|
||||
enum ggml_op_pool {
|
||||
GGML_OP_POOL_MAX,
|
||||
GGML_OP_POOL_AVG,
|
||||
|
|
|
|||
|
|
@ -1906,6 +1906,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|||
{
|
||||
ggml_compute_forward_conv_transpose_2d(params, tensor);
|
||||
} break;
|
||||
case GGML_OP_ISTFT:
|
||||
{
|
||||
ggml_compute_forward_istft(params, tensor);
|
||||
} break;
|
||||
case GGML_OP_POOL_1D:
|
||||
{
|
||||
ggml_compute_forward_pool_1d(params, tensor);
|
||||
|
|
@ -2318,6 +2322,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
|||
case GGML_OP_CONV_2D_DW:
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
case GGML_OP_CONV_TRANSPOSE_2D:
|
||||
case GGML_OP_ISTFT:
|
||||
{
|
||||
n_tasks = n_threads;
|
||||
} break;
|
||||
|
|
@ -2863,6 +2868,14 @@ struct ggml_cplan ggml_graph_plan(
|
|||
cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03;
|
||||
cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12;
|
||||
} break;
|
||||
case GGML_OP_ISTFT:
|
||||
{
|
||||
// Scratch for irfft results + hann² envelope:
|
||||
// two buffers of n_frames * n_fft floats each
|
||||
const int32_t n_fft = node->op_params[0];
|
||||
const int64_t n_frames = node->src[0]->ne[2];
|
||||
cur = 2 * sizeof(float) * n_frames * n_fft;
|
||||
} break;
|
||||
case GGML_OP_TOP_K:
|
||||
{
|
||||
cur += sizeof(int32_t)*node->src[0]->ne[0]*n_tasks;
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@
|
|||
#include <algorithm>
|
||||
#include <cfloat>
|
||||
#include <cmath>
|
||||
#include <vector>
|
||||
|
||||
// ggml_compute_forward_dup
|
||||
|
||||
|
|
@ -7109,6 +7110,145 @@ void ggml_compute_forward_conv_2d_dw(
|
|||
}
|
||||
}
|
||||
|
||||
// ggml_compute_forward_istft
|
||||
//
|
||||
// iSTFT = inverse Short-Time Fourier Transform. Converts a complex spectrogram
|
||||
// back to real audio. Three steps per frame:
|
||||
// 1. irfft (complex→real via direct inverse DFT, O(n²) per frame)
|
||||
// 2. Hann window multiplication
|
||||
// 3. Overlap-add (fold) — frames overlap by (win_length - hop_length) samples
|
||||
//
|
||||
// Threading: each thread irfft's its own frames into a shared scratch buffer
|
||||
// (n_frames × n_fft). Thread 0 then does the sequential fold + normalize.
|
||||
|
||||
// Direct inverse real FFT using Hermitian symmetry.
|
||||
// For a real-valued signal of length n_fft with n_freq = n_fft/2+1 complex bins:
|
||||
// out[j] = (1/n_freq) * (X[0].re
|
||||
// + 2 * sum_{k=1}^{n_freq-2} (X[k].re*cos(2πkj/n_fft) - X[k].im*sin(2πkj/n_fft))
|
||||
// + X[n_freq-1].re*cos(2π(n_freq-1)j/n_fft) - X[n_freq-1].im*sin(2π(n_freq-1)j/n_fft))
|
||||
// This matches the Vulkan istft.comp shader algorithm.
|
||||
static void ggml_irfft_direct(
|
||||
const float * cplx_in, // interleaved [re,im] × n_freq
|
||||
float * out, // real output, length n_fft
|
||||
int n_fft,
|
||||
int n_freq) {
|
||||
|
||||
const float inv_n_freq = 1.0f / (float)n_freq;
|
||||
const float two_pi_over_n = 6.28318530718f / (float)n_fft;
|
||||
|
||||
for (int j = 0; j < n_fft; j++) {
|
||||
const float angle_base = two_pi_over_n * (float)j;
|
||||
float val = 0.0f;
|
||||
|
||||
// DC component (k=0): cos(0)=1, sin(0)=0
|
||||
val += cplx_in[0];
|
||||
|
||||
// Middle frequencies (k=1..n_freq-2): doubled due to Hermitian symmetry
|
||||
for (int k = 1; k < n_freq - 1; k++) {
|
||||
float re = cplx_in[k * 2 + 0];
|
||||
float im = cplx_in[k * 2 + 1];
|
||||
float angle = angle_base * (float)k;
|
||||
val += 2.0f * (re * cosf(angle) - im * sinf(angle));
|
||||
}
|
||||
|
||||
// Nyquist component (k=n_freq-1): appears once
|
||||
{
|
||||
int k = n_freq - 1;
|
||||
float re = cplx_in[k * 2 + 0];
|
||||
float im = cplx_in[k * 2 + 1];
|
||||
float angle = angle_base * (float)k;
|
||||
val += re * cosf(angle) - im * sinf(angle);
|
||||
}
|
||||
|
||||
out[j] = val * inv_n_freq;
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_compute_forward_istft(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst) {
|
||||
|
||||
const ggml_tensor * src = dst->src[0];
|
||||
|
||||
const int32_t n_fft = dst->op_params[0];
|
||||
const int32_t hop_length = dst->op_params[1];
|
||||
const int32_t win_length = dst->op_params[2];
|
||||
const int32_t n_frames = (int32_t)src->ne[2];
|
||||
const int32_t n_freq = n_fft / 2 + 1;
|
||||
const int32_t n_pad = (win_length - hop_length) / 2;
|
||||
const int64_t n_out_full = (int64_t)(n_frames - 1) * hop_length + win_length;
|
||||
const int64_t n_out = n_out_full - 2 * n_pad;
|
||||
|
||||
GGML_ASSERT(dst->ne[0] == n_out);
|
||||
GGML_ASSERT(src->ne[0] == 2); // interleaved re/im
|
||||
GGML_ASSERT(src->ne[1] == n_freq); // frequency bins
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
// Scratch space from graph planner (requested in ggml-cpu.c extra_work_size):
|
||||
// Layout: [all_res: n_frames * n_fft floats][all_hann2: n_frames * n_fft floats]
|
||||
const int64_t scratch_per_buf = (int64_t)n_frames * n_fft;
|
||||
float * all_res = (float *)params->wdata;
|
||||
float * all_hann2 = all_res + scratch_per_buf;
|
||||
|
||||
// Thread 0 zeroes the scratch buffers
|
||||
if (ith == 0) {
|
||||
memset(all_res, 0, scratch_per_buf * sizeof(float));
|
||||
memset(all_hann2, 0, scratch_per_buf * sizeof(float));
|
||||
}
|
||||
|
||||
ggml_barrier(params->threadpool);
|
||||
|
||||
// Hann window (each thread computes — cheap, avoids shared state)
|
||||
std::vector<float> hann(win_length);
|
||||
for (int i = 0; i < win_length; i++) {
|
||||
hann[i] = 0.5f * (1.0f - cosf(2.0f * M_PI * i / win_length));
|
||||
}
|
||||
|
||||
const float * src_data = (const float *)src->data;
|
||||
float * dst_data = (float *)dst->data;
|
||||
|
||||
// Phase 1: each thread irfft's + windows its assigned frames.
|
||||
// Each thread writes to non-overlapping frame indices, so no data race.
|
||||
std::vector<float> frame_real(n_fft);
|
||||
|
||||
for (int l = ith; l < n_frames; l += nth) {
|
||||
const float * frame_cplx = src_data + (int64_t)l * n_freq * 2;
|
||||
|
||||
ggml_irfft_direct(frame_cplx, frame_real.data(), n_fft, n_freq);
|
||||
|
||||
for (int j = 0; j < n_fft; j++) {
|
||||
all_res [(int64_t)l * n_fft + j] = frame_real[j] * hann[j];
|
||||
all_hann2[(int64_t)l * n_fft + j] = hann[j] * hann[j];
|
||||
}
|
||||
}
|
||||
|
||||
ggml_barrier(params->threadpool);
|
||||
|
||||
// Phase 2: thread 0 does overlap-add (fold) + normalize — sequential
|
||||
if (ith == 0) {
|
||||
std::vector<float> audio(n_out_full, 0.0f);
|
||||
std::vector<float> env(n_out_full, 0.0f);
|
||||
|
||||
for (int l = 0; l < n_frames; l++) {
|
||||
int64_t offset = (int64_t)l * hop_length;
|
||||
for (int j = 0; j < win_length; j++) {
|
||||
int64_t idx = offset + j;
|
||||
if (idx < n_out_full) {
|
||||
audio[idx] += all_res [(int64_t)l * n_fft + j];
|
||||
env[idx] += all_hann2[(int64_t)l * n_fft + j];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// normalize by windowed envelope, trim padding
|
||||
for (int64_t i = 0; i < n_out; i++) {
|
||||
dst_data[i] = (env[i + n_pad] > 0.0f) ? audio[i + n_pad] / env[i + n_pad] : 0.0f;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ggml_compute_forward_pool_1d_ksp
|
||||
static void ggml_compute_forward_pool_1d_ksp(
|
||||
const ggml_compute_params * params,
|
||||
|
|
|
|||
|
|
@ -71,6 +71,7 @@ void ggml_compute_forward_conv_2d(const struct ggml_compute_params * params, str
|
|||
void ggml_compute_forward_conv_3d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_conv_2d_dw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_istft(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_pool_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_pool_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
|
|
|
|||
|
|
@ -804,6 +804,7 @@ struct vk_device_struct {
|
|||
vk_pipeline pipeline_im2col_3d_f32, pipeline_im2col_3d_f32_f16;
|
||||
vk_pipeline pipeline_timestep_embedding_f32;
|
||||
vk_pipeline pipeline_conv_transpose_1d_f32;
|
||||
vk_pipeline pipeline_istft_f32;
|
||||
vk_pipeline pipeline_pool2d_f32;
|
||||
vk_pipeline pipeline_rwkv_wkv6_f32;
|
||||
vk_pipeline pipeline_rwkv_wkv7_f32;
|
||||
|
|
@ -1407,6 +1408,16 @@ struct vk_op_conv_transpose_1d_push_constants {
|
|||
int32_t s0;
|
||||
};
|
||||
|
||||
struct vk_op_istft_push_constants {
|
||||
uint32_t n_fft;
|
||||
uint32_t n_freq;
|
||||
uint32_t n_frames;
|
||||
uint32_t hop_length;
|
||||
uint32_t win_length;
|
||||
uint32_t n_pad;
|
||||
uint32_t n_out;
|
||||
};
|
||||
|
||||
struct vk_op_pool2d_push_constants {
|
||||
uint32_t IW; uint32_t IH;
|
||||
uint32_t OW; uint32_t OH;
|
||||
|
|
@ -4402,6 +4413,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_conv_transpose_1d_f32, "conv_transpose_1d_f32", conv_transpose_1d_f32_len, conv_transpose_1d_f32_data, "main", 3, sizeof(vk_op_conv_transpose_1d_push_constants), {1, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_istft_f32, "istft_f32", istft_f32_len, istft_f32_data, "main", 2, sizeof(vk_op_istft_push_constants), {256, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_pool2d_f32, "pool2d_f32", pool2d_f32_len, pool2d_f32_data, "main", 2, sizeof(vk_op_pool2d_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv6_f32, "rwkv_wkv6_f32", rwkv_wkv6_f32_len, rwkv_wkv6_f32_data, "main", 7, sizeof(vk_op_rwkv_wkv6_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
|
||||
|
|
@ -9230,6 +9243,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
|||
return ctx->device->pipeline_conv_transpose_1d_f32;
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_ISTFT:
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_istft_f32;
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_POOL_2D:
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_pool2d_f32;
|
||||
|
|
@ -9612,6 +9630,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
|||
{
|
||||
elements = {uint32_t(src0->ne[1]), 1, 1}; // parallelize in {Cout, 1, 1}
|
||||
} break;
|
||||
case GGML_OP_ISTFT:
|
||||
{
|
||||
elements = {uint32_t(dst->ne[0]), 1, 1}; // one thread per output sample
|
||||
} break;
|
||||
case GGML_OP_POOL_2D:
|
||||
{
|
||||
const uint32_t N = dst->ne[3];
|
||||
|
|
@ -11303,6 +11325,33 @@ static void ggml_vk_conv_transpose_1d(ggml_backend_vk_context * ctx, vk_context&
|
|||
ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_TRANSPOSE_1D, std::move(p));
|
||||
}
|
||||
|
||||
static void ggml_vk_istft(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||
// src0: [2, n_freq, n_frames] — complex spectrogram (interleaved re/im)
|
||||
// dst: [n_out] — real PCM audio
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
const int32_t n_fft = dst->op_params[0];
|
||||
const int32_t hop_length = dst->op_params[1];
|
||||
const int32_t win_length = dst->op_params[2];
|
||||
const int32_t n_freq = n_fft / 2 + 1;
|
||||
const int32_t n_frames = (int32_t)src0->ne[2];
|
||||
const int32_t n_pad = (win_length - hop_length) / 2;
|
||||
const int32_t n_out = (int32_t)dst->ne[0];
|
||||
|
||||
vk_op_istft_push_constants p{};
|
||||
p.n_fft = static_cast<uint32_t>(n_fft);
|
||||
p.n_freq = static_cast<uint32_t>(n_freq);
|
||||
p.n_frames = static_cast<uint32_t>(n_frames);
|
||||
p.hop_length = static_cast<uint32_t>(hop_length);
|
||||
p.win_length = static_cast<uint32_t>(win_length);
|
||||
p.n_pad = static_cast<uint32_t>(n_pad);
|
||||
p.n_out = static_cast<uint32_t>(n_out);
|
||||
|
||||
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_ISTFT, std::move(p));
|
||||
}
|
||||
|
||||
static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||
uint32_t op = static_cast<uint32_t>(dst->op_params[0]);
|
||||
const int32_t k1 = dst->op_params[1];
|
||||
|
|
@ -12754,6 +12803,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
|
|||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
ggml_vk_conv_transpose_1d(ctx, compute_ctx, src0, src1, node);
|
||||
|
||||
break;
|
||||
case GGML_OP_ISTFT:
|
||||
ggml_vk_istft(ctx, compute_ctx, src0, node);
|
||||
|
||||
break;
|
||||
case GGML_OP_POOL_2D:
|
||||
ggml_vk_pool_2d(ctx, compute_ctx, src0, node);
|
||||
|
|
@ -15076,6 +15129,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
|||
return op->src[0]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_ISTFT:
|
||||
return op->src[0]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_CONV_2D:
|
||||
case GGML_OP_CONV_TRANSPOSE_2D:
|
||||
{
|
||||
|
|
@ -15836,6 +15891,11 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
|
|||
} else if (tensor->op == GGML_OP_CONV_TRANSPOSE_2D) {
|
||||
const int32_t s = tensor->op_params[0];
|
||||
tensor_clone = ggml_conv_transpose_2d_p0(ggml_ctx, src_clone[0], src_clone[1], s);
|
||||
} else if (tensor->op == GGML_OP_ISTFT) {
|
||||
const int32_t n_fft = tensor->op_params[0];
|
||||
const int32_t hop_length = tensor->op_params[1];
|
||||
const int32_t win_length = tensor->op_params[2];
|
||||
tensor_clone = ggml_istft(ggml_ctx, src_clone[0], n_fft, hop_length, win_length);
|
||||
} else if (tensor->op == GGML_OP_LEAKY_RELU) {
|
||||
const float * op_params = (const float *)tensor->op_params;
|
||||
tensor_clone = ggml_leaky_relu(ggml_ctx, src_clone[0], op_params[0], false);
|
||||
|
|
|
|||
|
|
@ -0,0 +1,95 @@
|
|||
#version 450
|
||||
|
||||
// iSTFT single-pass compute shader — one thread per output sample
|
||||
//
|
||||
// Each thread computes one output PCM sample by gathering contributions from
|
||||
// all overlapping spectrogram frames. For each frame, it evaluates the
|
||||
// inverse DFT at the sample's position within that frame.
|
||||
//
|
||||
// Complexity per output sample: O(overlap_count * n_freq)
|
||||
// where overlap_count = win_length / hop_length (typically 4)
|
||||
// and n_freq = n_fft/2 + 1 (typically 641)
|
||||
//
|
||||
// This is O(n^2) per frame but fully parallelized across all output samples.
|
||||
// For the TTS workload (~80K output samples), the GPU fills easily.
|
||||
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
|
||||
|
||||
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout(binding = 0) readonly buffer Src { float data_src[]; }; // complex spectrogram [2, n_freq, n_frames]
|
||||
layout(binding = 1) writeonly buffer Dst { float data_dst[]; }; // output audio [n_out]
|
||||
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint32_t n_fft;
|
||||
uint32_t n_freq; // n_fft/2 + 1
|
||||
uint32_t n_frames;
|
||||
uint32_t hop_length;
|
||||
uint32_t win_length;
|
||||
uint32_t n_pad;
|
||||
uint32_t n_out;
|
||||
} p;
|
||||
|
||||
void main() {
|
||||
uint i = gl_GlobalInvocationID.x;
|
||||
if (i >= p.n_out) return;
|
||||
|
||||
// Map output index to pre-trim position
|
||||
uint i_full = i + p.n_pad;
|
||||
|
||||
float audio_sum = 0.0;
|
||||
float env_sum = 0.0;
|
||||
|
||||
// Find overlapping frames: frame l covers samples [l*hop .. l*hop + win - 1]
|
||||
// This sample is at i_full, so l*hop <= i_full < l*hop + win
|
||||
int l_min = max(0, int(i_full) - int(p.win_length) + 1);
|
||||
l_min = (l_min + int(p.hop_length) - 1) / int(p.hop_length); // ceil division
|
||||
int l_max = min(int(p.n_frames) - 1, int(i_full) / int(p.hop_length));
|
||||
|
||||
float inv_n_freq = 1.0 / float(p.n_freq);
|
||||
float two_pi_over_n = 6.28318530718 / float(p.n_fft);
|
||||
|
||||
for (int l = l_min; l <= l_max; l++) {
|
||||
uint frame_start = uint(l) * p.hop_length;
|
||||
uint j = i_full - frame_start; // position within this frame's window
|
||||
|
||||
// Hann window at position j
|
||||
float hann = 0.5 * (1.0 - cos(6.28318530718 * float(j) / float(p.win_length)));
|
||||
|
||||
// Compute inverse DFT at position j for this frame's spectrum
|
||||
// irfft: out[j] = (1/N) * sum_{k=0}^{N-1} X[k] * exp(i*2*pi*k*j/n_fft)
|
||||
// For real signal, using Hermitian symmetry:
|
||||
// out[j] = (1/n_freq) * (X[0].re + 2*sum_{k=1}^{n_freq-2} (X[k].re*cos - X[k].im*sin) + X[n_freq-1].re * cos)
|
||||
uint src_offset = uint(l) * p.n_freq * 2;
|
||||
|
||||
float val = 0.0;
|
||||
float angle_base = two_pi_over_n * float(j);
|
||||
|
||||
// DC component (k=0)
|
||||
val += data_src[src_offset + 0]; // X[0].real, cos(0)=1
|
||||
|
||||
// Middle frequencies (k=1 to n_freq-2): count twice due to Hermitian symmetry
|
||||
for (uint k = 1; k < p.n_freq - 1; k++) {
|
||||
float re = data_src[src_offset + k * 2 + 0];
|
||||
float im = data_src[src_offset + k * 2 + 1];
|
||||
float angle = angle_base * float(k);
|
||||
val += 2.0 * (re * cos(angle) - im * sin(angle));
|
||||
}
|
||||
|
||||
// Nyquist component (k=n_freq-1)
|
||||
{
|
||||
uint k = p.n_freq - 1;
|
||||
float re = data_src[src_offset + k * 2 + 0];
|
||||
float im = data_src[src_offset + k * 2 + 1];
|
||||
float angle = angle_base * float(k);
|
||||
val += re * cos(angle) - im * sin(angle);
|
||||
}
|
||||
|
||||
val *= inv_n_freq;
|
||||
|
||||
audio_sum += val * hann;
|
||||
env_sum += hann * hann;
|
||||
}
|
||||
|
||||
data_dst[i] = (env_sum > 0.0) ? audio_sum / env_sum : 0.0;
|
||||
}
|
||||
|
|
@ -965,6 +965,8 @@ void process_shaders() {
|
|||
|
||||
string_to_spv("conv_transpose_1d_f32", "conv_transpose_1d.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
|
||||
string_to_spv("istft_f32", "istft.comp", {});
|
||||
|
||||
string_to_spv("pool2d_f32", "pool2d.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
|
||||
string_to_spv("rwkv_wkv6_f32", "wkv6.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));
|
||||
|
|
|
|||
|
|
@ -1003,6 +1003,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
|||
"CONV_3D",
|
||||
"CONV_2D_DW",
|
||||
"CONV_TRANSPOSE_2D",
|
||||
"ISTFT",
|
||||
"POOL_1D",
|
||||
"POOL_2D",
|
||||
"POOL_2D_BACK",
|
||||
|
|
@ -1047,7 +1048,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
|||
"GLU",
|
||||
};
|
||||
|
||||
static_assert(GGML_OP_COUNT == 95, "GGML_OP_COUNT != 95");
|
||||
static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96");
|
||||
|
||||
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||
"none",
|
||||
|
|
@ -1112,6 +1113,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
|||
"conv_3d(x)",
|
||||
"conv_2d_dw(x)",
|
||||
"conv_transpose_2d(x)",
|
||||
"istft(x)",
|
||||
"pool_1d(x)",
|
||||
"pool_2d(x)",
|
||||
"pool_2d_back(x)",
|
||||
|
|
@ -1156,7 +1158,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
|||
"glu(x)",
|
||||
};
|
||||
|
||||
static_assert(GGML_OP_COUNT == 95, "GGML_OP_COUNT != 95");
|
||||
static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96");
|
||||
|
||||
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
|
||||
|
||||
|
|
@ -4818,6 +4820,34 @@ struct ggml_tensor * ggml_conv_transpose_2d_p0(
|
|||
return result;
|
||||
}
|
||||
|
||||
// ggml_istft
|
||||
|
||||
struct ggml_tensor * ggml_istft(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * spectrogram,
|
||||
int n_fft,
|
||||
int hop_length,
|
||||
int win_length) {
|
||||
// spectrogram: [2, n_fft/2+1, n_frames] — complex interleaved (re/im), freq bins, frames
|
||||
GGML_ASSERT(spectrogram->ne[0] == 2);
|
||||
GGML_ASSERT(spectrogram->ne[1] == n_fft / 2 + 1);
|
||||
|
||||
const int64_t n_frames = spectrogram->ne[2];
|
||||
const int64_t n_pad = (win_length - hop_length) / 2;
|
||||
const int64_t n_out = (n_frames - 1) * hop_length + win_length - 2 * n_pad;
|
||||
|
||||
const int64_t ne[4] = { n_out, 1, 1, 1 };
|
||||
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 1, ne);
|
||||
|
||||
int32_t params[] = { n_fft, hop_length, win_length };
|
||||
ggml_set_op_params(result, params, sizeof(params));
|
||||
|
||||
result->op = GGML_OP_ISTFT;
|
||||
result->src[0] = spectrogram;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// ggml_pool_*
|
||||
|
||||
static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, float p) {
|
||||
|
|
|
|||
|
|
@ -4744,6 +4744,40 @@ struct test_conv_transpose_1d : public test_case {
|
|||
}
|
||||
};
|
||||
|
||||
// GGML_OP_ISTFT
|
||||
struct test_istft : public test_case {
|
||||
const int n_fft;
|
||||
const int hop_length;
|
||||
const int win_length;
|
||||
const int n_frames;
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR4(n_fft, hop_length, win_length, n_frames);
|
||||
}
|
||||
|
||||
test_istft(int n_fft = 1280, int hop_length = 320,
|
||||
int win_length = 1280, int n_frames = 87)
|
||||
: n_fft(n_fft), hop_length(hop_length),
|
||||
win_length(win_length), n_frames(n_frames) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
int n_freq = n_fft / 2 + 1;
|
||||
// Input: complex spectrogram [2, n_freq, n_frames]
|
||||
ggml_tensor * spec = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 2, n_freq, n_frames);
|
||||
ggml_set_name(spec, "spectrogram");
|
||||
|
||||
ggml_tensor * out = ggml_istft(ctx, spec, n_fft, hop_length, win_length);
|
||||
ggml_set_name(out, "audio");
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
// iSTFT involves trig functions — need looser tolerance than default 1e-7
|
||||
double max_nmse_err() override {
|
||||
return 1e-5;
|
||||
}
|
||||
};
|
||||
|
||||
// GGML_OP_CONV_TRANSPOSE_2D
|
||||
struct test_conv_transpose_2d : public test_case {
|
||||
const std::array<int64_t, 4> ne_input;
|
||||
|
|
@ -7370,6 +7404,12 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
|||
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,1,2,1}, 1, 0, 1));
|
||||
test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1));
|
||||
|
||||
// ISTFT — TTS spectral reconstruction
|
||||
test_cases.emplace_back(new test_istft()); // default: TTS params
|
||||
test_cases.emplace_back(new test_istft(1280, 320, 1280, 10)); // few frames
|
||||
test_cases.emplace_back(new test_istft(512, 128, 512, 50)); // smaller FFT
|
||||
test_cases.emplace_back(new test_istft(1024, 256, 1024, 30)); // power-of-2 FFT
|
||||
|
||||
test_cases.emplace_back(new test_conv_transpose_2d({3, 2, 3, 1}, {2, 2, 1, 3}, 1));
|
||||
test_cases.emplace_back(new test_conv_transpose_2d({10, 10, 9, 1}, {3, 3, 1, 9}, 2));
|
||||
test_cases.emplace_back(new test_conv_transpose_2d({129, 63, 35, 1}, {3, 3, 48, 35}, 1));
|
||||
|
|
@ -8506,6 +8546,9 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
|
|||
test_cases.emplace_back(new test_conv_transpose_2d({16, 16, 16, 1}, {3, 3, 8, 16}, 1));
|
||||
test_cases.emplace_back(new test_conv_transpose_2d({10, 10, 9, 1}, {3, 3, 1, 9}, 2));
|
||||
|
||||
test_cases.emplace_back(new test_istft(1280, 320, 1280, 87)); // short TTS
|
||||
test_cases.emplace_back(new test_istft(1280, 320, 1280, 712)); // long TTS
|
||||
|
||||
test_cases.emplace_back(new test_mean(GGML_TYPE_F32, {256, 256, 3, 1}));
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -5,6 +5,9 @@
|
|||
#include "sampling.h"
|
||||
#include "log.h"
|
||||
#include "llama.h"
|
||||
#include "ggml.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml-alloc.h"
|
||||
|
||||
#define JSON_ASSERT GGML_ASSERT
|
||||
#include <nlohmann/json.hpp>
|
||||
|
|
@ -113,168 +116,74 @@ static bool save_wav16(const std::string & fname, const std::vector<float> & dat
|
|||
return file.good();
|
||||
}
|
||||
|
||||
static void fill_hann_window(int length, bool periodic, float * output) {
|
||||
int offset = -1;
|
||||
if (periodic) {
|
||||
offset = 0;
|
||||
}
|
||||
for (int i = 0; i < length; i++) {
|
||||
output[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset)));
|
||||
}
|
||||
}
|
||||
|
||||
// very poor-man fft
|
||||
static void twiddle(float * real, float * imag, int k, int N) {
|
||||
float angle = 2 * M_PI * k / N;
|
||||
*real = cos(angle);
|
||||
*imag = sin(angle);
|
||||
}
|
||||
|
||||
static void irfft(int n, const float * inp_cplx, float * out_real) {
|
||||
int N = n / 2 + 1;
|
||||
|
||||
std::vector<float> real_input(N);
|
||||
std::vector<float> imag_input(N);
|
||||
for (int i = 0; i < N; ++i) {
|
||||
real_input[i] = inp_cplx[2 * i];
|
||||
imag_input[i] = inp_cplx[2 * i + 1];
|
||||
}
|
||||
|
||||
std::vector<float> real_output(n);
|
||||
std::vector<float> imag_output(n);
|
||||
|
||||
for (int k = 0; k < n; ++k) {
|
||||
real_output[k] = 0.0f;
|
||||
imag_output[k] = 0.0f;
|
||||
for (int m = 0; m < N; ++m) {
|
||||
float twiddle_real;
|
||||
float twiddle_imag;
|
||||
|
||||
twiddle(&twiddle_real, &twiddle_imag, k * m, n);
|
||||
|
||||
real_output[k] += real_input[m] * twiddle_real - imag_input[m] * twiddle_imag;
|
||||
imag_output[k] += real_input[m] * twiddle_imag + imag_input[m] * twiddle_real;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < n; ++i) {
|
||||
out_real[i] = real_output[i] / N;
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// y = torch.nn.functional.fold(
|
||||
// data, output_size=(1, output_size), kernel_size=(1, self.win_length), stride=(1, self.hop_length),
|
||||
// )[:, 0, 0, pad:-pad]
|
||||
//
|
||||
// data.shape = torch.Size([1, 1280, 261])
|
||||
// output_size = 84480
|
||||
// win_length = 1280
|
||||
// hop_length = 320
|
||||
// pad = 480
|
||||
//
|
||||
static void fold(const std::vector<float> & data, int64_t n_out, int64_t n_win, int64_t n_hop, int64_t n_pad, std::vector<float> & output) {
|
||||
int64_t output_height = n_out;
|
||||
int64_t kernel_w = n_win;
|
||||
int64_t stride_w = n_hop;
|
||||
int64_t width = n_out;
|
||||
|
||||
output.resize(width, 0.0f);
|
||||
|
||||
int64_t col_idx = 0;
|
||||
for (int64_t w_col = 0; w_col < width; ++w_col) {
|
||||
int64_t start = w_col * stride_w - n_pad;
|
||||
int64_t end = start + kernel_w;
|
||||
|
||||
for (int64_t w_im = start; w_im < end; ++w_im) {
|
||||
if (w_im >= 0 && w_im < output_height && col_idx < (int64_t) data.size()) {
|
||||
output[w_im] += data[col_idx];
|
||||
}
|
||||
col_idx++;
|
||||
}
|
||||
}
|
||||
|
||||
output.resize(n_out - 2 * n_pad);
|
||||
}
|
||||
|
||||
// TODO: not optimized at all
|
||||
// Spectral ops via ggml_istft — dispatches to Vulkan GPU when available, CPU fallback.
|
||||
// Preprocessing (transpose, exp, polar→cartesian) stays in C++ since it's O(n) and fast.
|
||||
static std::vector<float> embd_to_audio(
|
||||
const float * embd,
|
||||
const int n_codes,
|
||||
const int n_embd,
|
||||
const int n_thread) {
|
||||
const int n_fft = 1280;
|
||||
const int n_hop = 320;
|
||||
const int n_win = 1280;
|
||||
const int n_pad = (n_win - n_hop)/2;
|
||||
const int n_out = (n_codes - 1)*n_hop + n_win;
|
||||
const int n_fft = 1280;
|
||||
const int n_hop = 320;
|
||||
const int n_win = 1280;
|
||||
const int n_freq = n_fft / 2 + 1; // 641
|
||||
const int n_pad = (n_win - n_hop) / 2;
|
||||
const int64_t n_out = (int64_t)(n_codes - 1) * n_hop + n_win - 2 * n_pad;
|
||||
|
||||
std::vector<float> hann(n_fft);
|
||||
GGML_ASSERT(n_embd == 2 * n_freq); // magnitude + phase
|
||||
|
||||
fill_hann_window(hann.size(), true, hann.data());
|
||||
// --- Preprocessing: embeddings → complex spectrogram [2, n_freq, n_codes] ---
|
||||
// Layout for ggml_istft: ne[0]=2 (re/im interleaved), ne[1]=n_freq, ne[2]=n_frames
|
||||
std::vector<float> spec(2 * n_freq * n_codes);
|
||||
|
||||
int n_spec = n_embd*n_codes;
|
||||
for (int l = 0; l < n_codes; l++) {
|
||||
for (int k = 0; k < n_freq; k++) {
|
||||
float mag = embd[l * n_embd + k];
|
||||
float phi = embd[l * n_embd + k + n_freq];
|
||||
|
||||
std::vector<float> E (n_spec);
|
||||
std::vector<float> S (n_spec);
|
||||
std::vector<float> ST(n_spec);
|
||||
|
||||
for (int l = 0; l < n_codes; ++l) {
|
||||
for (int k = 0; k < n_embd; ++k) {
|
||||
E[k*n_codes + l] = embd[l*n_embd + k];
|
||||
}
|
||||
}
|
||||
|
||||
for (int k = 0; k < n_embd/2; ++k) {
|
||||
for (int l = 0; l < n_codes; ++l) {
|
||||
float mag = E[(k )*n_codes + l];
|
||||
float phi = E[(k + n_embd/2)*n_codes + l];
|
||||
|
||||
mag = exp(mag);
|
||||
|
||||
if (mag > 1e2) {
|
||||
mag = 1e2;
|
||||
mag = expf(mag);
|
||||
if (mag > 1e2f) {
|
||||
mag = 1e2f;
|
||||
}
|
||||
S[2*(k*n_codes + l) + 0] = mag*cosf(phi);
|
||||
S[2*(k*n_codes + l) + 1] = mag*sinf(phi);
|
||||
|
||||
// Store as [2, n_freq, n_codes]: index = (l * n_freq + k) * 2
|
||||
spec[(l * n_freq + k) * 2 + 0] = mag * cosf(phi);
|
||||
spec[(l * n_freq + k) * 2 + 1] = mag * sinf(phi);
|
||||
}
|
||||
}
|
||||
|
||||
for (int l = 0; l < n_codes; ++l) {
|
||||
for (int k = 0; k < n_embd/2; ++k) {
|
||||
ST[l*n_embd + 2*k + 0] = S[2*(k*n_codes + l) + 0];
|
||||
ST[l*n_embd + 2*k + 1] = S[2*(k*n_codes + l) + 1];
|
||||
}
|
||||
}
|
||||
// --- Build ggml graph with ggml_istft ---
|
||||
struct ggml_init_params ggml_params = {
|
||||
/*.mem_size =*/ ggml_tensor_overhead() * 3 + ggml_graph_overhead(),
|
||||
/*.mem_buffer =*/ NULL,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
struct ggml_context * ctx = ggml_init(ggml_params);
|
||||
|
||||
std::vector<float> res (n_codes*n_fft);
|
||||
std::vector<float> hann2(n_codes*n_fft);
|
||||
struct ggml_tensor * spectrogram = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 2, n_freq, n_codes);
|
||||
struct ggml_tensor * audio_out = ggml_istft(ctx, spectrogram, n_fft, n_hop, n_win);
|
||||
|
||||
std::vector<std::thread> workers(n_thread);
|
||||
for (int i = 0; i < n_thread; ++i) {
|
||||
workers[i] = std::thread([&, i]() {
|
||||
for (int l = i; l < n_codes; l += n_thread) {
|
||||
irfft(n_fft, ST.data() + l*n_embd, res.data() + l*n_fft);
|
||||
for (int j = 0; j < n_fft; ++j) {
|
||||
res [l*n_fft + j] *= hann[j];
|
||||
hann2[l*n_fft + j] = hann[j] * hann[j];
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
for (int i = 0; i < n_thread; ++i) {
|
||||
workers[i].join();
|
||||
}
|
||||
struct ggml_cgraph * graph = ggml_new_graph(ctx);
|
||||
ggml_build_forward_expand(graph, audio_out);
|
||||
|
||||
std::vector<float> audio;
|
||||
std::vector<float> env;
|
||||
// --- Allocate and compute on CPU backend ---
|
||||
// When run standalone, uses CPU. When integrated into a full ggml pipeline
|
||||
// with Vulkan, ggml_istft dispatches to the Vulkan compute shader.
|
||||
ggml_backend_t backend = ggml_backend_cpu_init();
|
||||
ggml_backend_cpu_set_n_threads(backend, n_thread);
|
||||
|
||||
fold(res, n_out, n_win, n_hop, n_pad, audio);
|
||||
fold(hann2, n_out, n_win, n_hop, n_pad, env); // TODO: can be done once
|
||||
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);
|
||||
ggml_backend_tensor_set(spectrogram, spec.data(), 0, spec.size() * sizeof(float));
|
||||
ggml_backend_graph_compute(backend, graph);
|
||||
|
||||
for (size_t i = 0; i < audio.size(); ++i) {
|
||||
audio[i] /= env[i];
|
||||
}
|
||||
// Read result
|
||||
std::vector<float> audio(n_out);
|
||||
ggml_backend_tensor_get(audio_out, audio.data(), 0, n_out * sizeof(float));
|
||||
|
||||
// Cleanup
|
||||
ggml_backend_buffer_free(buf);
|
||||
ggml_backend_free(backend);
|
||||
ggml_free(ctx);
|
||||
|
||||
return audio;
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue