no more deepstack for audio

This commit is contained in:
Xuan Son Nguyen 2026-04-02 00:36:47 +02:00
parent e0adcf7232
commit 792edb7d7a
1 changed file with 7 additions and 8 deletions

View File

@ -18,12 +18,15 @@ ggml_cgraph * clip_graph_qwen3a::build() {
inp = ggml_add(ctx0, inp, model.conv2d_3_b);
inp = ggml_gelu_erf(ctx0, inp);
// inp is now [time, frames, channels]
// inp [n_pos, n_mels/8, channels, 1] (W, H, C, N)
cb(inp, "after_conv_blocks", -1);
inp = ggml_permute(ctx0, inp, 2, 1, 0, 3); // [channels, frames, time]
inp = ggml_cont(ctx0, inp);
inp = ggml_reshape_2d(ctx0, inp, inp->ne[0] * inp->ne[1], inp->ne[2]); // [channels * frames, time]
const int64_t n_pos_after_conv = inp->ne[0];
const int64_t n_mel_after_conv = inp->ne[1]; // 128/8 = 16
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 3, 1));
inp = ggml_reshape_2d(ctx0, inp, n_pos_after_conv, n_mel_after_conv * inp->ne[3]); // [n_pos, 7680]
inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp)); // [7680, n_pos]
// project to n_embd
inp = ggml_mul_mat(ctx0, model.conv_out_w, inp);
@ -59,10 +62,6 @@ ggml_cgraph * clip_graph_qwen3a::build() {
cb(cur, "projected", -1);
// pad deepstack if needed
// TODO: do NOT hard code 3 here
cur = ggml_pad(ctx0, cur, cur->ne[0] * 3, 0, 0, 0);
ggml_build_forward_expand(gf, cur);
return gf;