From 792edb7d7a07e5a55c2e651a84f69fe8b46db402 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Thu, 2 Apr 2026 00:36:47 +0200
Subject: [PATCH] no more deepstack for audio

---
 tools/mtmd/models/qwen3a.cpp | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/tools/mtmd/models/qwen3a.cpp b/tools/mtmd/models/qwen3a.cpp
index 2680073290..1384e5155e 100644
--- a/tools/mtmd/models/qwen3a.cpp
+++ b/tools/mtmd/models/qwen3a.cpp
@@ -18,12 +18,15 @@ ggml_cgraph * clip_graph_qwen3a::build() {
         inp = ggml_add(ctx0, inp, model.conv2d_3_b);
         inp = ggml_gelu_erf(ctx0, inp);
 
-        // inp is now [time, frames, channels]
+        // inp [n_pos, n_mels/8, channels, 1] (W, H, C, N)
         cb(inp, "after_conv_blocks", -1);
 
-        inp = ggml_permute(ctx0, inp, 2, 1, 0, 3); // [channels, frames, time]
-        inp = ggml_cont(ctx0, inp);
-        inp = ggml_reshape_2d(ctx0, inp, inp->ne[0] * inp->ne[1], inp->ne[2]); // [channels * time, frames]
+        const int64_t n_pos_after_conv = inp->ne[0];
+        const int64_t n_mel_after_conv = inp->ne[1]; // 128/8 = 16
+
+        inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 3, 1));
+        inp = ggml_reshape_2d(ctx0, inp, n_pos_after_conv, n_mel_after_conv * inp->ne[3]); // [n_pos, 7680]
+        inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp)); // [7680, n_pos]
 
         // project to n_embd
         inp = ggml_mul_mat(ctx0, model.conv_out_w, inp);
@@ -59,10 +62,6 @@ ggml_cgraph * clip_graph_qwen3a::build() {
 
     cb(cur, "projected", -1);
 
-    // pad deepstack if needed
-    // TODO: do NOT hard code 3 here
-    cur = ggml_pad(ctx0, cur, cur->ne[0] * 3, 0, 0, 0);
-
     ggml_build_forward_expand(gf, cur);
 
     return gf;