diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 395d0d37ba..78bc36bbee 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4515,12 +4515,15 @@ class Qwen3OmniMmprojModel(Qwen3VLVisionModel, Qwen25AudioModel): name = name.replace(".ln_q", ".norm") name = name.replace(".mlp.0", ".linear_fc1") name = name.replace(".mlp.2", ".linear_fc2") - if ".merger." in name: + elif ".merger." in name: name = name.replace(".ln_q", ".norm") name = name.replace(".mlp.0", ".linear_fc1") name = name.replace(".mlp.2", ".linear_fc2") return Qwen3VLVisionModel.modify_tensors(self, data_torch, name, bid) elif "audio_tower." in name: + if "conv2d" in name and name.endswith(".bias"): + # transform conv2d bias [n_embd] --> [1, 1, n_embd] + data_torch = data_torch.unsqueeze(-1).unsqueeze(-1) return Qwen25AudioModel.modify_tensors(self, data_torch, name, bid) return [] @@ -4555,9 +4558,10 @@ class Qwen3VLTextModel(Qwen3Model): def set_gguf_parameters(self): super().set_gguf_parameters() - - # Handle MRoPE (Multi-axis Rotary Position Embedding) for Qwen3-VL - vision_config = self.hparams.get("vision_config", {}) + if "thinker_config" in self.hparams: + vision_config = self.hparams["thinker_config"].get("vision_config", {}) + else: + vision_config = self.hparams.get("vision_config", {}) deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", [])) self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num) @@ -4575,7 +4579,10 @@ class Qwen3VLMoeTextModel(Qwen3MoeModel): def set_gguf_parameters(self): super().set_gguf_parameters() - vision_config = self.hparams.get("vision_config", {}) + if "thinker_config" in self.hparams: + vision_config = self.hparams["thinker_config"].get("vision_config", {}) + else: + vision_config = self.hparams.get("vision_config", {}) deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", [])) self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num) diff --git a/gguf-py/gguf/constants.py 
b/gguf-py/gguf/constants.py index 1599aa15d9..190d4e353e 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -700,6 +700,7 @@ class MODEL_TENSOR(IntEnum): A_ENC_EMBD_NORM = auto() A_ENC_EMBD_TO_LOGITS = auto() A_ENC_CONV1D = auto() + A_ENC_CONV2D = auto() A_ENC_CONV_OUT = auto() A_PRE_NORM = auto() A_POST_NORM = auto() @@ -1098,6 +1099,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = { MODEL_TENSOR.A_ENC_EMBD_NORM: "a.position_embd_norm", MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS: "a.embd_to_logits", MODEL_TENSOR.A_ENC_CONV1D: "a.conv1d.{bid}", + MODEL_TENSOR.A_ENC_CONV2D: "a.conv2d.{bid}", MODEL_TENSOR.A_ENC_CONV_OUT: "a.conv_out", MODEL_TENSOR.A_PRE_NORM: "a.pre_ln", MODEL_TENSOR.A_POST_NORM: "a.post_ln", @@ -1196,6 +1198,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.A_ENC_EMBD_NORM, MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS, MODEL_TENSOR.A_ENC_CONV1D, + MODEL_TENSOR.A_ENC_CONV2D, MODEL_TENSOR.A_ENC_CONV_OUT, MODEL_TENSOR.A_PRE_NORM, MODEL_TENSOR.A_POST_NORM, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index aeb6843c3e..c4d0e45d54 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -1563,6 +1563,9 @@ class TensorNameMap: MODEL_TENSOR.A_ENC_CONV1D: ( "audio_tower.conv{bid}", # ultravox "conformer.pre_encode.conv.{bid}", # lfm2 + ), + + MODEL_TENSOR.A_ENC_CONV2D: ( "audio_tower.conv2d{bid}", # qwen3omni ), diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt index 317d5f19fd..a8773e1124 100644 --- a/tools/mtmd/CMakeLists.txt +++ b/tools/mtmd/CMakeLists.txt @@ -27,6 +27,7 @@ add_library(mtmd models/qwen3vl.cpp models/siglip.cpp models/whisper-enc.cpp + models/qwen3a.cpp ) set_target_properties(mtmd PROPERTIES diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index f77ec73f4a..a6fad4cbe9 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -125,6 +125,7 @@ // ultravox #define TN_CONV1D "a.conv1d.%d.%s" +#define TN_CONV2D 
"a.conv2d.%d.%s" #define TN_CONV_OUT "a.conv_out.%s" #define TN_MM_AUDIO_MLP "mm.a.mlp.%d.%s" #define TN_MM_AUDIO_FC "mm.a.fc.%s" // fully connected layer diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h index 2c4c1547a2..41f364e5d0 100644 --- a/tools/mtmd/clip-model.h +++ b/tools/mtmd/clip-model.h @@ -304,6 +304,14 @@ struct clip_model { ggml_tensor * mm_norm_pre_b = nullptr; ggml_tensor * mm_norm_mid_w = nullptr; + // qwen3a + ggml_tensor * conv2d_1_w = nullptr; + ggml_tensor * conv2d_1_b = nullptr; + ggml_tensor * conv2d_2_w = nullptr; + ggml_tensor * conv2d_2_b = nullptr; + ggml_tensor * conv2d_3_w = nullptr; + ggml_tensor * conv2d_3_b = nullptr; + // cogvlm ggml_tensor * mm_post_fc_norm_w = nullptr; ggml_tensor * mm_post_fc_norm_b = nullptr; diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index af926c6da0..da7f320ffb 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -817,7 +817,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 case PROJECTOR_TYPE_ULTRAVOX: case PROJECTOR_TYPE_VOXTRAL: case PROJECTOR_TYPE_QWEN2A: - case PROJECTOR_TYPE_QWEN3A: case PROJECTOR_TYPE_GLMA: case PROJECTOR_TYPE_MUSIC_FLAMINGO: { @@ -847,6 +846,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 { builder = std::make_unique(ctx, img); } break; + case PROJECTOR_TYPE_QWEN3A: + { + builder = std::make_unique(ctx, img); + } break; default: GGML_ABORT("missing cgraph builder"); } @@ -1573,10 +1576,12 @@ struct clip_model_loader { } break; case PROJECTOR_TYPE_QWEN3A: { - model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight")); - model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias")); - model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight")); - model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias")); + model.conv2d_1_w = get_tensor(string_format(TN_CONV2D, 1, "weight")); + model.conv2d_1_b = get_tensor(string_format(TN_CONV2D, 1, "bias")); + 
model.conv2d_2_w = get_tensor(string_format(TN_CONV2D, 2, "weight")); + model.conv2d_2_b = get_tensor(string_format(TN_CONV2D, 2, "bias")); + model.conv2d_3_w = get_tensor(string_format(TN_CONV2D, 3, "weight")); + model.conv2d_3_b = get_tensor(string_format(TN_CONV2D, 3, "bias")); model.conv_out_w = get_tensor(string_format(TN_CONV_OUT, "weight")); // no bias model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight")); model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias")); @@ -3058,7 +3063,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im case PROJECTOR_TYPE_VOXTRAL: case PROJECTOR_TYPE_ULTRAVOX: case PROJECTOR_TYPE_QWEN2A: - case PROJECTOR_TYPE_QWEN3A: case PROJECTOR_TYPE_MUSIC_FLAMINGO: { n_patches = img->nx; @@ -3078,6 +3082,10 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im n_patches /= 2; } } break; + case PROJECTOR_TYPE_QWEN3A: + { + return 375; // TODO: calculate this + } break; case PROJECTOR_TYPE_GLMA: { n_patches = img->nx; @@ -3566,6 +3574,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { case PROJECTOR_TYPE_QWEN2A: return ctx->model.mm_fc_w->ne[1]; case PROJECTOR_TYPE_QWEN3A: + return ctx->model.mm_2_w->ne[1] * 4; // 4 for deepstack, TODO: do NOT hardcode case PROJECTOR_TYPE_GLMA: case PROJECTOR_TYPE_LFM2: case PROJECTOR_TYPE_KIMIVL: diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h index e08c33f353..266f01f0f9 100644 --- a/tools/mtmd/models/models.h +++ b/tools/mtmd/models/models.h @@ -71,3 +71,8 @@ struct clip_graph_glm4v : clip_graph { clip_graph_glm4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} ggml_cgraph * build() override; }; + +struct clip_graph_qwen3a : clip_graph { + clip_graph_qwen3a(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; diff --git a/tools/mtmd/models/qwen3a.cpp b/tools/mtmd/models/qwen3a.cpp new file mode 100644 index 
0000000000..2680073290 --- /dev/null +++ b/tools/mtmd/models/qwen3a.cpp @@ -0,0 +1,69 @@ +#include "models.h" + +ggml_cgraph * clip_graph_qwen3a::build() { + ggml_tensor * inp = build_inp_raw(1); + + // conv2d block + // TODO: do we need to split by chunks of n_window each like in the transformers impl? + { + inp = ggml_conv_2d(ctx0, model.conv2d_1_w, inp, 2, 2, 1, 1, 1, 1); + inp = ggml_add(ctx0, inp, model.conv2d_1_b); + inp = ggml_gelu_erf(ctx0, inp); + + inp = ggml_conv_2d(ctx0, model.conv2d_2_w, inp, 2, 2, 1, 1, 1, 1); + inp = ggml_add(ctx0, inp, model.conv2d_2_b); + inp = ggml_gelu_erf(ctx0, inp); + + inp = ggml_conv_2d(ctx0, model.conv2d_3_w, inp, 2, 2, 1, 1, 1, 1); + inp = ggml_add(ctx0, inp, model.conv2d_3_b); + inp = ggml_gelu_erf(ctx0, inp); + + // inp is now [time, frames, channels] + cb(inp, "after_conv_blocks", -1); + + inp = ggml_permute(ctx0, inp, 2, 1, 0, 3); // [channels, frames, time] + inp = ggml_cont(ctx0, inp); + inp = ggml_reshape_2d(ctx0, inp, inp->ne[0] * inp->ne[1], inp->ne[2]); // [channels * frames, time] + + // project to n_embd + inp = ggml_mul_mat(ctx0, model.conv_out_w, inp); + if (model.conv_out_b) { + inp = ggml_add(ctx0, inp, model.conv_out_b); + } + cb(inp, "after_conv_out", -1); + } + + auto n_pos = inp->ne[1]; + + ggml_tensor * pos_embd_selected = ggml_view_2d( + ctx0, model.position_embeddings, + model.position_embeddings->ne[0], n_pos, + model.position_embeddings->nb[1], 0 + ); + ggml_tensor * cur = build_vit( + inp, n_pos, + NORM_TYPE_NORMAL, + hparams.ffn_op, + pos_embd_selected, + nullptr); + + cb(cur, "after_transformer", -1); + + // projector + cur = build_ffn(cur, + model.mm_1_w, model.mm_1_b, + nullptr, nullptr, + model.mm_2_w, model.mm_2_b, + FFN_GELU_ERF, + -1); + + cb(cur, "projected", -1); + + // pad deepstack if needed + // TODO: do NOT hard code 3 here + cur = ggml_pad(ctx0, cur, cur->ne[0] * 3, 0, 0, 0); + + ggml_build_forward_expand(gf, cur); + + return gf; +} diff --git a/tools/mtmd/models/whisper-enc.cpp 
b/tools/mtmd/models/whisper-enc.cpp index a541857797..2f2b127755 100644 --- a/tools/mtmd/models/whisper-enc.cpp +++ b/tools/mtmd/models/whisper-enc.cpp @@ -19,18 +19,9 @@ ggml_cgraph * clip_graph_whisper_enc::build() { cur = ggml_add(ctx0, cur, model.conv1d_2_b); cur = ggml_gelu_erf(ctx0, cur); - // transpose inp = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); cb(inp, "after_conv1d", -1); - - if (model.conv_out_w) { - inp = ggml_mul_mat(ctx0, model.conv_out_w, inp); - if (model.conv_out_b) { - inp = ggml_add(ctx0, inp, model.conv_out_b); - } - cb(inp, "after_conv_out", -1); - } } // sanity check (only check one layer, but it should be the same for all) @@ -86,15 +77,6 @@ ggml_cgraph * clip_graph_whisper_enc::build() { cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur); cur = ggml_add(ctx0, cur, model.mm_fc_b); - } else if (proj_type == PROJECTOR_TYPE_QWEN3A) { - // projector - cur = build_ffn(cur, - model.mm_1_w, model.mm_1_b, - nullptr, nullptr, - model.mm_2_w, model.mm_2_b, - FFN_GELU_ERF, - -1); - } else if (proj_type == PROJECTOR_TYPE_VOXTRAL) { // projector cur = build_ffn(cur, diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index b0b5ab42ab..c18a25b7d3 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -326,6 +326,7 @@ struct mtmd_context { // set preprocessor switch (proj) { case PROJECTOR_TYPE_QWEN2A: + case PROJECTOR_TYPE_QWEN3A: case PROJECTOR_TYPE_QWEN25O: case PROJECTOR_TYPE_ULTRAVOX: case PROJECTOR_TYPE_VOXTRAL: @@ -344,7 +345,7 @@ struct mtmd_context { audio_preproc->initialize(); // set special tokens - if (proj == PROJECTOR_TYPE_QWEN2A) { + if (proj == PROJECTOR_TYPE_QWEN2A || proj == PROJECTOR_TYPE_QWEN3A) { // <|audio_bos|> ... (embeddings) ... <|audio_eos|> aud_beg = "<|audio_bos|>"; aud_end = "<|audio_eos|>";