From 7bab4a306551072492f97d0c4a179e904332e6bb Mon Sep 17 00:00:00 2001 From: EliteGPT AI Date: Wed, 31 Dec 2025 12:57:45 +1000 Subject: [PATCH] model : add Qwen3-Omni multimodal architecture support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds support for Qwen3-Omni, Alibaba's multimodal LLM that handles text and vision. This enables the main LLM architecture and vision encoder support. Main LLM changes: - Add LLM_ARCH_QWEN3OMNI enum and architecture registration - Add hparams loading for MoE-based architecture (48 layers, 128 experts) - Reuse llm_build_qwen3moe graph builder - Add IMROPE type for multimodal position encoding Vision encoder changes (via mtmd): - Add PROJECTOR_TYPE_QWEN3O with auto-conversion to QWEN3VL for vision - Support different embedding dimensions (vision=8192, audio=2048) - Add separate Q/K/V tensor support in qwen3vl graph builder Tested with Qwen3-Omni-30B-Q8_0.gguf on distributed 5-GPU setup: - 41-44 tokens/sec inference speed - Text and vision inference working Note: Audio encoder support is WIP and will follow in a separate PR. šŸ¤– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- convert_hf_to_gguf.py | 309 ++++++++++++++++++++++++++++++++++ gguf-py/gguf/constants.py | 20 +++ src/llama-arch.cpp | 2 + src/llama-arch.h | 1 + src/llama-model.cpp | 18 +- tools/mtmd/clip-impl.h | 2 + tools/mtmd/clip.cpp | 12 ++ tools/mtmd/models/qwen3vl.cpp | 48 ++++-- tools/mtmd/mtmd.cpp | 16 +- 9 files changed, 409 insertions(+), 19 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index f893b24c75..f3f27654c8 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4537,6 +4537,315 @@ class Qwen3VLMoeTextModel(Qwen3MoeModel): return super().modify_tensors(data_torch, name, bid) +@ModelBase.register("Qwen3OmniMoeForConditionalGeneration") +class Qwen3OmniModel(MmprojModel): + """Qwen3-Omni multimodal model converter for audio + vision encoders. 
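+
+    Emits the mmproj GGUF (audio and vision towers plus their projectors);
+    the text weights are converted separately by Qwen3OmniMoeTextModel below.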
+ + Key differences from Qwen2.5-Omni: + - Audio uses conv2d1/conv2d2/conv2d3 (not conv1/conv2) + - Audio has conv_out, ln_post, proj1, proj2 + - Vision has merger_list (deepstack) like Qwen3-VL + - Vision patch_embed is Conv3D (needs 5D→4D tensor splitting) + - Vision has explicit pos_embed.weight + """ + has_vision_encoder = True + has_audio_encoder = True + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # Setup audio config + assert self.hparams_audio is not None + self.hparams_audio["hidden_size"] = self.hparams_audio.get("d_model") + self.hparams_audio["intermediate_size"] = self.hparams_audio.get("encoder_ffn_dim") + self.hparams_audio["num_attention_heads"] = self.hparams_audio.get("encoder_attention_heads") + + # Setup vision config + assert self.hparams_vision is not None + self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_heads") + self.hparams_vision["num_hidden_layers"] = self.hparams_vision.get("depth") + + # Handle image_size - may need to compute from other params + if "image_size" not in self.hparams_vision or self.hparams_vision["image_size"] is None: + self.hparams_vision["image_size"] = 768 # Default for Qwen3-Omni + + # Track deepstack layers + self.is_deepstack_layers = [False] * int(self.hparams_vision.get("num_hidden_layers", 27) or 27) + for idx in self.hparams_vision.get("deepstack_visual_indexes", []): + self.is_deepstack_layers[idx] = True + + def get_vision_config(self) -> dict[str, Any] | None: + return self.global_config.get("thinker_config", {}).get("vision_config") + + def get_audio_config(self) -> dict[str, Any] | None: + return self.global_config.get("thinker_config", {}).get("audio_config") + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + # Set projector type for Qwen3-Omni + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN3O) + + # Audio parameters + assert self.hparams_audio is not None + self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio.get("num_mel_bins", 128)) + self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams_audio.get("layer_norm_eps", 1e-5)) + + # Vision parameters + self.gguf_writer.add_vision_use_gelu(True) # Qwen3-Omni uses GELU + + # Vision attention layernorm eps from text config + text_config = self.global_config.get("thinker_config", {}).get("text_config", {}) + rms_norm_eps = text_config.get("rms_norm_eps", 1e-6) + self.gguf_writer.add_vision_attention_layernorm_eps(rms_norm_eps) + + # Deepstack layers for vision + if any(self.is_deepstack_layers): + self.gguf_writer.add_vision_is_deepstack_layers(self.is_deepstack_layers) + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + """Generate sinusoidal position embeddings for audio encoder.""" + assert self.hparams_audio is not None + max_timescale = 10000 + length = 1500 # Max audio sequence length + channels = self.hparams_audio.get("hidden_size", 1280) + + log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1) + inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2).float()) + scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :] + pos_embd = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1).to(dtype=torch.float32) + + yield ("audio_tower.embed_positions.weight", pos_embd) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + # Keep conv layers in higher precision + if ".conv" in name and ".weight" in name: + return gguf.GGMLQuantizationType.F16 + return 
super().tensor_force_quant(name, new_name, bid, n_dims) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # Strip thinker prefix + if name.startswith("thinker."): + name = name.replace("thinker.", "") + + # Skip text model tensors (handled by text model converter) + if name.startswith("model.") or name.startswith("lm_head") or name.startswith("embed_tokens"): + return [] + + # Skip talker and code2wav (not needed for inference) + if name.startswith("talker.") or name.startswith("code2wav."): + return [] + + # Handle audio tensors + if name.startswith("audio_tower"): + # Strip audio_tower. prefix for processing + audio_name = name.replace("audio_tower.", "") + + # Skip embed_positions - we generate sinusoidal positions in generate_extra_tensors + if audio_name.startswith("embed_positions"): + return [] + + # Handle conv2d1/2/3 - map to a.enc_conv1d.{bid} + for i in [1, 2, 3]: + if audio_name.startswith(f"conv2d{i}."): + suffix = audio_name.split(".", 1)[1] # weight or bias + if suffix == "bias": + data_torch = data_torch.unsqueeze(-1) + new_name = self.format_tensor_name(gguf.MODEL_TENSOR.A_ENC_CONV1D, i - 1, suffix=f".{suffix}") + return [(new_name, data_torch)] + + # Handle conv_out - use a separate conv layer index + if audio_name.startswith("conv_out."): + suffix = audio_name.split(".", 1)[1] + if suffix == "bias": + data_torch = data_torch.unsqueeze(-1) + new_name = self.format_tensor_name(gguf.MODEL_TENSOR.A_ENC_CONV1D, 3, suffix=f".{suffix}") + return [(new_name, data_torch)] + + # Handle ln_post - post normalization + if audio_name.startswith("ln_post."): + suffix = audio_name.split(".", 1)[1] + new_name = self.format_tensor_name(gguf.MODEL_TENSOR.A_POST_NORM, suffix=f".{suffix}") + return [(new_name, data_torch)] + + # Handle proj1/proj2 - audio multimodal projector (use A_MMPROJ which supports bid) + if audio_name.startswith("proj1."): + suffix = audio_name.split(".", 1)[1] + new_name = self.format_tensor_name(gguf.MODEL_TENSOR.A_MMPROJ, 0, suffix=f".{suffix}") + return [(new_name, data_torch)] + if audio_name.startswith("proj2."): + suffix = audio_name.split(".", 1)[1] + new_name = self.format_tensor_name(gguf.MODEL_TENSOR.A_MMPROJ, 1, suffix=f".{suffix}") + return [(new_name, data_torch)] + + # Handle encoder layers - transform to Whisper-compatible names and use map_tensor_name + if audio_name.startswith("layers."): + # Qwen3-Omni uses same layer naming as Whisper/Ultravox + # audio_tower.layers.{bid}.self_attn.q_proj -> audio_tower.layers.{bid}.self_attn.q_proj + # Just add back the audio_tower prefix and use map_tensor_name + return [(self.map_tensor_name(name), data_torch)] + + # Fallback for any other audio tensors + logger.warning(f"Unknown audio tensor: {name}") + return [(self.map_tensor_name(name), data_torch)] + + # Handle visual tensors + if name.startswith("visual."): + # Handle merger_list (deepstack) + if name.startswith("visual.merger_list."): + # Format: visual.merger_list.{idx}.{ln_q|mlp}.{layer}.{weight|bias} + parts = name.split(".") + idx = int(parts[2]) # merger_list index + + # Get actual layer index from deepstack_visual_indexes + deepstack_indexes = self.hparams_vision.get("deepstack_visual_indexes", []) + if idx < len(deepstack_indexes): + layer_idx = deepstack_indexes[idx] + else: + layer_idx = idx # Fallback + + suffix_parts = parts[3:] # Everything after the index + suffix = ".".join(suffix_parts) + + if suffix.startswith("ln_q"): + tensor_type = gguf.MODEL_TENSOR.V_DS_NORM + tail = 
suffix.split(".", 1)[1] if "." in suffix else "weight" + elif suffix.startswith("mlp.0"): + tensor_type = gguf.MODEL_TENSOR.V_DS_FC1 + tail = suffix.split(".", 2)[2] if suffix.count(".") >= 2 else "weight" + elif suffix.startswith("mlp.2"): + tensor_type = gguf.MODEL_TENSOR.V_DS_FC2 + tail = suffix.split(".", 2)[2] if suffix.count(".") >= 2 else "weight" + else: + raise ValueError(f"Unexpected deepstack tensor: {name}") + + new_name = self.format_tensor_name(tensor_type, layer_idx, suffix=f".{tail}") + return [(new_name, data_torch)] + + # Handle main merger + if name.startswith("visual.merger."): + suffix = name.split(".", 2)[2] + if suffix.startswith("mlp.0"): + # First FC layer + tail = suffix.split(".", 2)[2] if suffix.count(".") >= 2 else "weight" + new_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, 0, suffix=f".{tail}") + elif suffix.startswith("mlp.2"): + # Second FC layer + tail = suffix.split(".", 2)[2] if suffix.count(".") >= 2 else "weight" + new_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, 2, suffix=f".{tail}") + elif suffix.startswith("ln_q"): + tail = suffix.split(".", 1)[1] if "." in suffix else "weight" + new_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_POST_NORM, suffix=f".{tail}") + else: + raise ValueError(f"Unexpected merger tensor: {name}") + return [(new_name, data_torch)] + + # Handle QKV split for attention + if ".qkv." in name: + if data_torch.ndim == 2: # weight + c3, _ = data_torch.shape + else: # bias + c3 = data_torch.shape[0] + assert c3 % 3 == 0 + c = c3 // 3 + wq = data_torch[:c] + wk = data_torch[c:c * 2] + wv = data_torch[c * 2:] + return [ + (self.map_tensor_name(name.replace("qkv", "q")), wq), + (self.map_tensor_name(name.replace("qkv", "k")), wk), + (self.map_tensor_name(name.replace("qkv", "v")), wv), + ] + + # Handle patch_embed - Conv3D needs splitting to 4D tensors (GGUF max is 4D) + if name == "visual.patch_embed.proj.weight": + # Split Conv3D into Conv2Ds along temporal dimension + if data_torch.ndim == 5: + c1, c2, kt, kh, kw = data_torch.shape + del c1, c2, kh, kw + if kt != 2: + raise ValueError("Current implementation only supports temporal_patch_size of 2") + return [ + (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight", data_torch[:, :, 0, ...]), + (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight.1", data_torch[:, :, 1, ...]), + ] + return [(self.map_tensor_name(name), data_torch)] + + if name == "visual.patch_embed.proj.bias": + return [(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".bias", data_torch)] + + # Default handling for other visual tensors + return [(self.map_tensor_name(name), data_torch)] + + # Fall back to parent for any other tensors + return super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("Qwen3OmniMoeForConditionalGeneration") +class Qwen3OmniMoeTextModel(Qwen3MoeModel): + """Qwen3-Omni MoE text model converter. + + Converts the text model (thinker.model.*) from Qwen3-Omni to GGUF format. + The audio and vision encoders are handled by Qwen3OmniModel (mmproj converter). 
+ + Key differences from Qwen3VLMoeTextModel: + - Tensor prefix is thinker.model.* (not model.*) + - Must skip: thinker.audio_tower, thinker.visual, talker, code2wav + - Config structure: thinker_config.text_config (handled by load_hparams) + """ + model_arch = gguf.MODEL_ARCH.QWEN3OMNI + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + # Handle MRoPE (Multi-axis Rotary Position Embedding) for Qwen3-Omni + # The text_config is already merged into hparams by load_hparams + rope_scaling = self.hparams.get("rope_scaling") or self.hparams.get("rope_parameters") or {} + + if rope_scaling.get("mrope_section"): + # mrope_section contains [time, height, width] dimensions + mrope_section = rope_scaling["mrope_section"] + # Pad to 4 dimensions [time, height, width, extra] + while len(mrope_section) < 4: + mrope_section.append(0) + self.gguf_writer.add_rope_dimension_sections(mrope_section[:4]) + logger.info(f"MRoPE sections: {mrope_section[:4]}") + + # Get vision config for deepstack layers (from thinker_config in hparams) + thinker_config = self.hparams.get("thinker_config", {}) + vision_config = thinker_config.get("vision_config", {}) + deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", [])) + if deepstack_layer_num > 0: + self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # Skip multimodal encoders - they go in the mmproj file + if name.startswith("thinker.audio_tower."): + return [] + if name.startswith("thinker.visual."): + return [] + + # Skip talker (speech synthesis) and code2wav (audio generation) - not needed for text inference + if name.startswith("talker."): + return [] + if name.startswith("code2wav."): + return [] + + # Strip thinker prefix to get standard tensor names + # Original names: + # thinker.model.layers.* -> model.layers.* + # thinker.model.embed_tokens.* -> model.embed_tokens.* + # thinker.model.norm.* -> model.norm.* + # thinker.lm_head.* -> lm_head.* (NOT model.lm_head!) + if name.startswith("thinker.model."): + name = name.replace("thinker.model.", "model.", 1) + elif name.startswith("thinker."): + # Handle other thinker tensors (lm_head, etc.) - just strip thinker. 
+ name = name.replace("thinker.", "", 1) + + return super().modify_tensors(data_torch, name, bid) + + @ModelBase.register("GPT2LMHeadModel") class GPT2Model(TextModel): model_arch = gguf.MODEL_ARCH.GPT2 diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 616b8add36..8c6cfb907f 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -372,6 +372,7 @@ class MODEL_ARCH(IntEnum): QWEN3NEXT = auto() QWEN3VL = auto() QWEN3VLMOE = auto() + QWEN3OMNI = auto() PHI2 = auto() PHI3 = auto() PHIMOE = auto() @@ -769,6 +770,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.QWEN3NEXT: "qwen3next", MODEL_ARCH.QWEN3VL: "qwen3vl", MODEL_ARCH.QWEN3VLMOE: "qwen3vlmoe", + MODEL_ARCH.QWEN3OMNI: "qwen3omni", MODEL_ARCH.PHI2: "phi2", MODEL_ARCH.PHI3: "phi3", MODEL_ARCH.PHIMOE: "phimoe", @@ -1720,6 +1722,23 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_UP_EXP, ], + MODEL_ARCH.QWEN3OMNI: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + ], MODEL_ARCH.PLAMO: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, @@ -3485,6 +3504,7 @@ class VisionProjectorType: QWEN2A = "qwen2a" # audio GLMA = "glma" # audio QWEN25O = "qwen2.5o" # omni + QWEN3O = "qwen3o" # qwen3-omni VOXTRAL = "voxtral" LFM2 = "lfm2" KIMIVL = "kimivl" diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 94a6807eac..d70615ff68 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -37,6 +37,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_QWEN3NEXT, "qwen3next" }, { LLM_ARCH_QWEN3VL, "qwen3vl" }, { LLM_ARCH_QWEN3VLMOE, "qwen3vlmoe" }, + { LLM_ARCH_QWEN3OMNI, "qwen3omni" }, { LLM_ARCH_PHI2, "phi2" }, { LLM_ARCH_PHI3, "phi3" }, { LLM_ARCH_PHIMOE, "phimoe" }, @@ -915,6 +916,7 @@ static std::set llm_get_tensor_names(llm_arch arch) { }; case LLM_ARCH_QWEN3MOE: case LLM_ARCH_QWEN3VLMOE: + case LLM_ARCH_QWEN3OMNI: case LLM_ARCH_OLMOE: case LLM_ARCH_LLADA_MOE: case LLM_ARCH_RND1: diff --git a/src/llama-arch.h b/src/llama-arch.h index 714ead4025..346f2ac2b8 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -41,6 +41,7 @@ enum llm_arch { LLM_ARCH_QWEN3NEXT, LLM_ARCH_QWEN3VL, LLM_ARCH_QWEN3VLMOE, + LLM_ARCH_QWEN3OMNI, LLM_ARCH_PHI2, LLM_ARCH_PHI3, LLM_ARCH_PHIMOE, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 5e664c8c57..6008596638 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1144,6 +1144,16 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; + case LLM_ARCH_QWEN3OMNI: + { + ml.get_key(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers, false); + ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + switch (hparams.n_layer) { + case 48: type = LLM_TYPE_30B_A3B; break; + default: type = LLM_TYPE_UNKNOWN; + } + } break; case LLM_ARCH_PHI2: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); @@ -3598,6 +3608,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } break; case LLM_ARCH_QWEN3MOE: case LLM_ARCH_QWEN3VLMOE: + case LLM_ARCH_QWEN3OMNI: case LLM_ARCH_RND1: { tok_embd = 
create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -7115,7 +7126,7 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp); } - if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE || arch == LLM_ARCH_RND1) { + if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE || arch == LLM_ARCH_QWEN3OMNI || arch == LLM_ARCH_RND1) { LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp); } @@ -7510,6 +7521,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { { llm = std::make_unique(*this, params); } break; + case LLM_ARCH_QWEN3OMNI: + { + llm = std::make_unique(*this, params); + } break; case LLM_ARCH_PHI2: { llm = std::make_unique(*this, params); @@ -8081,6 +8096,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { return LLAMA_ROPE_TYPE_MROPE; case LLM_ARCH_QWEN3VL: case LLM_ARCH_QWEN3VLMOE: + case LLM_ARCH_QWEN3OMNI: return LLAMA_ROPE_TYPE_IMROPE; case LLM_ARCH_GLM4: diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index a0939865e3..65231c748f 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -169,6 +169,7 @@ enum projector_type { PROJECTOR_TYPE_GLM_EDGE, PROJECTOR_TYPE_QWEN2VL, PROJECTOR_TYPE_QWEN3VL, + PROJECTOR_TYPE_QWEN3O, // qwen3-omni: converts to QWEN3VL for vision, uses custom encoder for audio PROJECTOR_TYPE_GEMMA3, PROJECTOR_TYPE_IDEFICS3, PROJECTOR_TYPE_PIXTRAL, @@ -199,6 +200,7 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_QWEN2VL, "qwen2vl_merger"}, { PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger"}, { PROJECTOR_TYPE_QWEN3VL, "qwen3vl_merger"}, + { PROJECTOR_TYPE_QWEN3O, "qwen3o"}, { PROJECTOR_TYPE_GEMMA3, "gemma3"}, { PROJECTOR_TYPE_IDEFICS3, "idefics3"}, { PROJECTOR_TYPE_PIXTRAL, "pixtral"}, diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 3ba0823def..c08e7063d8 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -970,6 +970,18 @@ struct clip_model_loader { ? PROJECTOR_TYPE_QWEN25VL : PROJECTOR_TYPE_QWEN2A; } + + // Qwen3-Omni: vision uses qwen3vl pipeline, audio stays qwen3o + if (model.proj_type == PROJECTOR_TYPE_QWEN3O) { + projector_type new_type = modality == CLIP_MODALITY_VISION + ? PROJECTOR_TYPE_QWEN3VL + : PROJECTOR_TYPE_QWEN3O; + LOG_INF("%s: QWEN3O auto-conversion: %s -> %s (modality=%s)\n", __func__, + PROJECTOR_TYPE_NAMES[model.proj_type].c_str(), + PROJECTOR_TYPE_NAMES[new_type].c_str(), + modality == CLIP_MODALITY_VISION ? 
"vision" : "audio"); + model.proj_type = new_type; + } } const bool is_vision = model.modality == CLIP_MODALITY_VISION; diff --git a/tools/mtmd/models/qwen3vl.cpp b/tools/mtmd/models/qwen3vl.cpp index 35a42cb84d..647738a501 100644 --- a/tools/mtmd/models/qwen3vl.cpp +++ b/tools/mtmd/models/qwen3vl.cpp @@ -85,23 +85,43 @@ ggml_cgraph * clip_graph_qwen3vl::build() { // self-attention { - cur = ggml_mul_mat(ctx0, layer.qkv_w, cur); - cur = ggml_add(ctx0, cur, layer.qkv_b); + // Support both separate Q/K/V (Qwen3-Omni) and combined QKV (Qwen3-VL) + ggml_tensor * Qcur; + ggml_tensor * Kcur; + ggml_tensor * Vcur; - ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, - /* nb1 */ ggml_row_size(cur->type, d_head), - /* nb2 */ cur->nb[1], - /* offset */ 0); + if (layer.qkv_w) { + // Combined QKV format + cur = ggml_mul_mat(ctx0, layer.qkv_w, cur); + cur = ggml_add(ctx0, cur, layer.qkv_b); - ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, - /* nb1 */ ggml_row_size(cur->type, d_head), - /* nb2 */ cur->nb[1], - /* offset */ ggml_row_size(cur->type, n_embd)); + Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, + /* nb1 */ ggml_row_size(cur->type, d_head), + /* nb2 */ cur->nb[1], + /* offset */ 0); - ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, - /* nb1 */ ggml_row_size(cur->type, d_head), - /* nb2 */ cur->nb[1], - /* offset */ ggml_row_size(cur->type, 2 * n_embd)); + Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, + /* nb1 */ ggml_row_size(cur->type, d_head), + /* nb2 */ cur->nb[1], + /* offset */ ggml_row_size(cur->type, n_embd)); + + Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, + /* nb1 */ ggml_row_size(cur->type, d_head), + /* nb2 */ cur->nb[1], + /* offset */ ggml_row_size(cur->type, 2 * n_embd)); + } else { + // Separate Q/K/V format (like Qwen3-Omni) + Qcur = ggml_add(ctx0, + ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b); + Kcur = ggml_add(ctx0, + ggml_mul_mat(ctx0, layer.k_w, cur), layer.k_b); + Vcur = ggml_add(ctx0, + ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b); + + Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos); + Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos); + Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos); + } cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index b9c4fa9098..eb559ac19f 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -188,7 +188,15 @@ struct mtmd_context { } // if both vision and audio mmproj are present, we need to validate their n_embd - if (ctx_v && ctx_a) { + // Note: QWEN3O has different embedding dimensions for vision and audio, which is valid + // - Vision uses deepstack, so n_embd_v = n_embd * (1 + n_deepstack_layers) = 8192 + // - Audio doesn't use deepstack, so n_embd_a = projection_dim = 2048 + projector_type proj_v = ctx_v ? clip_get_projector_type(ctx_v) : PROJECTOR_TYPE_UNKNOWN; + projector_type proj_a = ctx_a ? clip_get_projector_type(ctx_a) : PROJECTOR_TYPE_UNKNOWN; + bool is_qwen3o = (proj_v == PROJECTOR_TYPE_QWEN3VL || proj_v == PROJECTOR_TYPE_QWEN3O) && + (proj_a == PROJECTOR_TYPE_QWEN3O); + + if (ctx_v && ctx_a && !is_qwen3o) { int n_embd_v = clip_n_mmproj_embd(ctx_v); int n_embd_a = clip_n_mmproj_embd(ctx_a); if (n_embd_v != n_embd_a) { @@ -198,9 +206,9 @@ struct mtmd_context { } } - // since we already validate n_embd of vision and audio mmproj, - // we can safely assume that they are the same - int n_embd_clip = clip_n_mmproj_embd(ctx_v ? 
ctx_v : ctx_a); + // For QWEN3O, use vision embedding dimension (includes deepstack) for validation + // For other models, vision and audio should have same embedding dimension + int n_embd_clip = is_qwen3o ? clip_n_mmproj_embd(ctx_v) : clip_n_mmproj_embd(ctx_v ? ctx_v : ctx_a); if (n_embd_text != n_embd_clip) { throw std::runtime_error(string_format( "mismatch between text model (n_embd = %d) and mmproj (n_embd = %d)\n"