From 172865e93c2eddab0809771cd606c0c8897b13a1 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Thu, 2 Apr 2026 00:55:44 +0200
Subject: [PATCH] convert ASR model ok

---
 convert_hf_to_gguf.py | 79 +++++++++++++++++++++++++++++++++++++++----
 1 file changed, 73 insertions(+), 6 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 0ca9f97151..d1279668d4 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -4825,7 +4825,10 @@ class RND1Model(Qwen2MoeModel):
 class Qwen3VLVisionModel(MmprojModel):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        assert self.hparams_vision is not None
+        if self.hparams_vision is None:
+            logger.info("No vision config found, skipping vision tensor processing")
+            return
+
         # Compute image_size if not present
         if "image_size" not in self.hparams_vision:
             # For Qwen3VL/Qwen3VLMoe, compute from num_position_embeddings
@@ -4946,18 +4949,29 @@ class Qwen3OmniMmprojModel(Qwen3VLVisionModel, Qwen25AudioModel):
     has_vision_encoder = True
 
     def get_vision_config(self) -> dict[str, Any] | None:
-        return self.global_config["thinker_config"].get("vision_config")
+        if self.has_vision_encoder:
+            return self.global_config["thinker_config"].get("vision_config")
+        else:
+            return None
 
     def get_audio_config(self) -> dict[str, Any] | None:
-        return self.global_config["thinker_config"].get("audio_config")
+        if self.has_audio_encoder:
+            return self.global_config["thinker_config"].get("audio_config")
+        else:
+            return None
 
     def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        self.gguf_writer.add_clip_vision_projector_type(gguf.VisionProjectorType.QWEN3VL)
-        self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.QWEN3A)
+        if self.has_vision_encoder:
+            Qwen3VLVisionModel.set_gguf_parameters(self)
+            self.gguf_writer.add_clip_vision_projector_type(gguf.VisionProjectorType.QWEN3VL)
+        if self.has_audio_encoder:
+            Qwen25AudioModel.set_gguf_parameters(self)
+            self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.QWEN3A)
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         if "visual." in name:
+            if not self.has_vision_encoder:
+                raise ValueError(f"Model does not have vision encoder, but found tensor {name}")
             # need to transform vision tensor naming, so that modify_tensors() logic can be used correctly
             name = name.replace("thinker.visual.", "model.visual.")
             if ".merger_list." in name:
@@ -4971,12 +4985,20 @@ class Qwen3OmniMmprojModel(Qwen3VLVisionModel, Qwen25AudioModel):
                 name = name.replace(".mlp.2", ".linear_fc2")
             yield from Qwen3VLVisionModel.modify_tensors(self, data_torch, name, bid)
         elif "audio_tower." in name:
+            if not self.has_audio_encoder:
+                raise ValueError(f"Model does not have audio encoder, but found tensor {name}")
             if "conv2d" in name and name.endswith(".bias"):
                 # transform conv2d bias [n_embd] --> [1, 1, n_embd]
                 data_torch = data_torch.unsqueeze(-1).unsqueeze(-1)
             yield from Qwen25AudioModel.modify_tensors(self, data_torch, name, bid)
 
 
+@ModelBase.register("Qwen3ASRForConditionalGeneration")
+class Qwen3ASRMmprojModel(Qwen3OmniMmprojModel):
+    has_audio_encoder = True
+    has_vision_encoder = False
+
+
 @ModelBase.register("Glm4vForConditionalGeneration", "Glm4vMoeForConditionalGeneration", "GlmOcrForConditionalGeneration")
 class Glm4VVisionModel(Qwen3VLVisionModel):
     def set_gguf_parameters(self):
@@ -5023,6 +5045,31 @@ class Qwen3VLTextModel(Qwen3Model):
         yield from super().modify_tensors(data_torch, name, bid)
 
 
+@ModelBase.register("Qwen3ASRForConditionalGeneration")
+class Qwen3ASRTextModel(Qwen3VLTextModel):
+    model_arch = gguf.MODEL_ARCH.QWEN3VL
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_num_deepstack_layers(0)
+
+    def set_vocab(self):
+        super().set_vocab()
+        # fix chat template, use correct chatml format
+        self.gguf_writer.add_chat_template("{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}")
+
+    def modify_tensors(self, data_torch, name, bid):
+        # qwen3-omni
+        name = name.replace("thinker.", "")
+
+        # Skip vision and audio tensors - they go in the mmproj file
+        if "visual." in name or "audio_tower." in name \
+                or "talker." in name or "code2wav." in name:
+            return
+
+        yield from super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("Qwen3VLMoeForConditionalGeneration", "Qwen3OmniMoeForConditionalGeneration")
 class Qwen3VLMoeTextModel(Qwen3MoeModel):
     model_arch = gguf.MODEL_ARCH.QWEN3VLMOE
@@ -5083,6 +5130,26 @@ class Qwen3VLMoeTextModel(Qwen3MoeModel):
         yield from super().modify_tensors(data_torch, name, bid)
 
 
+@ModelBase.register("Qwen3OmniMoeForConditionalGeneration")
+class Qwen3OmniMoeTextModel(Qwen3VLMoeTextModel):
+    model_arch = gguf.MODEL_ARCH.QWEN3VLMOE
+
+    def set_vocab(self):
+        super().set_vocab()
+        # correct BOS/EOS tokens
+        with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
+            tokenizer_config = json.load(f)
+        added_tokens = tokenizer_config.get("added_tokens_decoder", {})
+        for token_id, data in added_tokens.items():
+            if data.get("content") == "<|im_end|>":
+                self.gguf_writer.add_bos_token_id(int(token_id))
+                self.gguf_writer.add_eos_token_id(int(token_id))
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_num_deepstack_layers(0)
+
+
 class _LinearAttentionVReorderBase(Qwen3NextModel):
     model_arch = gguf.MODEL_ARCH.QWEN3NEXT  # overridden by subclasses
     """reorders V heads from grouped to tiled order for ggml broadcast
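
A note on the design for reviewers: Qwen3ASRMmprojModel reuses the Qwen3-Omni mmproj converter unchanged and turns the vision path off purely through the has_vision_encoder / has_audio_encoder class attributes. This works because set_gguf_parameters() and modify_tensors() call each parent class explicitly (Qwen3VLVisionModel.set_gguf_parameters(self), Qwen25AudioModel.modify_tensors(self, ...)) rather than going through super(), so a subclass can disable one branch without overriding any method. Below is a minimal self-contained sketch of that gating pattern; the class names are hypothetical stand-ins, not the actual converter classes:

# Sketch of the encoder-flag gating used in this patch. VisionPart,
# AudioPart, OmniProj and ASRProj are illustrative stand-ins for
# Qwen3VLVisionModel, Qwen25AudioModel, Qwen3OmniMmprojModel and
# Qwen3ASRMmprojModel.

class VisionPart:
    def set_gguf_parameters(self):
        print("writing vision hparams")

class AudioPart:
    def set_gguf_parameters(self):
        print("writing audio hparams")

class OmniProj(VisionPart, AudioPart):
    has_vision_encoder = True
    has_audio_encoder = True

    def set_gguf_parameters(self):
        # call each parent explicitly instead of via super(), so a
        # subclass can switch a branch off with a class attribute alone
        if self.has_vision_encoder:
            VisionPart.set_gguf_parameters(self)
        if self.has_audio_encoder:
            AudioPart.set_gguf_parameters(self)

class ASRProj(OmniProj):
    has_vision_encoder = False  # audio-only: no methods overridden

OmniProj().set_gguf_parameters()  # writes vision + audio hparams
ASRProj().set_gguf_parameters()   # writes audio hparams only

The conversion flow itself is unchanged: assuming the existing --mmproj flow of convert_hf_to_gguf.py applies to Qwen3-ASR as it does to the other Omni-style models, the text model and the audio projector are produced by the usual two converter passes, with the second pass passing --mmproj.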