diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index d559d70631..0ca9f97151 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -4936,7 +4936,7 @@ class Qwen3VLVisionModel(MmprojModel):
             return
 
         if name.startswith("visual."):
-            return [(self.map_tensor_name(name), data_torch)]
+            yield (self.map_tensor_name(name), data_torch)
 
         return [] # skip other tensors
 
@@ -4975,7 +4975,6 @@ class Qwen3OmniMmprojModel(Qwen3VLVisionModel, Qwen25AudioModel):
             # transform conv2d bias [n_embd] --> [1, 1, n_embd]
             data_torch = data_torch.unsqueeze(-1).unsqueeze(-1)
         yield from Qwen25AudioModel.modify_tensors(self, data_torch, name, bid)
-        return []
 
 
 @ModelBase.register("Glm4vForConditionalGeneration", "Glm4vMoeForConditionalGeneration", "GlmOcrForConditionalGeneration")
@@ -5028,14 +5027,20 @@ class Qwen3VLTextModel(Qwen3Model):
 class Qwen3VLMoeTextModel(Qwen3MoeModel):
     model_arch = gguf.MODEL_ARCH.QWEN3VLMOE
 
+    def set_vocab(self):
+        super().set_vocab()
+        # correct BOS/EOS tokens
+        with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
+            tokenizer_config = json.load(f)
+        added_tokens = tokenizer_config.get("added_tokens_decoder", {})
+        for token_id, data in added_tokens.items():
+            if data.get("content") == "<|im_end|>":
+                self.gguf_writer.add_bos_token_id(int(token_id))
+                self.gguf_writer.add_eos_token_id(int(token_id))
+
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
-        if "thinker_config" in self.hparams:
-            vision_config = self.hparams["thinker_config"].get("vision_config", {})
-        else:
-            vision_config = self.hparams.get("vision_config", {})
-        deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", []))
-        self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num)
+        self.gguf_writer.add_num_deepstack_layers(0)
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         # Skip vision tensors - they go in the mmproj file
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 3178ea6601..12a298e9a8 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -2669,7 +2669,12 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
             } break;
         case PROJECTOR_TYPE_QWEN3A:
             {
-                return 375; // TODO: calculate this
+                // 3x stride-2 conv2d: each step is floor((n-1)/2)+1
+                int n = img->nx;
+                n = (n - 1) / 2 + 1;
+                n = (n - 1) / 2 + 1;
+                n = (n - 1) / 2 + 1;
+                n_patches = n;
             } break;
         case PROJECTOR_TYPE_GLMA:
             {
@@ -3256,7 +3261,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
         case PROJECTOR_TYPE_QWEN2A:
             return ctx->model.mm_fc_w->ne[1];
         case PROJECTOR_TYPE_QWEN3A:
-            return ctx->model.mm_2_w->ne[1] * 4; // 4 for deepstack, TODO: do NOT hardcode
+            return ctx->model.mm_2_w->ne[1];
         case PROJECTOR_TYPE_GLMA:
         case PROJECTOR_TYPE_LFM2:
         case PROJECTOR_TYPE_KIMIVL:
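
Note (reviewer sketch, not part of the patch): the first convert_hf_to_gguf.py hunk fixes generator semantics. Assuming Qwen3VLVisionModel.modify_tensors is a generator (the trailing `return []` and the `yield from` in the Qwen3Omni subclass suggest it is), a `return [(...)]` inside it ends iteration and discards the list instead of producing the tuple, so the caller silently drops the tensor. A self-contained Python illustration with made-up names:

def broken():
    if True:
        return [("tensor.name", "data")]  # inside a generator, this value is discarded
    yield  # unreachable, but its mere presence makes `broken` a generator

def fixed():
    if True:
        yield ("tensor.name", "data")
    return []  # returning from a generator simply stops iteration

print(list(broken()))  # []
print(list(fixed()))   # [('tensor.name', 'data')]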
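
Note (reviewer sketch, not part of the patch): the new set_vocab override keys off the added_tokens_decoder map that Hugging Face tokenizers serialize into tokenizer_config.json as {token_id: {"content": ..., ...}}. Below is a sketch of that lookup against a hand-written config; the token ids are illustrative, not taken from the actual Qwen3-VL-MoE files:

import json

config_text = '''
{
  "added_tokens_decoder": {
    "151644": {"content": "<|im_start|>", "special": true},
    "151645": {"content": "<|im_end|>", "special": true}
  }
}
'''
tokenizer_config = json.loads(config_text)
for token_id, data in tokenizer_config.get("added_tokens_decoder", {}).items():
    if data.get("content") == "<|im_end|>":
        print(int(token_id))  # this id gets written as both the BOS and EOS token id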
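
Note (reviewer sketch, not part of the patch): the clip_n_output_tokens hunk derives the token count instead of hardcoding 375. The recurrence floor((n-1)/2)+1 is the conv output-size formula out = floor((n + 2*pad - k)/stride) + 1 with k=3, pad=1, stride=2 (kernel and padding inferred from the comment, not checked against the model code); applied three times to a 3000-frame mel input it yields exactly 375, which is presumably where the old constant came from. A quick PyTorch cross-check:

import torch

def n_output_tokens(n: int, n_convs: int = 3) -> int:
    # per conv: out = floor((n + 2*1 - 3) / 2) + 1 = floor((n - 1) / 2) + 1
    for _ in range(n_convs):
        n = (n - 1) // 2 + 1
    return n

conv = torch.nn.Conv2d(1, 1, kernel_size=3, stride=2, padding=1)
x = torch.zeros(1, 1, 3000, 128)  # (batch, channels, frames, mel bins), shapes illustrative
y = conv(conv(conv(x)))
assert y.shape[2] == n_output_tokens(3000) == 375

The companion clip_n_mmproj_embd hunk drops the hardcoded "* 4" deepstack multiplier, so the audio projector's embedding width now comes directly from mm_2_w.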