diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 5215de5503..7e60cda208 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -7667,8 +7667,7 @@ class DeepseekV2Model(TextModel):
                 raise ValueError(f"Unprocessed experts: {experts}")
 
 
-@ModelBase.register("VaetkiForCausalLM")
-@ModelBase.register("VaetkiVLForCausalLM")
+@ModelBase.register("VaetkiForCausalLM", "VaetkiVLForCausalLM")
 class VaetkiModel(TextModel):
     """VAETKI MoE model with MLA attention and 4-norm layer structure"""
     model_arch = gguf.MODEL_ARCH.VAETKI
@@ -7922,7 +7921,7 @@ class VaetkiVisionModel(MmprojModel):
         if "embed_dim" in self.hparams_vision:
             self.hparams_vision["hidden_size"] = self.hparams_vision.get("embed_dim")
         if "image_size" not in self.hparams_vision:
-            self.hparams_vision["image_size"] = self.preprocessor_config.get("size", {}).get("shortest_edge", 560)
+            self.hparams_vision["image_size"] = 560  # unused, set for compatibility
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
@@ -7934,6 +7933,11 @@ class VaetkiVisionModel(MmprojModel):
         self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-5))
         self.gguf_writer.add_vision_spatial_merge_size(hparams.get("spatial_merge_size", 2))
 
+        # support dynamic size
+        image_size = self.preprocessor_config["size"]
+        self.gguf_writer.add_vision_image_min_pixels(image_size["shortest_edge"])
+        self.gguf_writer.add_vision_image_max_pixels(image_size["longest_edge"])
+
     def tensor_force_quant(self, name, new_name, bid, n_dims):
         if "class_pos_embd" in new_name:
             return gguf.GGMLQuantizationType.F32
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 1501c9a36a..d8db79db19 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -284,6 +284,8 @@ class Keys:
     class ClipVision:
         PROJECTOR_TYPE = "clip.vision.projector_type"  # for mixed modality models
         IMAGE_SIZE = "clip.vision.image_size"
+        IMAGE_MIN_PIXELS = "clip.vision.image_min_pixels"
+        IMAGE_MAX_PIXELS = "clip.vision.image_max_pixels"
         PREPROC_IMAGE_SIZE = "clip.vision.preproc_image_size"
         PATCH_SIZE = "clip.vision.patch_size"
         EMBEDDING_LENGTH = "clip.vision.embedding_length"
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index 7fbb78866b..0fe91786aa 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -1113,6 +1113,12 @@ class GGUFWriter:
     def add_vision_image_size(self, value: int) -> None:
         self.add_uint32(Keys.ClipVision.IMAGE_SIZE, value)
 
+    def add_vision_image_max_pixels(self, value: int) -> None:
+        self.add_uint32(Keys.ClipVision.IMAGE_MAX_PIXELS, value)
+
+    def add_vision_image_min_pixels(self, value: int) -> None:
+        self.add_uint32(Keys.ClipVision.IMAGE_MIN_PIXELS, value)
+
     def add_vision_preproc_image_size(self, value: int) -> None:
         self.add_uint32(Keys.ClipVision.PREPROC_IMAGE_SIZE, value)
 
diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h
index b835d4db84..74dfa2f05a 100644
--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@@ -36,6 +36,8 @@
 // vision-specific
 #define KEY_VISION_PROJ_TYPE "clip.vision.projector_type" // for models with mixed modalities
 #define KEY_IMAGE_SIZE "clip.vision.image_size"
+#define KEY_IMAGE_MIN_PIXELS "clip.vision.image_min_pixels"
+#define KEY_IMAGE_MAX_PIXELS "clip.vision.image_max_pixels"
 #define KEY_PREPROC_IMAGE_SIZE "clip.vision.preproc_image_size"
 #define KEY_PATCH_SIZE "clip.vision.patch_size"
 #define KEY_IMAGE_MEAN "clip.vision.image_mean"
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index a9b1c2e02c..5d219d3ab1 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -1201,7 +1201,8 @@ struct clip_model_loader {
                     hparams.rope_theta = 10000.0f;
                     hparams.n_merge = 2;
                     get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
-                    hparams.set_limit_image_tokens(4, 3265);
+                    get_u32(KEY_IMAGE_MIN_PIXELS, hparams.image_min_pixels);
+                    get_u32(KEY_IMAGE_MAX_PIXELS, hparams.image_max_pixels);
                     hparams.set_warmup_n_tokens(40*40);
                 } break;
             case PROJECTOR_TYPE_LLAMA4: