Rename the CLIP vision pixel-limit GGUF keys and load them for PaddleOCR.

  * gguf-py: Keys.ClipVision.MAX_PIXELS / MIN_PIXELS are replaced by
    IMAGE_MAX_PIXELS / IMAGE_MIN_PIXELS, whose values are
    "clip.vision.image_max_pixels" / "clip.vision.image_min_pixels".
    GGUFWriter.add_vision_max_pixels() / add_vision_min_pixels() keep their
    names and signatures and now write the renamed keys.
  * tools/mtmd: clip-impl.h defines KEY_IMAGE_MIN_PIXELS / KEY_IMAGE_MAX_PIXELS
    with the same key strings, and clip.cpp passes them to get_u32() together
    with hparams.image_min_pixels / hparams.image_max_pixels in the
    PROJECTOR_TYPE_PADDLEOCR case.

Notes on this copy of the patch:

  * The gguf_writer.py hunk previously replaced the empty line after
    add_vision_min_pixels() with a whitespace-only line. That line is now left
    untouched as blank context, so the patch adds no trailing whitespace. The
    post-image blob id on that file's "index" line predates this correction.
  * The patch text had lost its line breaks and whitespace runs. Line breaks
    were restored from the hunk headers; leading indentation was reconstructed
    from syntax and intra-line alignment was not recoverable. Apply with
    "git apply --ignore-whitespace" and confirm the added lines match the
    formatting of their neighbours in the target files.

  * NOTE(review): in clip.cpp, hparams.set_limit_image_tokens(8, 4096) still
    runs after the two new get_u32() calls. If that helper assigns
    image_min_pixels / image_max_pixels, the values just read are overwritten.
    Confirm against its definition.
  * NOTE(review): files written with the previous key strings
    "clip.vision.max_pixels" / "clip.vision.min_pixels" do not contain the
    renamed keys. Confirm how get_u32() behaves when a key is missing.

diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index fd910cf996..26cc306184 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -284,8 +284,8 @@ class Keys:
     class ClipVision:
         PROJECTOR_TYPE = "clip.vision.projector_type" # for mixed modality models
         IMAGE_SIZE = "clip.vision.image_size"
-        MAX_PIXELS = "clip.vision.max_pixels"
-        MIN_PIXELS = "clip.vision.min_pixels"
+        IMAGE_MIN_PIXELS = "clip.vision.image_min_pixels"
+        IMAGE_MAX_PIXELS = "clip.vision.image_max_pixels"
         PREPROC_IMAGE_SIZE = "clip.vision.preproc_image_size"
         PATCH_SIZE = "clip.vision.patch_size"
         EMBEDDING_LENGTH = "clip.vision.embedding_length"
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index 76f0a9c3d2..39cb03191e 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -1099,11 +1099,11 @@ class GGUFWriter:
         self.add_uint32(Keys.ClipVision.EMBEDDING_LENGTH, value)
 
     def add_vision_max_pixels(self, value: int) -> None:
-        self.add_uint32(Keys.ClipVision.MAX_PIXELS, value)
+        self.add_uint32(Keys.ClipVision.IMAGE_MAX_PIXELS, value)
 
     def add_vision_min_pixels(self, value: int) -> None:
-        self.add_uint32(Keys.ClipVision.MIN_PIXELS, value)
+        self.add_uint32(Keys.ClipVision.IMAGE_MIN_PIXELS, value)
 
     def add_vision_feed_forward_length(self, value: int) -> None:
         self.add_uint32(Keys.ClipVision.FEED_FORWARD_LENGTH, value)
 
diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h
index a977fe5e9e..5258ae841b 100644
--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@@ -36,6 +36,8 @@
 // vision-specific
 #define KEY_VISION_PROJ_TYPE "clip.vision.projector_type" // for models with mixed modalities
 #define KEY_IMAGE_SIZE "clip.vision.image_size"
+#define KEY_IMAGE_MIN_PIXELS "clip.vision.image_min_pixels"
+#define KEY_IMAGE_MAX_PIXELS "clip.vision.image_max_pixels"
 #define KEY_PREPROC_IMAGE_SIZE "clip.vision.preproc_image_size"
 #define KEY_PATCH_SIZE "clip.vision.patch_size"
 #define KEY_IMAGE_MEAN "clip.vision.image_mean"
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 930b2061f1..a8b09a7827 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -1230,6 +1230,9 @@ struct clip_model_loader {
                 case PROJECTOR_TYPE_PADDLEOCR:
                     {
                         hparams.n_merge = 2;
+                        get_u32(KEY_IMAGE_MIN_PIXELS, hparams.image_min_pixels);
+                        get_u32(KEY_IMAGE_MAX_PIXELS, hparams.image_max_pixels);
+
                         // TODO(megemini): paddleocr vl not specified?
                         hparams.set_limit_image_tokens(8, 4096);
                         hparams.set_warmup_n_tokens(28*28); // avoid OOM on warmup