diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index dba190b480..0cd47645d3 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4273,6 +4273,16 @@ class Qwen25OmniModel(Qwen2VLVisionModel): @ModelBase.register("InternVisionModel") class InternVisionModel(MmprojModel): + + min_dynamic_tiles: int = 0 + max_dynamic_tiles: int = 0 + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.hparams_vision is not None + self.min_dynamic_tiles = self.global_config.get("min_dynamic_patch", 0) + self.max_dynamic_tiles = self.global_config.get("max_dynamic_patch", 0) + def set_gguf_parameters(self): assert self.hparams_vision is not None if isinstance(self.hparams_vision['image_size'], list): @@ -4295,6 +4305,11 @@ class InternVisionModel(MmprojModel): downsample_ratio = self.global_config.get("downsample_ratio") assert downsample_ratio is not None self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / downsample_ratio)) + # older models may not have min/max_dynamic_patch in config + if self.min_dynamic_tiles > 0: + self.gguf_writer.add_vision_preproc_min_tiles(self.min_dynamic_tiles) + if self.max_dynamic_tiles > 0: + self.gguf_writer.add_vision_preproc_max_tiles(self.max_dynamic_tiles) def tensor_force_quant(self, name, new_name, bid, n_dims): if ".position_embd." in new_name: diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index c5f92c7700..9383644abf 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -301,6 +301,8 @@ class Keys: IMAGE_SIZE = "clip.vision.image_size" IMAGE_MIN_PIXELS = "clip.vision.image_min_pixels" IMAGE_MAX_PIXELS = "clip.vision.image_max_pixels" + PREPROC_MIN_TILES = "clip.vision.preproc_min_tiles" + PREPROC_MAX_TILES = "clip.vision.preproc_max_tiles" PREPROC_IMAGE_SIZE = "clip.vision.preproc_image_size" PATCH_SIZE = "clip.vision.patch_size" EMBEDDING_LENGTH = "clip.vision.embedding_length" diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 5f653d386d..010dfeea1c 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -1156,6 +1156,12 @@ class GGUFWriter: def add_vision_min_pixels(self, value: int) -> None: self.add_uint32(Keys.ClipVision.IMAGE_MIN_PIXELS, value) + def add_vision_preproc_max_tiles(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.PREPROC_MAX_TILES, value) + + def add_vision_preproc_min_tiles(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.PREPROC_MIN_TILES, value) + def add_vision_preproc_image_size(self, value: int) -> None: self.add_uint32(Keys.ClipVision.PREPROC_IMAGE_SIZE, value) diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index 3eb66f9145..bf55cec7ef 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -38,6 +38,8 @@ #define KEY_IMAGE_SIZE "clip.vision.image_size" #define KEY_IMAGE_MIN_PIXELS "clip.vision.image_min_pixels" #define KEY_IMAGE_MAX_PIXELS "clip.vision.image_max_pixels" +#define KEY_PREPROC_MIN_TILES "clip.vision.preproc_min_tiles" +#define KEY_PREPROC_MAX_TILES "clip.vision.preproc_max_tiles" #define KEY_PREPROC_IMAGE_SIZE "clip.vision.preproc_image_size" #define KEY_PATCH_SIZE "clip.vision.patch_size" #define KEY_IMAGE_MEAN "clip.vision.image_mean" diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h index eeb8da58e0..265a17130f 100644 --- a/tools/mtmd/clip-model.h +++ b/tools/mtmd/clip-model.h @@ -42,6 +42,9 @@ struct clip_hparams { int32_t image_max_pixels = -1; int32_t n_merge = 0; // number of patch merges **per-side** + int32_t preproc_min_tiles = 0; + int32_t preproc_max_tiles = 0; + float image_mean[3]; float image_std[3]; diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 5fcc7c5b59..a47f1f495d 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -1138,6 +1138,16 @@ struct clip_model_loader { } } break; case PROJECTOR_TYPE_INTERNVL: + { + // older version of internvl doesn't have min/max tiles, we need to provide default values for them to avoid issues + hparams.preproc_min_tiles = 1; + hparams.preproc_max_tiles = 12; + get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); + get_u32(KEY_PREPROC_MIN_TILES, hparams.preproc_min_tiles, false); + get_u32(KEY_PREPROC_MAX_TILES, hparams.preproc_max_tiles, false); + GGML_ASSERT(hparams.preproc_min_tiles <= hparams.preproc_max_tiles && hparams.preproc_max_tiles < INT32_MAX); + set_internvl_dhr_res_candidates(model); + } break; case PROJECTOR_TYPE_NEMOTRON_V2_VL: { get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); @@ -2188,6 +2198,27 @@ struct clip_model_loader { } } } + + static void set_internvl_dhr_res_candidates(clip_model & model) { + auto & hparams = model.hparams; + int min_num = hparams.preproc_min_tiles; + int max_num = hparams.preproc_max_tiles; + if (min_num < 1) { + return; // avoid divide by 0 + } + for (int a = min_num; a <= max_num; ++a) { + int b_lo = (min_num + a - 1) / a; + int b_hi = max_num / a; + b_lo = std::max(b_lo, min_num); + b_hi = std::min(b_hi, max_num); + for (int b = b_lo; b <= b_hi; ++b) { + hparams.image_res_candidates.push_back(clip_image_size { + a*hparams.image_size, + b*hparams.image_size, + }); + } + } + } }; struct clip_init_result clip_init(const char * fname, struct clip_context_params ctx_params) { @@ -2734,17 +2765,22 @@ struct llava_uhd { return res; } - static std::vector slice_image(const clip_image_u8 * img, const slice_instructions & inst) { + static std::vector slice_image(const clip_image_u8 * img, const slice_instructions & inst, bool overview_first = true) { std::vector output; // resize to overview size clip_image_u8_ptr resized_img(clip_image_u8_init()); img_tool::resize(*img, *resized_img, inst.overview_size, inst.interpolation_overview, inst.padding_overview, inst.pad_color_overview); - output.push_back(std::move(resized_img)); + if (overview_first) { + output.push_back(std::move(resized_img)); + } if (inst.slices.empty()) { // no slices, just return the resized image + if (!overview_first) { + output.push_back(std::move(resized_img)); + } return output; } @@ -2765,6 +2801,10 @@ struct llava_uhd { output.push_back(std::move(img_slice)); } + if (!overview_first) { + output.push_back(std::move(resized_img)); + } + return output; } @@ -3149,10 +3189,20 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str res_imgs->grid_x = instructions.grid_size.width; res_imgs->grid_y = instructions.grid_size.height; } break; + case PROJECTOR_TYPE_INTERNVL: // support dynamic high-resolution + { + GGML_ASSERT(!params.image_res_candidates.empty()); + auto const inst = llava_uhd::get_slice_instructions(ctx, original_size); + std::vector imgs = llava_uhd::slice_image(img, inst, false); + for (size_t i = 0; i < imgs.size(); ++i) { + clip_image_f32_ptr res(clip_image_f32_init()); + normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(res)); + } + } break; case PROJECTOR_TYPE_GLM_EDGE: case PROJECTOR_TYPE_GEMMA3: - case PROJECTOR_TYPE_INTERNVL: // TODO @ngxson : support dynamic resolution case PROJECTOR_TYPE_NEMOTRON_V2_VL: { clip_image_u8 resized_image; diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index f66c07345e..456ce7b73c 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -851,13 +851,15 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) LOG_ERR("%s: this API does not support non-vision input, please use mtmd_encode_chunk instead\n", __func__); return 1; } + auto proj_type = clip_get_projector_type(ctx_clip); int n_mmproj_embd = clip_n_mmproj_embd(ctx_clip); ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd); bool ok = false; if (clip_is_llava(ctx_clip) || clip_is_minicpmv(ctx_clip) - || clip_is_glm(ctx_clip)) { + || clip_is_glm(ctx_clip) + || proj_type == PROJECTOR_TYPE_INTERNVL) { // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode() const auto & entries = image_tokens->batch_f32.entries; for (size_t i = 0; i < entries.size(); i++) {