#pragma once #include "ggml.h" #include "clip-model.h" #include #include #define MTMD_INTERNAL_HEADER // base class, models must inherit from this class struct mtmd_image_preprocessor { const clip_hparams & hparams; mtmd_image_preprocessor(const clip_ctx * ctx): hparams(*clip_get_hparams(ctx)) {} virtual ~mtmd_image_preprocessor() = default; virtual bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) = 0; void img_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]); void img_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst); }; /** * implementation of LLaVA-UHD: * - https://arxiv.org/pdf/2403.11703 * - https://github.com/thunlp/LLaVA-UHD * - https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118 * * overview: * - an image always have a single overview (downscaled image) * - an image can have 0 or multiple slices, depending on the image size * - each slice can then be considered as a separate image * * note: the term "slice" and "tile" are used interchangeably * * for example: * * [overview] --> [slice 1] --> [slice 2] * | | * +--> [slice 3] --> [slice 4] */ struct mtmd_image_preprocessor_llava_uhd : mtmd_image_preprocessor { mtmd_image_preprocessor_llava_uhd(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {} bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override; struct slice_coordinates { int x; int y; clip_image_size size; }; struct slice_instructions { clip_image_size overview_size; // size of downscaled image clip_image_size refined_size; // size of image right before slicing (must be multiple of slice size) clip_image_size grid_size; // grid_size.width * grid_size.height = number of slices std::vector slices; }; // LFM2 override this function to implement its custom slicing logic virtual slice_instructions get_slice_instructions(const clip_image_size & original_size); std::vector slice_image(const clip_image_u8 & img, const slice_instructions & inst, bool overview_first = true); private: clip_image_size get_best_resize(const clip_image_size & original_size, int scale_resolution, int patch_size, bool allow_upscale = false); clip_image_size resize_maintain_aspect_ratio(const clip_image_size & orig, const clip_image_size & target_max); /** * Selects the best resolution from a list of possible resolutions based on the original size. * * For example, when given a list of resolutions: * - 100x100 * - 200x100 * - 100x200 * - 200x200 * * And an input image of size 111x200, then 100x200 is the best fit (least wasted resolution). * * @param original_size The original size of the image * @param possible_resolutions A list of possible resolutions * @return The best fit resolution */ clip_image_size select_best_resolution(const clip_image_size & original_size, const std::vector & possible_resolutions); int ensure_divide(int length, int patch_size); clip_image_size get_refine_size(const clip_image_size & original_size, const clip_image_size & grid, int scale_resolution, int patch_size, bool allow_upscale = false); clip_image_size get_best_grid(const int max_slice_nums, const int multiple, const float log_ratio); }; // downscale or upscale the input image to fixed size struct mtmd_image_preprocessor_fixed_size : mtmd_image_preprocessor { mtmd_image_preprocessor_fixed_size(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {} bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override; }; // resize image to multiple of patch_size*n_merge, while preserving aspect ratio // if image_resize_pad is true, the resized image will be padded, otherwise it will be either stretched or center-cropped depending on image_resize_pad // this is used by models with native support for dynamic image size, for example: Qwen-VL, Pixtral, Kimi-VL, etc struct mtmd_image_preprocessor_dyn_size : mtmd_image_preprocessor { mtmd_image_preprocessor_dyn_size(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {} bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override; }; // similar to mtmd_image_preprocessor_dyn_size, but resize the image to have longest edge equal to hparams.image_longest_edge, while preserving aspect ratio struct mtmd_image_preprocessor_longest_edge : mtmd_image_preprocessor { mtmd_image_preprocessor_longest_edge(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {} bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override; }; // custom llava-uhd slicing logic for LFM2 // ref: https://github.com/huggingface/transformers/blob/v5.1.0/src/transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py struct mtmd_image_preprocessor_lfm2 : mtmd_image_preprocessor_llava_uhd { // ref: https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B/blob/main/processor_config.json static constexpr int min_tiles = 2; static constexpr int max_tiles = 10; static constexpr float max_pixels_tolerance = 2.0f; static constexpr int tile_size = 512; using mtmd_image_preprocessor_llava_uhd::mtmd_image_preprocessor_llava_uhd; slice_instructions get_slice_instructions(const clip_image_size & original_size) override; private: clip_image_size find_closest_aspect_ratio( float aspect_ratio, const std::vector & target_ratios, int width, int height); std::vector get_target_ratios(); clip_image_size get_grid_layout(int height, int width); }; struct mtmd_image_preprocessor_idefics3 : mtmd_image_preprocessor_llava_uhd { mtmd_image_preprocessor_idefics3(const clip_ctx * ctx) : mtmd_image_preprocessor_llava_uhd(ctx) {} bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override; }; struct mtmd_image_preprocessor_internvl : mtmd_image_preprocessor_llava_uhd { mtmd_image_preprocessor_internvl(const clip_ctx * ctx) : mtmd_image_preprocessor_llava_uhd(ctx) {} bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override; }; struct mtmd_image_preprocessor_deepseekocr : mtmd_image_preprocessor { mtmd_image_preprocessor_deepseekocr(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {} bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override; }; struct mtmd_image_preprocessor_youtuvl : mtmd_image_preprocessor { mtmd_image_preprocessor_youtuvl(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {} bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override; };