151 lines
6.9 KiB
C++
151 lines
6.9 KiB
C++
#pragma once
|
|
|
|
#include "ggml.h"
|
|
#include "clip-model.h"
|
|
|
|
#include <vector>
|
|
#include <string>
|
|
|
|
#define MTMD_INTERNAL_HEADER
|
|
|
|
// base class, models must inherit from this class
|
|
struct mtmd_image_preprocessor {
|
|
const clip_hparams & hparams;
|
|
|
|
mtmd_image_preprocessor(const clip_ctx * ctx): hparams(*clip_get_hparams(ctx)) {}
|
|
|
|
virtual ~mtmd_image_preprocessor() = default;
|
|
virtual bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) = 0;
|
|
|
|
void img_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]);
|
|
void img_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst);
|
|
};
|
|
|
|
/**
|
|
* implementation of LLaVA-UHD:
|
|
* - https://arxiv.org/pdf/2403.11703
|
|
* - https://github.com/thunlp/LLaVA-UHD
|
|
* - https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118
|
|
*
|
|
* overview:
|
|
* - an image always have a single overview (downscaled image)
|
|
* - an image can have 0 or multiple slices, depending on the image size
|
|
* - each slice can then be considered as a separate image
|
|
*
|
|
* note: the term "slice" and "tile" are used interchangeably
|
|
*
|
|
* for example:
|
|
*
|
|
* [overview] --> [slice 1] --> [slice 2]
|
|
* | |
|
|
* +--> [slice 3] --> [slice 4]
|
|
*/
|
|
struct mtmd_image_preprocessor_llava_uhd : mtmd_image_preprocessor {
|
|
mtmd_image_preprocessor_llava_uhd(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
|
|
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
|
|
|
struct slice_coordinates {
|
|
int x;
|
|
int y;
|
|
clip_image_size size;
|
|
};
|
|
|
|
struct slice_instructions {
|
|
clip_image_size overview_size; // size of downscaled image
|
|
clip_image_size refined_size; // size of image right before slicing (must be multiple of slice size)
|
|
clip_image_size grid_size; // grid_size.width * grid_size.height = number of slices
|
|
std::vector<slice_coordinates> slices;
|
|
};
|
|
|
|
// LFM2 override this function to implement its custom slicing logic
|
|
virtual slice_instructions get_slice_instructions(const clip_image_size & original_size);
|
|
|
|
std::vector<clip_image_u8_ptr> slice_image(const clip_image_u8 & img, const slice_instructions & inst, bool overview_first = true);
|
|
|
|
private:
|
|
clip_image_size get_best_resize(const clip_image_size & original_size, int scale_resolution, int patch_size, bool allow_upscale = false);
|
|
|
|
clip_image_size resize_maintain_aspect_ratio(const clip_image_size & orig, const clip_image_size & target_max);
|
|
|
|
/**
|
|
* Selects the best resolution from a list of possible resolutions based on the original size.
|
|
*
|
|
* For example, when given a list of resolutions:
|
|
* - 100x100
|
|
* - 200x100
|
|
* - 100x200
|
|
* - 200x200
|
|
*
|
|
* And an input image of size 111x200, then 100x200 is the best fit (least wasted resolution).
|
|
*
|
|
* @param original_size The original size of the image
|
|
* @param possible_resolutions A list of possible resolutions
|
|
* @return The best fit resolution
|
|
*/
|
|
clip_image_size select_best_resolution(const clip_image_size & original_size, const std::vector<clip_image_size> & possible_resolutions);
|
|
int ensure_divide(int length, int patch_size);
|
|
clip_image_size get_refine_size(const clip_image_size & original_size, const clip_image_size & grid, int scale_resolution, int patch_size, bool allow_upscale = false);
|
|
clip_image_size get_best_grid(const int max_slice_nums, const int multiple, const float log_ratio);
|
|
};
|
|
|
|
// downscale or upscale the input image to fixed size
|
|
struct mtmd_image_preprocessor_fixed_size : mtmd_image_preprocessor {
|
|
mtmd_image_preprocessor_fixed_size(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
|
|
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
|
};
|
|
|
|
// resize image to multiple of patch_size*n_merge, while preserving aspect ratio
|
|
// if image_resize_pad is true, the resized image will be padded, otherwise it will be either stretched or center-cropped depending on image_resize_pad
|
|
// this is used by models with native support for dynamic image size, for example: Qwen-VL, Pixtral, Kimi-VL, etc
|
|
struct mtmd_image_preprocessor_dyn_size : mtmd_image_preprocessor {
|
|
mtmd_image_preprocessor_dyn_size(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
|
|
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
|
};
|
|
|
|
// similar to mtmd_image_preprocessor_dyn_size, but resize the image to have longest edge equal to hparams.image_longest_edge, while preserving aspect ratio
|
|
struct mtmd_image_preprocessor_longest_edge : mtmd_image_preprocessor {
|
|
mtmd_image_preprocessor_longest_edge(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
|
|
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
|
};
|
|
|
|
// custom llava-uhd slicing logic for LFM2
|
|
// ref: https://github.com/huggingface/transformers/blob/v5.1.0/src/transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py
|
|
struct mtmd_image_preprocessor_lfm2 : mtmd_image_preprocessor_llava_uhd {
|
|
// ref: https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B/blob/main/processor_config.json
|
|
static constexpr int min_tiles = 2;
|
|
static constexpr int max_tiles = 10;
|
|
static constexpr float max_pixels_tolerance = 2.0f;
|
|
static constexpr int tile_size = 512;
|
|
|
|
using mtmd_image_preprocessor_llava_uhd::mtmd_image_preprocessor_llava_uhd;
|
|
slice_instructions get_slice_instructions(const clip_image_size & original_size) override;
|
|
|
|
private:
|
|
clip_image_size find_closest_aspect_ratio(
|
|
float aspect_ratio,
|
|
const std::vector<clip_image_size> & target_ratios,
|
|
int width, int height);
|
|
std::vector<clip_image_size> get_target_ratios();
|
|
clip_image_size get_grid_layout(int height, int width);
|
|
};
|
|
|
|
struct mtmd_image_preprocessor_idefics3 : mtmd_image_preprocessor_llava_uhd {
|
|
mtmd_image_preprocessor_idefics3(const clip_ctx * ctx) : mtmd_image_preprocessor_llava_uhd(ctx) {}
|
|
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
|
};
|
|
|
|
struct mtmd_image_preprocessor_internvl : mtmd_image_preprocessor_llava_uhd {
|
|
mtmd_image_preprocessor_internvl(const clip_ctx * ctx) : mtmd_image_preprocessor_llava_uhd(ctx) {}
|
|
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
|
};
|
|
|
|
struct mtmd_image_preprocessor_deepseekocr : mtmd_image_preprocessor {
|
|
mtmd_image_preprocessor_deepseekocr(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
|
|
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
|
};
|
|
|
|
struct mtmd_image_preprocessor_youtuvl : mtmd_image_preprocessor {
|
|
mtmd_image_preprocessor_youtuvl(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
|
|
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
|
};
|