llama.cpp/tools/mtmd/mtmd-image.h

#pragma once

#include "ggml.h"
#include "clip-model.h"

#include <vector>
#include <string>

#define MTMD_INTERNAL_HEADER

// base class, models must inherit from this class
struct mtmd_image_preprocessor {
    const clip_hparams & hparams;

    mtmd_image_preprocessor(const clip_ctx * ctx): hparams(*clip_get_hparams(ctx)) {}

    virtual ~mtmd_image_preprocessor() = default;
    virtual bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) = 0;

    void img_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]);
    void img_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst);
};

/**
 * implementation of LLaVA-UHD:
 *  - https://arxiv.org/pdf/2403.11703
 *  - https://github.com/thunlp/LLaVA-UHD
 *  - https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118
 *
 * overview:
 *   - an image always have a single overview (downscaled image)
 *   - an image can have 0 or multiple slices, depending on the image size
 *   - each slice can then be considered as a separate image
 *
 * note: the term "slice" and "tile" are used interchangeably
 *
 * for example:
 *
 * [overview] --> [slice 1] --> [slice 2]
 *           |                |
 *           +--> [slice 3] --> [slice 4]
 */
struct mtmd_image_preprocessor_llava_uhd : mtmd_image_preprocessor {
    mtmd_image_preprocessor_llava_uhd(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;

    struct slice_coordinates {
        int x;
        int y;
        clip_image_size size;
    };

    struct slice_instructions {
        clip_image_size overview_size; // size of downscaled image
        clip_image_size refined_size;  // size of image right before slicing (must be multiple of slice size)
        clip_image_size grid_size;     // grid_size.width * grid_size.height = number of slices
        std::vector<slice_coordinates> slices;
    };

    // LFM2 override this function to implement its custom slicing logic
    virtual slice_instructions get_slice_instructions(const clip_image_size & original_size);

    std::vector<clip_image_u8_ptr> slice_image(const clip_image_u8 & img, const slice_instructions & inst, bool overview_first = true);

private:
    clip_image_size get_best_resize(const clip_image_size & original_size, int scale_resolution, int patch_size, bool allow_upscale = false);

    clip_image_size resize_maintain_aspect_ratio(const clip_image_size & orig, const clip_image_size & target_max);

    /**
     * Selects the best resolution from a list of possible resolutions based on the original size.
     *
     * For example, when given a list of resolutions:
     *  - 100x100
     *  - 200x100
     *  - 100x200
     *  - 200x200
     *
     * And an input image of size 111x200, then 100x200 is the best fit (least wasted resolution).
     *
     * @param original_size The original size of the image
     * @param possible_resolutions A list of possible resolutions
     * @return The best fit resolution
     */
    clip_image_size select_best_resolution(const clip_image_size & original_size, const std::vector<clip_image_size> & possible_resolutions);
    int ensure_divide(int length, int patch_size);
    clip_image_size get_refine_size(const clip_image_size & original_size, const clip_image_size & grid, int scale_resolution, int patch_size, bool allow_upscale = false);
    clip_image_size get_best_grid(const int max_slice_nums, const int multiple, const float log_ratio);
};

// downscale or upscale the input image to fixed size
struct mtmd_image_preprocessor_fixed_size : mtmd_image_preprocessor {
    mtmd_image_preprocessor_fixed_size(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
};

// resize image to multiple of patch_size*n_merge, while preserving aspect ratio
// if image_resize_pad is true, the resized image will be padded, otherwise it will be either stretched or center-cropped depending on image_resize_pad
// this is used by models with native support for dynamic image size, for example: Qwen-VL, Pixtral, Kimi-VL, etc
struct mtmd_image_preprocessor_dyn_size : mtmd_image_preprocessor {
    mtmd_image_preprocessor_dyn_size(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
};

// similar to mtmd_image_preprocessor_dyn_size, but resize the image to have longest edge equal to hparams.image_longest_edge, while preserving aspect ratio
struct mtmd_image_preprocessor_longest_edge : mtmd_image_preprocessor {
    mtmd_image_preprocessor_longest_edge(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
};

// custom llava-uhd slicing logic for LFM2
// ref: https://github.com/huggingface/transformers/blob/v5.1.0/src/transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py
struct mtmd_image_preprocessor_lfm2 : mtmd_image_preprocessor_llava_uhd {
    // ref: https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B/blob/main/processor_config.json
    static constexpr int   min_tiles            = 2;
    static constexpr int   max_tiles            = 10;
    static constexpr float max_pixels_tolerance = 2.0f;
    static constexpr int   tile_size            = 512;

    using mtmd_image_preprocessor_llava_uhd::mtmd_image_preprocessor_llava_uhd;
    slice_instructions get_slice_instructions(const clip_image_size & original_size) override;

private:
    clip_image_size find_closest_aspect_ratio(
            float aspect_ratio,
            const std::vector<clip_image_size> & target_ratios,
            int width, int height);
    std::vector<clip_image_size> get_target_ratios();
    clip_image_size get_grid_layout(int height, int width);
};

struct mtmd_image_preprocessor_idefics3 : mtmd_image_preprocessor_llava_uhd {
    mtmd_image_preprocessor_idefics3(const clip_ctx * ctx) : mtmd_image_preprocessor_llava_uhd(ctx) {}
    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
};

struct mtmd_image_preprocessor_internvl : mtmd_image_preprocessor_llava_uhd {
    mtmd_image_preprocessor_internvl(const clip_ctx * ctx) : mtmd_image_preprocessor_llava_uhd(ctx) {}
    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
};

struct mtmd_image_preprocessor_deepseekocr : mtmd_image_preprocessor {
    mtmd_image_preprocessor_deepseekocr(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
};

struct mtmd_image_preprocessor_youtuvl : mtmd_image_preprocessor {
    mtmd_image_preprocessor_youtuvl(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
};