mtmd: refactor image preprocessing (#21031)
* mtmd: refactor image pre-processing * correct some places * correct lfm2 * fix deepseek-ocr on server * add comment to clarify about mtmd_image_preprocessor_dyn_size
This commit is contained in:
parent
ded446b34c
commit
a73bbd5d92
|
|
@ -5,6 +5,7 @@ find_package(Threads REQUIRED)
|
||||||
add_library(mtmd
|
add_library(mtmd
|
||||||
mtmd.cpp
|
mtmd.cpp
|
||||||
mtmd-audio.cpp
|
mtmd-audio.cpp
|
||||||
|
mtmd-image.cpp
|
||||||
mtmd.h
|
mtmd.h
|
||||||
mtmd-helper.cpp
|
mtmd-helper.cpp
|
||||||
mtmd-helper.h
|
mtmd-helper.h
|
||||||
|
|
|
||||||
|
|
@ -51,7 +51,6 @@
|
||||||
|
|
||||||
#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
|
#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
|
||||||
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
|
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
|
||||||
#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
|
|
||||||
#define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern"
|
#define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern"
|
||||||
#define KEY_WIN_ATTN_LAYER_INDEXES "clip.vision.wa_layer_indexes"
|
#define KEY_WIN_ATTN_LAYER_INDEXES "clip.vision.wa_layer_indexes"
|
||||||
#define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size"
|
#define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size"
|
||||||
|
|
|
||||||
|
|
@ -28,6 +28,13 @@ enum patch_merge_type {
|
||||||
PATCH_MERGE_SPATIAL_UNPAD,
|
PATCH_MERGE_SPATIAL_UNPAD,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
enum resize_algo {
|
||||||
|
RESIZE_ALGO_BILINEAR, // stretch to target resolution
|
||||||
|
RESIZE_ALGO_BICUBIC, // center-crop when aspect ratio doesn't match
|
||||||
|
RESIZE_ALGO_BICUBIC_PILLOW,
|
||||||
|
// RESIZE_ALGO_LANCZOS, // TODO
|
||||||
|
};
|
||||||
|
|
||||||
struct clip_hparams {
|
struct clip_hparams {
|
||||||
int32_t image_size = 0;
|
int32_t image_size = 0;
|
||||||
int32_t patch_size = 0;
|
int32_t patch_size = 0;
|
||||||
|
|
@ -37,13 +44,26 @@ struct clip_hparams {
|
||||||
int32_t n_head = 0;
|
int32_t n_head = 0;
|
||||||
int32_t n_layer = 0;
|
int32_t n_layer = 0;
|
||||||
// idefics3
|
// idefics3
|
||||||
|
int32_t n_merge = 0; // number of patch merges **per-side**
|
||||||
|
|
||||||
|
// for preprocessor
|
||||||
int32_t image_longest_edge = 0;
|
int32_t image_longest_edge = 0;
|
||||||
int32_t image_min_pixels = -1;
|
int32_t image_min_pixels = -1;
|
||||||
int32_t image_max_pixels = -1;
|
int32_t image_max_pixels = -1;
|
||||||
int32_t n_merge = 0; // number of patch merges **per-side**
|
resize_algo image_resize_algo = RESIZE_ALGO_BICUBIC;
|
||||||
|
bool image_resize_pad = true; // if false, center-crop will be applied when resizing
|
||||||
|
std::array<uint8_t, 3> image_pad_color = {0, 0, 0};
|
||||||
|
|
||||||
|
// (preprocessor) for llava-uhd style models
|
||||||
|
std::vector<clip_image_size> image_res_candidates;
|
||||||
int32_t preproc_min_tiles = 0;
|
int32_t preproc_min_tiles = 0;
|
||||||
int32_t preproc_max_tiles = 0;
|
int32_t preproc_max_tiles = 0;
|
||||||
|
resize_algo image_resize_algo_rf = RESIZE_ALGO_BICUBIC;
|
||||||
|
resize_algo image_resize_algo_ov = RESIZE_ALGO_BILINEAR;
|
||||||
|
bool image_pad_rf = true; // if true, refined image will be padded (e.g. llava-1.6)
|
||||||
|
bool image_pad_ov = false; // if true, overview image will be padded (e.g. llava-1.6)
|
||||||
|
std::array<uint8_t, 3> image_pad_color_rf = {0, 0, 0}; // padding color for refined image
|
||||||
|
std::array<uint8_t, 3> image_pad_color_ov = {0, 0, 0}; // padding color for overview image
|
||||||
|
|
||||||
float image_mean[3];
|
float image_mean[3];
|
||||||
float image_std[3];
|
float image_std[3];
|
||||||
|
|
@ -60,8 +80,6 @@ struct clip_hparams {
|
||||||
float eps = 1e-6;
|
float eps = 1e-6;
|
||||||
float rope_theta = 0.0;
|
float rope_theta = 0.0;
|
||||||
|
|
||||||
std::vector<clip_image_size> image_res_candidates; // for llava-uhd style models
|
|
||||||
int32_t image_crop_resolution;
|
|
||||||
std::unordered_set<int32_t> vision_feature_layer;
|
std::unordered_set<int32_t> vision_feature_layer;
|
||||||
int32_t attn_window_size = 0;
|
int32_t attn_window_size = 0;
|
||||||
int32_t n_wa_pattern = 0;
|
int32_t n_wa_pattern = 0;
|
||||||
|
|
|
||||||
1450
tools/mtmd/clip.cpp
1450
tools/mtmd/clip.cpp
File diff suppressed because it is too large
Load Diff
|
|
@ -97,9 +97,6 @@ struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch
|
||||||
*/
|
*/
|
||||||
void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img);
|
void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img);
|
||||||
|
|
||||||
/** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
|
|
||||||
bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );
|
|
||||||
|
|
||||||
struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
|
struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
|
||||||
|
|
||||||
bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
|
bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
|
||||||
|
|
|
||||||
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,150 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "ggml.h"
|
||||||
|
#include "clip-model.h"
|
||||||
|
|
||||||
|
#include <vector>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
#define MTMD_INTERNAL_HEADER
|
||||||
|
|
||||||
|
// base class, models must inherit from this class
|
||||||
|
struct mtmd_image_preprocessor {
|
||||||
|
const clip_hparams & hparams;
|
||||||
|
|
||||||
|
mtmd_image_preprocessor(const clip_ctx * ctx): hparams(*clip_get_hparams(ctx)) {}
|
||||||
|
|
||||||
|
virtual ~mtmd_image_preprocessor() = default;
|
||||||
|
virtual bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) = 0;
|
||||||
|
|
||||||
|
void img_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]);
|
||||||
|
void img_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst);
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* implementation of LLaVA-UHD:
|
||||||
|
* - https://arxiv.org/pdf/2403.11703
|
||||||
|
* - https://github.com/thunlp/LLaVA-UHD
|
||||||
|
* - https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118
|
||||||
|
*
|
||||||
|
* overview:
|
||||||
|
* - an image always has a single overview (downscaled image)
|
||||||
|
* - an image can have 0 or multiple slices, depending on the image size
|
||||||
|
* - each slice can then be considered as a separate image
|
||||||
|
*
|
||||||
|
* note: the terms "slice" and "tile" are used interchangeably
|
||||||
|
*
|
||||||
|
* for example:
|
||||||
|
*
|
||||||
|
* [overview] --> [slice 1] --> [slice 2]
|
||||||
|
* | |
|
||||||
|
* +--> [slice 3] --> [slice 4]
|
||||||
|
*/
|
||||||
|
struct mtmd_image_preprocessor_llava_uhd : mtmd_image_preprocessor {
|
||||||
|
mtmd_image_preprocessor_llava_uhd(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
|
||||||
|
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
||||||
|
|
||||||
|
struct slice_coordinates {
|
||||||
|
int x;
|
||||||
|
int y;
|
||||||
|
clip_image_size size;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct slice_instructions {
|
||||||
|
clip_image_size overview_size; // size of downscaled image
|
||||||
|
clip_image_size refined_size; // size of image right before slicing (must be multiple of slice size)
|
||||||
|
clip_image_size grid_size; // grid_size.width * grid_size.height = number of slices
|
||||||
|
std::vector<slice_coordinates> slices;
|
||||||
|
};
|
||||||
|
|
||||||
|
// LFM2 overrides this function to implement its custom slicing logic
|
||||||
|
virtual slice_instructions get_slice_instructions(const clip_image_size & original_size);
|
||||||
|
|
||||||
|
std::vector<clip_image_u8_ptr> slice_image(const clip_image_u8 & img, const slice_instructions & inst, bool overview_first = true);
|
||||||
|
|
||||||
|
private:
|
||||||
|
clip_image_size get_best_resize(const clip_image_size & original_size, int scale_resolution, int patch_size, bool allow_upscale = false);
|
||||||
|
|
||||||
|
clip_image_size resize_maintain_aspect_ratio(const clip_image_size & orig, const clip_image_size & target_max);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Selects the best resolution from a list of possible resolutions based on the original size.
|
||||||
|
*
|
||||||
|
* For example, when given a list of resolutions:
|
||||||
|
* - 100x100
|
||||||
|
* - 200x100
|
||||||
|
* - 100x200
|
||||||
|
* - 200x200
|
||||||
|
*
|
||||||
|
* And an input image of size 111x200, then 100x200 is the best fit (least wasted resolution).
|
||||||
|
*
|
||||||
|
* @param original_size The original size of the image
|
||||||
|
* @param possible_resolutions A list of possible resolutions
|
||||||
|
* @return The best fit resolution
|
||||||
|
*/
|
||||||
|
clip_image_size select_best_resolution(const clip_image_size & original_size, const std::vector<clip_image_size> & possible_resolutions);
|
||||||
|
int ensure_divide(int length, int patch_size);
|
||||||
|
clip_image_size get_refine_size(const clip_image_size & original_size, const clip_image_size & grid, int scale_resolution, int patch_size, bool allow_upscale = false);
|
||||||
|
clip_image_size get_best_grid(const int max_slice_nums, const int multiple, const float log_ratio);
|
||||||
|
};
|
||||||
|
|
||||||
|
// downscale or upscale the input image to fixed size
|
||||||
|
struct mtmd_image_preprocessor_fixed_size : mtmd_image_preprocessor {
|
||||||
|
mtmd_image_preprocessor_fixed_size(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
|
||||||
|
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
||||||
|
};
|
||||||
|
|
||||||
|
// resize image to multiple of patch_size*n_merge, while preserving aspect ratio
|
||||||
|
// if image_resize_pad is true, the resized image will be padded; otherwise it will be either stretched or center-cropped depending on image_resize_algo
|
||||||
|
// this is used by models with native support for dynamic image size, for example: Qwen-VL, Pixtral, Kimi-VL, etc
|
||||||
|
struct mtmd_image_preprocessor_dyn_size : mtmd_image_preprocessor {
|
||||||
|
mtmd_image_preprocessor_dyn_size(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
|
||||||
|
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
||||||
|
};
|
||||||
|
|
||||||
|
// similar to mtmd_image_preprocessor_dyn_size, but resize the image to have longest edge equal to hparams.image_longest_edge, while preserving aspect ratio
|
||||||
|
struct mtmd_image_preprocessor_longest_edge : mtmd_image_preprocessor {
|
||||||
|
mtmd_image_preprocessor_longest_edge(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
|
||||||
|
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
||||||
|
};
|
||||||
|
|
||||||
|
// custom llava-uhd slicing logic for LFM2
|
||||||
|
// ref: https://github.com/huggingface/transformers/blob/v5.1.0/src/transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py
|
||||||
|
struct mtmd_image_preprocessor_lfm2 : mtmd_image_preprocessor_llava_uhd {
|
||||||
|
// ref: https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B/blob/main/processor_config.json
|
||||||
|
static constexpr int min_tiles = 2;
|
||||||
|
static constexpr int max_tiles = 10;
|
||||||
|
static constexpr float max_pixels_tolerance = 2.0f;
|
||||||
|
static constexpr int tile_size = 512;
|
||||||
|
|
||||||
|
using mtmd_image_preprocessor_llava_uhd::mtmd_image_preprocessor_llava_uhd;
|
||||||
|
slice_instructions get_slice_instructions(const clip_image_size & original_size) override;
|
||||||
|
|
||||||
|
private:
|
||||||
|
clip_image_size find_closest_aspect_ratio(
|
||||||
|
float aspect_ratio,
|
||||||
|
const std::vector<clip_image_size> & target_ratios,
|
||||||
|
int width, int height);
|
||||||
|
std::vector<clip_image_size> get_target_ratios();
|
||||||
|
clip_image_size get_grid_layout(int height, int width);
|
||||||
|
};
|
||||||
|
|
||||||
|
struct mtmd_image_preprocessor_idefics3 : mtmd_image_preprocessor_llava_uhd {
|
||||||
|
mtmd_image_preprocessor_idefics3(const clip_ctx * ctx) : mtmd_image_preprocessor_llava_uhd(ctx) {}
|
||||||
|
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct mtmd_image_preprocessor_internvl : mtmd_image_preprocessor_llava_uhd {
|
||||||
|
mtmd_image_preprocessor_internvl(const clip_ctx * ctx) : mtmd_image_preprocessor_llava_uhd(ctx) {}
|
||||||
|
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct mtmd_image_preprocessor_deepseekocr : mtmd_image_preprocessor {
|
||||||
|
mtmd_image_preprocessor_deepseekocr(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
|
||||||
|
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct mtmd_image_preprocessor_youtuvl : mtmd_image_preprocessor {
|
||||||
|
mtmd_image_preprocessor_youtuvl(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
|
||||||
|
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
||||||
|
};
|
||||||
|
|
@ -2,6 +2,7 @@
|
||||||
#include "clip-impl.h"
|
#include "clip-impl.h"
|
||||||
#include "mtmd.h"
|
#include "mtmd.h"
|
||||||
#include "mtmd-audio.h"
|
#include "mtmd-audio.h"
|
||||||
|
#include "mtmd-image.h"
|
||||||
#include "debug/mtmd-debug.h"
|
#include "debug/mtmd-debug.h"
|
||||||
|
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
@ -138,7 +139,7 @@ struct mtmd_context {
|
||||||
|
|
||||||
// for llava-uhd style models, we need special tokens in-between slices
|
// for llava-uhd style models, we need special tokens in-between slices
|
||||||
// minicpmv calls them "slices", llama 4 calls them "tiles"
|
// minicpmv calls them "slices", llama 4 calls them "tiles"
|
||||||
mtmd_slice_tmpl slice_tmpl = MTMD_SLICE_TMPL_NONE;
|
mtmd_slice_tmpl slice_tmpl = MTMD_SLICE_TMPL_NONE;
|
||||||
std::vector<llama_token> tok_ov_img_start; // overview image
|
std::vector<llama_token> tok_ov_img_start; // overview image
|
||||||
std::vector<llama_token> tok_ov_img_end; // overview image
|
std::vector<llama_token> tok_ov_img_end; // overview image
|
||||||
std::vector<llama_token> tok_slices_start; // start of all slices
|
std::vector<llama_token> tok_slices_start; // start of all slices
|
||||||
|
|
@ -147,13 +148,14 @@ struct mtmd_context {
|
||||||
std::vector<llama_token> tok_sli_img_end; // single slice end
|
std::vector<llama_token> tok_sli_img_end; // single slice end
|
||||||
std::vector<llama_token> tok_sli_img_mid; // between 2 slices
|
std::vector<llama_token> tok_sli_img_mid; // between 2 slices
|
||||||
std::vector<llama_token> tok_row_end; // end of row
|
std::vector<llama_token> tok_row_end; // end of row
|
||||||
bool tok_row_end_trail = false;
|
bool tok_row_end_trail = false;
|
||||||
bool ov_img_first = false;
|
bool ov_img_first = false;
|
||||||
|
|
||||||
// string template for slice image delimiters with row/col (idefics3)
|
// string template for slice image delimiters with row/col (idefics3)
|
||||||
std::string sli_img_start_tmpl;
|
std::string sli_img_start_tmpl;
|
||||||
|
|
||||||
std::unique_ptr<mtmd_audio_preprocessor> audio_preproc;
|
std::unique_ptr<mtmd_audio_preprocessor> audio_preproc;
|
||||||
|
std::unique_ptr<mtmd_image_preprocessor> image_preproc;
|
||||||
|
|
||||||
// TODO @ngxson : add timings
|
// TODO @ngxson : add timings
|
||||||
|
|
||||||
|
|
@ -221,123 +223,193 @@ struct mtmd_context {
|
||||||
|
|
||||||
void init_vision() {
|
void init_vision() {
|
||||||
GGML_ASSERT(ctx_v != nullptr);
|
GGML_ASSERT(ctx_v != nullptr);
|
||||||
|
image_preproc.reset();
|
||||||
|
|
||||||
projector_type proj = clip_get_projector_type(ctx_v);
|
projector_type proj = clip_get_projector_type(ctx_v);
|
||||||
int minicpmv_version = clip_is_minicpmv(ctx_v);
|
|
||||||
if (minicpmv_version == 2) {
|
|
||||||
// minicpmv 2.5 format:
|
|
||||||
// <image> (overview) </image><slice><image> (slice) </image><image> (slice) </image>\n ... </slice>
|
|
||||||
slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_5;
|
|
||||||
tok_ov_img_start = {lookup_token("<image>")};
|
|
||||||
tok_ov_img_end = {lookup_token("</image>")};
|
|
||||||
tok_slices_start = {lookup_token("<slice>")};
|
|
||||||
tok_slices_end = {lookup_token("</slice>")};
|
|
||||||
tok_sli_img_start = tok_ov_img_start;
|
|
||||||
tok_sli_img_end = tok_ov_img_end;
|
|
||||||
tok_row_end = {lookup_token("\n")};
|
|
||||||
tok_row_end_trail = false; // no trailing end-of-row token
|
|
||||||
ov_img_first = true;
|
|
||||||
|
|
||||||
} else if (minicpmv_version == 3 || minicpmv_version == 4 || minicpmv_version == 5 || minicpmv_version == 6 || minicpmv_version == 100045) {
|
switch (proj) {
|
||||||
// minicpmv 2.6 format:
|
case PROJECTOR_TYPE_MLP:
|
||||||
// <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
|
case PROJECTOR_TYPE_MLP_NORM:
|
||||||
slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_6;
|
case PROJECTOR_TYPE_LDP:
|
||||||
tok_ov_img_start = {lookup_token("<image>")};
|
case PROJECTOR_TYPE_LDPV2:
|
||||||
tok_ov_img_end = {lookup_token("</image>")};
|
case PROJECTOR_TYPE_COGVLM:
|
||||||
tok_sli_img_start = {lookup_token("<slice>")};
|
case PROJECTOR_TYPE_JANUS_PRO:
|
||||||
tok_sli_img_end = {lookup_token("</slice>")};
|
case PROJECTOR_TYPE_GLM_EDGE:
|
||||||
tok_row_end = {lookup_token("\n")};
|
{
|
||||||
tok_row_end_trail = false; // no trailing end-of-row token
|
bool has_pinpoints = !clip_get_hparams(ctx_v)->image_res_candidates.empty();
|
||||||
ov_img_first = true;
|
if (has_pinpoints) {
|
||||||
|
image_preproc = std::make_unique<mtmd_image_preprocessor_llava_uhd>(ctx_v);
|
||||||
|
} else {
|
||||||
|
image_preproc = std::make_unique<mtmd_image_preprocessor_fixed_size>(ctx_v);
|
||||||
|
}
|
||||||
|
} break;
|
||||||
|
case PROJECTOR_TYPE_MINICPMV:
|
||||||
|
{
|
||||||
|
int minicpmv_version = clip_is_minicpmv(ctx_v);
|
||||||
|
if (minicpmv_version == 2) {
|
||||||
|
// minicpmv 2.5 format:
|
||||||
|
// <image> (overview) </image><slice><image> (slice) </image><image> (slice) </image>\n ... </slice>
|
||||||
|
slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_5;
|
||||||
|
tok_ov_img_start = {lookup_token("<image>")};
|
||||||
|
tok_ov_img_end = {lookup_token("</image>")};
|
||||||
|
tok_slices_start = {lookup_token("<slice>")};
|
||||||
|
tok_slices_end = {lookup_token("</slice>")};
|
||||||
|
tok_sli_img_start = tok_ov_img_start;
|
||||||
|
tok_sli_img_end = tok_ov_img_end;
|
||||||
|
tok_row_end = {lookup_token("\n")};
|
||||||
|
tok_row_end_trail = false; // no trailing end-of-row token
|
||||||
|
ov_img_first = true;
|
||||||
|
|
||||||
} else if (minicpmv_version != 0) {
|
} else if (minicpmv_version == 3 || minicpmv_version == 4 || minicpmv_version == 5 || minicpmv_version == 6 || minicpmv_version == 100045) {
|
||||||
GGML_ASSERT(false && "unsupported minicpmv version");
|
// minicpmv 2.6 format:
|
||||||
} else if (proj == PROJECTOR_TYPE_LLAMA4) {
|
// <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
|
||||||
// llama 4 format:
|
slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_6;
|
||||||
// <|image_start|>
|
tok_ov_img_start = {lookup_token("<image>")};
|
||||||
// (slice) <|tile_x_separator|> (slice) <|tile_x_separator|> ... <|tile_y_separator|>
|
tok_ov_img_end = {lookup_token("</image>")};
|
||||||
// (slice) <|tile_x_separator|> (slice) <|tile_x_separator|> ... <|tile_y_separator|>
|
tok_sli_img_start = {lookup_token("<slice>")};
|
||||||
// ... <|tile_y_separator|> <-- trailing end-of-row token
|
tok_sli_img_end = {lookup_token("</slice>")};
|
||||||
// <|image|> (overview) <-- overview image is last
|
tok_row_end = {lookup_token("\n")};
|
||||||
// <|image_end|>
|
tok_row_end_trail = false; // no trailing end-of-row token
|
||||||
slice_tmpl = MTMD_SLICE_TMPL_LLAMA4;
|
ov_img_first = true;
|
||||||
tok_ov_img_start = {lookup_token("<|image|>")};
|
|
||||||
tok_sli_img_mid = {lookup_token("<|tile_x_separator|>")};
|
} else if (minicpmv_version != 0) {
|
||||||
tok_row_end = {lookup_token("<|tile_y_separator|>")};
|
throw std::runtime_error(string_format("unsupported minicpmv version: %d\n", minicpmv_version));
|
||||||
tok_row_end_trail = true; // add trailing end-of-row token
|
}
|
||||||
ov_img_first = false; // overview image is last
|
image_preproc = std::make_unique<mtmd_image_preprocessor_llava_uhd>(ctx_v);
|
||||||
|
} break;
|
||||||
|
case PROJECTOR_TYPE_QWEN2VL:
|
||||||
|
case PROJECTOR_TYPE_QWEN25VL:
|
||||||
|
case PROJECTOR_TYPE_QWEN3VL:
|
||||||
|
{
|
||||||
|
// <|vision_start|> ... (image embeddings) ... <|vision_end|>
|
||||||
|
img_beg = "<|vision_start|>";
|
||||||
|
img_end = "<|vision_end|>";
|
||||||
|
image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
|
||||||
|
} break;
|
||||||
|
case PROJECTOR_TYPE_YOUTUVL:
|
||||||
|
{
|
||||||
|
// <|vision_start|> ... (image embeddings) ... <|vision_end|>
|
||||||
|
img_beg = "<|vision_start|>";
|
||||||
|
img_end = "<|vision_end|>";
|
||||||
|
image_preproc = std::make_unique<mtmd_image_preprocessor_youtuvl>(ctx_v);
|
||||||
|
} break;
|
||||||
|
case PROJECTOR_TYPE_GEMMA3:
|
||||||
|
case PROJECTOR_TYPE_GEMMA3NV:
|
||||||
|
{
|
||||||
|
// <start_of_image> ... (image embeddings) ... <end_of_image>
|
||||||
|
img_beg = "<start_of_image>";
|
||||||
|
img_end = "<end_of_image>";
|
||||||
|
image_preproc = std::make_unique<mtmd_image_preprocessor_fixed_size>(ctx_v);
|
||||||
|
} break;
|
||||||
|
case PROJECTOR_TYPE_IDEFICS3:
|
||||||
|
{
|
||||||
|
// https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
|
||||||
|
slice_tmpl = MTMD_SLICE_TMPL_IDEFICS3;
|
||||||
|
tok_ov_img_start = {lookup_token("\n\n"), lookup_token("<fake_token_around_image>"), lookup_token("<global-img>")};
|
||||||
|
tok_ov_img_end = {lookup_token("<fake_token_around_image>")};
|
||||||
|
tok_row_end = {lookup_token("\n")};
|
||||||
|
sli_img_start_tmpl = "<fake_token_around_image><row_%d_col_%d>";
|
||||||
|
image_preproc = std::make_unique<mtmd_image_preprocessor_idefics3>(ctx_v);
|
||||||
|
} break;
|
||||||
|
case PROJECTOR_TYPE_PIXTRAL:
|
||||||
|
{
|
||||||
|
// https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
|
||||||
|
img_end = "[IMG_END]";
|
||||||
|
image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
|
||||||
|
} break;
|
||||||
|
case PROJECTOR_TYPE_PHI4:
|
||||||
|
{
|
||||||
|
// Phi-4 uses media marker insertion only. Keep image boundary text empty.
|
||||||
|
image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
|
||||||
|
} break;
|
||||||
|
case PROJECTOR_TYPE_LLAMA4:
|
||||||
|
{
|
||||||
|
// (more details in mtmd_context constructor)
|
||||||
|
img_beg = "<|image_start|>";
|
||||||
|
img_end = "<|image_end|>";
|
||||||
|
LOG_WRN("%s: llama 4 vision is known to have degraded quality:\n"
|
||||||
|
" https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
|
||||||
|
image_preproc = std::make_unique<mtmd_image_preprocessor_llava_uhd>(ctx_v);
|
||||||
|
} break;
|
||||||
|
case PROJECTOR_TYPE_INTERNVL:
|
||||||
|
{
|
||||||
|
// <img> ... (image embeddings) ... </img>
|
||||||
|
img_beg = "<img>";
|
||||||
|
img_end = "</img>";
|
||||||
|
image_preproc = std::make_unique<mtmd_image_preprocessor_internvl>(ctx_v);
|
||||||
|
} break;
|
||||||
|
case PROJECTOR_TYPE_KIMIVL:
|
||||||
|
{
|
||||||
|
// <|media_start|> ... (image embeddings) ... <|media_end|>
|
||||||
|
img_beg = "<|media_start|>";
|
||||||
|
img_end = "<|media_end|>";
|
||||||
|
image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
|
||||||
|
} break;
|
||||||
|
case PROJECTOR_TYPE_KIMIK25:
|
||||||
|
{
|
||||||
|
// <|media_begin|> ... (image embeddings) ... <|media_end|>
|
||||||
|
img_beg = "<|media_begin|>";
|
||||||
|
img_end = "<|media_end|>";
|
||||||
|
image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
|
||||||
|
} break;
|
||||||
|
case PROJECTOR_TYPE_LIGHTONOCR:
|
||||||
|
{
|
||||||
|
// <|im_start|> ... (image embeddings) ... <|im_end|>
|
||||||
|
img_beg = "<|im_start|>";
|
||||||
|
img_end = "<|im_end|>";
|
||||||
|
image_preproc = std::make_unique<mtmd_image_preprocessor_longest_edge>(ctx_v);
|
||||||
|
} break;
|
||||||
|
case PROJECTOR_TYPE_NEMOTRON_V2_VL:
|
||||||
|
{
|
||||||
|
image_preproc = std::make_unique<mtmd_image_preprocessor_fixed_size>(ctx_v);
|
||||||
|
} break;
|
||||||
|
case PROJECTOR_TYPE_LFM2:
|
||||||
|
{
|
||||||
|
// multi-tile:
|
||||||
|
// <|image_start|>
|
||||||
|
// <|img_row_1_col_1|> (tile) <|img_row_1_col_2|> (tile) ...
|
||||||
|
// <|img_thumbnail|> (thumbnail)
|
||||||
|
// <|image_end|>
|
||||||
|
// single-tile:
|
||||||
|
// <|image_start|> (image) <|image_end|>
|
||||||
|
img_beg = "<|image_start|>";
|
||||||
|
img_end = "<|image_end|>";
|
||||||
|
slice_tmpl = MTMD_SLICE_TMPL_LFM2;
|
||||||
|
sli_img_start_tmpl = "<|img_row_%d_col_%d|>";
|
||||||
|
tok_ov_img_start = {lookup_token("<|img_thumbnail|>")};
|
||||||
|
ov_img_first = false;
|
||||||
|
image_preproc = std::make_unique<mtmd_image_preprocessor_lfm2>(ctx_v);
|
||||||
|
} break;
|
||||||
|
case PROJECTOR_TYPE_GLM4V:
|
||||||
|
{
|
||||||
|
// <|begin_of_image|> ... (image embeddings) ... <|end_of_image|>
|
||||||
|
img_beg = "<|begin_of_image|>";
|
||||||
|
img_end = "<|end_of_image|>";
|
||||||
|
image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
|
||||||
|
} break;
|
||||||
|
case PROJECTOR_TYPE_PADDLEOCR:
|
||||||
|
{
|
||||||
|
// <|IMAGE_START|> ... (image embeddings) ... <|IMAGE_END|>
|
||||||
|
img_beg = "<|IMAGE_START|>";
|
||||||
|
img_end = "<|IMAGE_END|>";
|
||||||
|
image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
|
||||||
|
} break;
|
||||||
|
case PROJECTOR_TYPE_DEEPSEEKOCR:
|
||||||
|
{
|
||||||
|
img_end = "\n"; // prevent empty batch on llama-server
|
||||||
|
image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr>(ctx_v);
|
||||||
|
} break;
|
||||||
|
default:
|
||||||
|
throw std::runtime_error(string_format("%s: unexpected vision projector type %d\n", __func__, proj));
|
||||||
}
|
}
|
||||||
|
|
||||||
// set boi/eoi
|
GGML_ASSERT(image_preproc != nullptr);
|
||||||
if (proj == PROJECTOR_TYPE_GEMMA3 || proj == PROJECTOR_TYPE_GEMMA3NV) {
|
|
||||||
// <start_of_image> ... (image embeddings) ... <end_of_image>
|
|
||||||
img_beg = "<start_of_image>";
|
|
||||||
img_end = "<end_of_image>";
|
|
||||||
|
|
||||||
} else if (proj == PROJECTOR_TYPE_IDEFICS3) {
|
|
||||||
// https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
|
|
||||||
slice_tmpl = MTMD_SLICE_TMPL_IDEFICS3;
|
|
||||||
tok_ov_img_start = {lookup_token("\n\n"), lookup_token("<fake_token_around_image>"), lookup_token("<global-img>")};
|
|
||||||
tok_ov_img_end = {lookup_token("<fake_token_around_image>")};
|
|
||||||
tok_row_end = {lookup_token("\n")};
|
|
||||||
sli_img_start_tmpl = "<fake_token_around_image><row_%d_col_%d>";
|
|
||||||
|
|
||||||
} else if (proj == PROJECTOR_TYPE_PIXTRAL) {
|
|
||||||
// https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
|
|
||||||
img_end = "[IMG_END]";
|
|
||||||
|
|
||||||
} else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL || proj == PROJECTOR_TYPE_QWEN3VL || proj == PROJECTOR_TYPE_YOUTUVL) {
|
|
||||||
// <|vision_start|> ... (image embeddings) ... <|vision_end|>
|
|
||||||
img_beg = "<|vision_start|>";
|
|
||||||
img_end = "<|vision_end|>";
|
|
||||||
|
|
||||||
} else if (proj == PROJECTOR_TYPE_PHI4) {
|
|
||||||
// Phi-4 uses media marker insertion only. Keep image boundary text empty.
|
|
||||||
|
|
||||||
} else if (proj == PROJECTOR_TYPE_LLAMA4) {
|
|
||||||
// (more details in mtmd_context constructor)
|
|
||||||
img_beg = "<|image_start|>";
|
|
||||||
img_end = "<|image_end|>";
|
|
||||||
LOG_WRN("%s: llama 4 vision is known to have degraded quality:\n"
|
|
||||||
" https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
|
|
||||||
|
|
||||||
} else if (proj == PROJECTOR_TYPE_INTERNVL) {
|
|
||||||
// <img> ... (image embeddings) ... </img>
|
|
||||||
img_beg = "<img>";
|
|
||||||
img_end = "</img>";
|
|
||||||
|
|
||||||
} else if (proj == PROJECTOR_TYPE_LIGHTONOCR) {
|
|
||||||
// <|im_start|> ... (image embeddings) ... <|im_end|>
|
|
||||||
img_beg = "<|im_start|>";
|
|
||||||
img_end = "<|im_end|>";
|
|
||||||
|
|
||||||
} else if (proj == PROJECTOR_TYPE_LFM2) {
|
|
||||||
// multi-tile:
|
|
||||||
// <|image_start|>
|
|
||||||
// <|img_row_1_col_1|> (tile) <|img_row_1_col_2|> (tile) ...
|
|
||||||
// <|img_thumbnail|> (thumbnail)
|
|
||||||
// <|image_end|>
|
|
||||||
// single-tile:
|
|
||||||
// <|image_start|> (image) <|image_end|>
|
|
||||||
img_beg = "<|image_start|>";
|
|
||||||
img_end = "<|image_end|>";
|
|
||||||
slice_tmpl = MTMD_SLICE_TMPL_LFM2;
|
|
||||||
sli_img_start_tmpl = "<|img_row_%d_col_%d|>";
|
|
||||||
tok_ov_img_start = {lookup_token("<|img_thumbnail|>")};
|
|
||||||
ov_img_first = false;
|
|
||||||
} else if (proj == PROJECTOR_TYPE_GLM4V) {
|
|
||||||
img_beg = "<|begin_of_image|>";
|
|
||||||
img_end = "<|end_of_image|>";
|
|
||||||
|
|
||||||
} else if (proj == PROJECTOR_TYPE_PADDLEOCR) {
|
|
||||||
// <|IMAGE_START|> ... (image embeddings) ... <|IMAGE_END|>
|
|
||||||
img_beg = "<|IMAGE_START|>";
|
|
||||||
img_end = "<|IMAGE_END|>";
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void init_audio() {
|
void init_audio() {
|
||||||
GGML_ASSERT(ctx_a != nullptr);
|
GGML_ASSERT(ctx_a != nullptr);
|
||||||
|
audio_preproc.reset();
|
||||||
|
|
||||||
projector_type proj = clip_get_projector_type(ctx_a);
|
projector_type proj = clip_get_projector_type(ctx_a);
|
||||||
|
|
||||||
LOG_WRN("%s: audio input is in experimental stage and may have reduced quality:\n"
|
LOG_WRN("%s: audio input is in experimental stage and may have reduced quality:\n"
|
||||||
|
|
@ -347,36 +419,40 @@ struct mtmd_context {
|
||||||
switch (proj) {
|
switch (proj) {
|
||||||
case PROJECTOR_TYPE_QWEN2A:
|
case PROJECTOR_TYPE_QWEN2A:
|
||||||
case PROJECTOR_TYPE_QWEN25O:
|
case PROJECTOR_TYPE_QWEN25O:
|
||||||
case PROJECTOR_TYPE_ULTRAVOX:
|
{
|
||||||
|
// <|audio_bos|> ... (embeddings) ... <|audio_eos|>
|
||||||
|
aud_beg = "<|audio_bos|>";
|
||||||
|
aud_end = "<|audio_eos|>";
|
||||||
|
audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
|
||||||
|
} break;
|
||||||
case PROJECTOR_TYPE_VOXTRAL:
|
case PROJECTOR_TYPE_VOXTRAL:
|
||||||
case PROJECTOR_TYPE_GLMA:
|
{
|
||||||
|
// [BEGIN_AUDIO] ... (embeddings) ...
|
||||||
|
aud_beg = "[BEGIN_AUDIO]";
|
||||||
|
audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
|
||||||
|
} break;
|
||||||
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
|
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
|
||||||
audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
|
{
|
||||||
break;
|
// <sound> ... (embeddings) ...
|
||||||
|
aud_beg = "<sound>";
|
||||||
|
audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
|
||||||
|
} break;
|
||||||
|
case PROJECTOR_TYPE_ULTRAVOX:
|
||||||
|
case PROJECTOR_TYPE_GLMA:
|
||||||
|
{
|
||||||
|
audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
|
||||||
|
} break;
|
||||||
case PROJECTOR_TYPE_LFM2A:
|
case PROJECTOR_TYPE_LFM2A:
|
||||||
audio_preproc = std::make_unique<mtmd_audio_preprocessor_conformer>(ctx_a);
|
{
|
||||||
break;
|
audio_preproc = std::make_unique<mtmd_audio_preprocessor_conformer>(ctx_a);
|
||||||
|
} break;
|
||||||
default:
|
default:
|
||||||
GGML_ABORT("unsupported audio projector type");
|
throw std::runtime_error(string_format("%s: unexpected audio projector type %d\n", __func__, proj));
|
||||||
}
|
}
|
||||||
|
|
||||||
// initialize audio preprocessor
|
// initialize audio preprocessor
|
||||||
|
GGML_ASSERT(audio_preproc != nullptr);
|
||||||
audio_preproc->initialize();
|
audio_preproc->initialize();
|
||||||
|
|
||||||
// set special tokens
|
|
||||||
if (proj == PROJECTOR_TYPE_QWEN2A) {
|
|
||||||
// <|audio_bos|> ... (embeddings) ... <|audio_eos|>
|
|
||||||
aud_beg = "<|audio_bos|>";
|
|
||||||
aud_end = "<|audio_eos|>";
|
|
||||||
|
|
||||||
} else if (proj == PROJECTOR_TYPE_ULTRAVOX) {
|
|
||||||
// [BEGIN_AUDIO] ... (embeddings) ...
|
|
||||||
aud_beg = "[BEGIN_AUDIO]";
|
|
||||||
|
|
||||||
} else if (proj == PROJECTOR_TYPE_MUSIC_FLAMINGO) {
|
|
||||||
// <sound> ... (embeddings) ...
|
|
||||||
aud_beg = "<sound>";
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// get clip ctx based on chunk type
|
// get clip ctx based on chunk type
|
||||||
|
|
@ -573,8 +649,9 @@ struct mtmd_tokenizer {
|
||||||
std::memcpy(img_u8->buf.data(), bitmap->data.data(), img_u8->nx * img_u8->ny * 3);
|
std::memcpy(img_u8->buf.data(), bitmap->data.data(), img_u8->nx * img_u8->ny * 3);
|
||||||
|
|
||||||
// preprocess image
|
// preprocess image
|
||||||
|
GGML_ASSERT(ctx->image_preproc != nullptr);
|
||||||
clip_image_f32_batch batch_f32;
|
clip_image_f32_batch batch_f32;
|
||||||
bool ok = clip_image_preprocess(ctx->ctx_v, img_u8.get(), &batch_f32);
|
bool ok = ctx->image_preproc->preprocess(*img_u8, batch_f32);
|
||||||
if (!ok) {
|
if (!ok) {
|
||||||
LOG_ERR("Unable to preprocess image\n");
|
LOG_ERR("Unable to preprocess image\n");
|
||||||
return 2;
|
return 2;
|
||||||
|
|
@ -1225,7 +1302,8 @@ void mtmd_debug_preprocess_image(mtmd_context * ctx, const std::vector<uint8_t>
|
||||||
img_u8.ny = ny;
|
img_u8.ny = ny;
|
||||||
img_u8.buf = rgb_values;
|
img_u8.buf = rgb_values;
|
||||||
clip_image_f32_batch batch_f32;
|
clip_image_f32_batch batch_f32;
|
||||||
bool ok = clip_image_preprocess(ctx->ctx_v, &img_u8, &batch_f32);
|
GGML_ASSERT(ctx->image_preproc != nullptr);
|
||||||
|
bool ok = ctx->image_preproc->preprocess(img_u8, batch_f32);
|
||||||
if (!ok) {
|
if (!ok) {
|
||||||
LOG_ERR("%s: failed to preprocess image\n", __func__);
|
LOG_ERR("%s: failed to preprocess image\n", __func__);
|
||||||
return;
|
return;
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue