mtmd: refactor image preprocessing (#21031)
* mtmd: refactor image pre-processing * correct some places * correct lfm2 * fix deepseek-ocr on server * add comment to clarify about mtmd_image_preprocessor_dyn_size
This commit is contained in:
parent
ded446b34c
commit
a73bbd5d92
|
|
@ -5,6 +5,7 @@ find_package(Threads REQUIRED)
|
|||
add_library(mtmd
|
||||
mtmd.cpp
|
||||
mtmd-audio.cpp
|
||||
mtmd-image.cpp
|
||||
mtmd.h
|
||||
mtmd-helper.cpp
|
||||
mtmd-helper.h
|
||||
|
|
|
|||
|
|
@ -51,7 +51,6 @@
|
|||
|
||||
#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
|
||||
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
|
||||
#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
|
||||
#define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern"
|
||||
#define KEY_WIN_ATTN_LAYER_INDEXES "clip.vision.wa_layer_indexes"
|
||||
#define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size"
|
||||
|
|
|
|||
|
|
@ -28,6 +28,13 @@ enum patch_merge_type {
|
|||
PATCH_MERGE_SPATIAL_UNPAD,
|
||||
};
|
||||
|
||||
enum resize_algo {
|
||||
RESIZE_ALGO_BILINEAR, // stretch to target resolution
|
||||
RESIZE_ALGO_BICUBIC, // center-crop when aspect ratio doesn't match
|
||||
RESIZE_ALGO_BICUBIC_PILLOW,
|
||||
// RESIZE_ALGO_LANCZOS, // TODO
|
||||
};
|
||||
|
||||
struct clip_hparams {
|
||||
int32_t image_size = 0;
|
||||
int32_t patch_size = 0;
|
||||
|
|
@ -37,13 +44,26 @@ struct clip_hparams {
|
|||
int32_t n_head = 0;
|
||||
int32_t n_layer = 0;
|
||||
// idefics3
|
||||
int32_t n_merge = 0; // number of patch merges **per-side**
|
||||
|
||||
// for preprocessor
|
||||
int32_t image_longest_edge = 0;
|
||||
int32_t image_min_pixels = -1;
|
||||
int32_t image_max_pixels = -1;
|
||||
int32_t n_merge = 0; // number of patch merges **per-side**
|
||||
resize_algo image_resize_algo = RESIZE_ALGO_BICUBIC;
|
||||
bool image_resize_pad = true; // if false, center-crop will be applied when resizing
|
||||
std::array<uint8_t, 3> image_pad_color = {0, 0, 0};
|
||||
|
||||
// (preprocessor) for llava-uhd style models
|
||||
std::vector<clip_image_size> image_res_candidates;
|
||||
int32_t preproc_min_tiles = 0;
|
||||
int32_t preproc_max_tiles = 0;
|
||||
resize_algo image_resize_algo_rf = RESIZE_ALGO_BICUBIC;
|
||||
resize_algo image_resize_algo_ov = RESIZE_ALGO_BILINEAR;
|
||||
bool image_pad_rf = true; // if true, refined image will be padded (e.g. llava-1.6)
|
||||
bool image_pad_ov = false; // if true, overview image will be padded (e.g. llava-1.6)
|
||||
std::array<uint8_t, 3> image_pad_color_rf = {0, 0, 0}; // padding color for refined image
|
||||
std::array<uint8_t, 3> image_pad_color_ov = {0, 0, 0}; // padding color for overview image
|
||||
|
||||
float image_mean[3];
|
||||
float image_std[3];
|
||||
|
|
@ -60,8 +80,6 @@ struct clip_hparams {
|
|||
float eps = 1e-6;
|
||||
float rope_theta = 0.0;
|
||||
|
||||
std::vector<clip_image_size> image_res_candidates; // for llava-uhd style models
|
||||
int32_t image_crop_resolution;
|
||||
std::unordered_set<int32_t> vision_feature_layer;
|
||||
int32_t attn_window_size = 0;
|
||||
int32_t n_wa_pattern = 0;
|
||||
|
|
|
|||
1450
tools/mtmd/clip.cpp
1450
tools/mtmd/clip.cpp
File diff suppressed because it is too large
Load Diff
|
|
@ -97,9 +97,6 @@ struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch
|
|||
*/
|
||||
void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img);
|
||||
|
||||
/** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
|
||||
bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );
|
||||
|
||||
struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
|
||||
|
||||
bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,150 @@
|
|||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "clip-model.h"
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
#define MTMD_INTERNAL_HEADER
|
||||
|
||||
// base class, models must inherit from this class
|
||||
struct mtmd_image_preprocessor {
|
||||
const clip_hparams & hparams;
|
||||
|
||||
mtmd_image_preprocessor(const clip_ctx * ctx): hparams(*clip_get_hparams(ctx)) {}
|
||||
|
||||
virtual ~mtmd_image_preprocessor() = default;
|
||||
virtual bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) = 0;
|
||||
|
||||
void img_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]);
|
||||
void img_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst);
|
||||
};
|
||||
|
||||
/**
|
||||
* implementation of LLaVA-UHD:
|
||||
* - https://arxiv.org/pdf/2403.11703
|
||||
* - https://github.com/thunlp/LLaVA-UHD
|
||||
* - https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118
|
||||
*
|
||||
* overview:
|
||||
* - an image always have a single overview (downscaled image)
|
||||
* - an image can have 0 or multiple slices, depending on the image size
|
||||
* - each slice can then be considered as a separate image
|
||||
*
|
||||
* note: the term "slice" and "tile" are used interchangeably
|
||||
*
|
||||
* for example:
|
||||
*
|
||||
* [overview] --> [slice 1] --> [slice 2]
|
||||
* | |
|
||||
* +--> [slice 3] --> [slice 4]
|
||||
*/
|
||||
struct mtmd_image_preprocessor_llava_uhd : mtmd_image_preprocessor {
|
||||
mtmd_image_preprocessor_llava_uhd(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
|
||||
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
||||
|
||||
struct slice_coordinates {
|
||||
int x;
|
||||
int y;
|
||||
clip_image_size size;
|
||||
};
|
||||
|
||||
struct slice_instructions {
|
||||
clip_image_size overview_size; // size of downscaled image
|
||||
clip_image_size refined_size; // size of image right before slicing (must be multiple of slice size)
|
||||
clip_image_size grid_size; // grid_size.width * grid_size.height = number of slices
|
||||
std::vector<slice_coordinates> slices;
|
||||
};
|
||||
|
||||
// LFM2 override this function to implement its custom slicing logic
|
||||
virtual slice_instructions get_slice_instructions(const clip_image_size & original_size);
|
||||
|
||||
std::vector<clip_image_u8_ptr> slice_image(const clip_image_u8 & img, const slice_instructions & inst, bool overview_first = true);
|
||||
|
||||
private:
|
||||
clip_image_size get_best_resize(const clip_image_size & original_size, int scale_resolution, int patch_size, bool allow_upscale = false);
|
||||
|
||||
clip_image_size resize_maintain_aspect_ratio(const clip_image_size & orig, const clip_image_size & target_max);
|
||||
|
||||
/**
|
||||
* Selects the best resolution from a list of possible resolutions based on the original size.
|
||||
*
|
||||
* For example, when given a list of resolutions:
|
||||
* - 100x100
|
||||
* - 200x100
|
||||
* - 100x200
|
||||
* - 200x200
|
||||
*
|
||||
* And an input image of size 111x200, then 100x200 is the best fit (least wasted resolution).
|
||||
*
|
||||
* @param original_size The original size of the image
|
||||
* @param possible_resolutions A list of possible resolutions
|
||||
* @return The best fit resolution
|
||||
*/
|
||||
clip_image_size select_best_resolution(const clip_image_size & original_size, const std::vector<clip_image_size> & possible_resolutions);
|
||||
int ensure_divide(int length, int patch_size);
|
||||
clip_image_size get_refine_size(const clip_image_size & original_size, const clip_image_size & grid, int scale_resolution, int patch_size, bool allow_upscale = false);
|
||||
clip_image_size get_best_grid(const int max_slice_nums, const int multiple, const float log_ratio);
|
||||
};
|
||||
|
||||
// downscale or upscale the input image to fixed size
|
||||
struct mtmd_image_preprocessor_fixed_size : mtmd_image_preprocessor {
|
||||
mtmd_image_preprocessor_fixed_size(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
|
||||
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
||||
};
|
||||
|
||||
// resize image to multiple of patch_size*n_merge, while preserving aspect ratio
|
||||
// if image_resize_pad is true, the resized image will be padded, otherwise it will be either stretched or center-cropped depending on image_resize_pad
|
||||
// this is used by models with native support for dynamic image size, for example: Qwen-VL, Pixtral, Kimi-VL, etc
|
||||
struct mtmd_image_preprocessor_dyn_size : mtmd_image_preprocessor {
|
||||
mtmd_image_preprocessor_dyn_size(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
|
||||
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
||||
};
|
||||
|
||||
// similar to mtmd_image_preprocessor_dyn_size, but resize the image to have longest edge equal to hparams.image_longest_edge, while preserving aspect ratio
|
||||
struct mtmd_image_preprocessor_longest_edge : mtmd_image_preprocessor {
|
||||
mtmd_image_preprocessor_longest_edge(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
|
||||
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
||||
};
|
||||
|
||||
// custom llava-uhd slicing logic for LFM2
|
||||
// ref: https://github.com/huggingface/transformers/blob/v5.1.0/src/transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py
|
||||
struct mtmd_image_preprocessor_lfm2 : mtmd_image_preprocessor_llava_uhd {
|
||||
// ref: https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B/blob/main/processor_config.json
|
||||
static constexpr int min_tiles = 2;
|
||||
static constexpr int max_tiles = 10;
|
||||
static constexpr float max_pixels_tolerance = 2.0f;
|
||||
static constexpr int tile_size = 512;
|
||||
|
||||
using mtmd_image_preprocessor_llava_uhd::mtmd_image_preprocessor_llava_uhd;
|
||||
slice_instructions get_slice_instructions(const clip_image_size & original_size) override;
|
||||
|
||||
private:
|
||||
clip_image_size find_closest_aspect_ratio(
|
||||
float aspect_ratio,
|
||||
const std::vector<clip_image_size> & target_ratios,
|
||||
int width, int height);
|
||||
std::vector<clip_image_size> get_target_ratios();
|
||||
clip_image_size get_grid_layout(int height, int width);
|
||||
};
|
||||
|
||||
struct mtmd_image_preprocessor_idefics3 : mtmd_image_preprocessor_llava_uhd {
|
||||
mtmd_image_preprocessor_idefics3(const clip_ctx * ctx) : mtmd_image_preprocessor_llava_uhd(ctx) {}
|
||||
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
||||
};
|
||||
|
||||
struct mtmd_image_preprocessor_internvl : mtmd_image_preprocessor_llava_uhd {
|
||||
mtmd_image_preprocessor_internvl(const clip_ctx * ctx) : mtmd_image_preprocessor_llava_uhd(ctx) {}
|
||||
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
||||
};
|
||||
|
||||
struct mtmd_image_preprocessor_deepseekocr : mtmd_image_preprocessor {
|
||||
mtmd_image_preprocessor_deepseekocr(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
|
||||
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
||||
};
|
||||
|
||||
struct mtmd_image_preprocessor_youtuvl : mtmd_image_preprocessor {
|
||||
mtmd_image_preprocessor_youtuvl(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
|
||||
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
||||
};
|
||||
|
|
@ -2,6 +2,7 @@
|
|||
#include "clip-impl.h"
|
||||
#include "mtmd.h"
|
||||
#include "mtmd-audio.h"
|
||||
#include "mtmd-image.h"
|
||||
#include "debug/mtmd-debug.h"
|
||||
|
||||
#include "llama.h"
|
||||
|
|
@ -138,7 +139,7 @@ struct mtmd_context {
|
|||
|
||||
// for llava-uhd style models, we need special tokens in-between slices
|
||||
// minicpmv calls them "slices", llama 4 calls them "tiles"
|
||||
mtmd_slice_tmpl slice_tmpl = MTMD_SLICE_TMPL_NONE;
|
||||
mtmd_slice_tmpl slice_tmpl = MTMD_SLICE_TMPL_NONE;
|
||||
std::vector<llama_token> tok_ov_img_start; // overview image
|
||||
std::vector<llama_token> tok_ov_img_end; // overview image
|
||||
std::vector<llama_token> tok_slices_start; // start of all slices
|
||||
|
|
@ -147,13 +148,14 @@ struct mtmd_context {
|
|||
std::vector<llama_token> tok_sli_img_end; // single slice end
|
||||
std::vector<llama_token> tok_sli_img_mid; // between 2 slices
|
||||
std::vector<llama_token> tok_row_end; // end of row
|
||||
bool tok_row_end_trail = false;
|
||||
bool ov_img_first = false;
|
||||
bool tok_row_end_trail = false;
|
||||
bool ov_img_first = false;
|
||||
|
||||
// string template for slice image delimiters with row/col (idefics3)
|
||||
std::string sli_img_start_tmpl;
|
||||
|
||||
std::unique_ptr<mtmd_audio_preprocessor> audio_preproc;
|
||||
std::unique_ptr<mtmd_image_preprocessor> image_preproc;
|
||||
|
||||
// TODO @ngxson : add timings
|
||||
|
||||
|
|
@ -221,123 +223,193 @@ struct mtmd_context {
|
|||
|
||||
void init_vision() {
|
||||
GGML_ASSERT(ctx_v != nullptr);
|
||||
image_preproc.reset();
|
||||
|
||||
projector_type proj = clip_get_projector_type(ctx_v);
|
||||
int minicpmv_version = clip_is_minicpmv(ctx_v);
|
||||
if (minicpmv_version == 2) {
|
||||
// minicpmv 2.5 format:
|
||||
// <image> (overview) </image><slice><image> (slice) </image><image> (slice) </image>\n ... </slice>
|
||||
slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_5;
|
||||
tok_ov_img_start = {lookup_token("<image>")};
|
||||
tok_ov_img_end = {lookup_token("</image>")};
|
||||
tok_slices_start = {lookup_token("<slice>")};
|
||||
tok_slices_end = {lookup_token("</slice>")};
|
||||
tok_sli_img_start = tok_ov_img_start;
|
||||
tok_sli_img_end = tok_ov_img_end;
|
||||
tok_row_end = {lookup_token("\n")};
|
||||
tok_row_end_trail = false; // no trailing end-of-row token
|
||||
ov_img_first = true;
|
||||
|
||||
} else if (minicpmv_version == 3 || minicpmv_version == 4 || minicpmv_version == 5 || minicpmv_version == 6 || minicpmv_version == 100045) {
|
||||
// minicpmv 2.6 format:
|
||||
// <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
|
||||
slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_6;
|
||||
tok_ov_img_start = {lookup_token("<image>")};
|
||||
tok_ov_img_end = {lookup_token("</image>")};
|
||||
tok_sli_img_start = {lookup_token("<slice>")};
|
||||
tok_sli_img_end = {lookup_token("</slice>")};
|
||||
tok_row_end = {lookup_token("\n")};
|
||||
tok_row_end_trail = false; // no trailing end-of-row token
|
||||
ov_img_first = true;
|
||||
switch (proj) {
|
||||
case PROJECTOR_TYPE_MLP:
|
||||
case PROJECTOR_TYPE_MLP_NORM:
|
||||
case PROJECTOR_TYPE_LDP:
|
||||
case PROJECTOR_TYPE_LDPV2:
|
||||
case PROJECTOR_TYPE_COGVLM:
|
||||
case PROJECTOR_TYPE_JANUS_PRO:
|
||||
case PROJECTOR_TYPE_GLM_EDGE:
|
||||
{
|
||||
bool has_pinpoints = !clip_get_hparams(ctx_v)->image_res_candidates.empty();
|
||||
if (has_pinpoints) {
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_llava_uhd>(ctx_v);
|
||||
} else {
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_fixed_size>(ctx_v);
|
||||
}
|
||||
} break;
|
||||
case PROJECTOR_TYPE_MINICPMV:
|
||||
{
|
||||
int minicpmv_version = clip_is_minicpmv(ctx_v);
|
||||
if (minicpmv_version == 2) {
|
||||
// minicpmv 2.5 format:
|
||||
// <image> (overview) </image><slice><image> (slice) </image><image> (slice) </image>\n ... </slice>
|
||||
slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_5;
|
||||
tok_ov_img_start = {lookup_token("<image>")};
|
||||
tok_ov_img_end = {lookup_token("</image>")};
|
||||
tok_slices_start = {lookup_token("<slice>")};
|
||||
tok_slices_end = {lookup_token("</slice>")};
|
||||
tok_sli_img_start = tok_ov_img_start;
|
||||
tok_sli_img_end = tok_ov_img_end;
|
||||
tok_row_end = {lookup_token("\n")};
|
||||
tok_row_end_trail = false; // no trailing end-of-row token
|
||||
ov_img_first = true;
|
||||
|
||||
} else if (minicpmv_version != 0) {
|
||||
GGML_ASSERT(false && "unsupported minicpmv version");
|
||||
} else if (proj == PROJECTOR_TYPE_LLAMA4) {
|
||||
// llama 4 format:
|
||||
// <|image_start|>
|
||||
// (slice) <|tile_x_separator|> (slice) <|tile_x_separator|> ... <|tile_y_separator|>
|
||||
// (slice) <|tile_x_separator|> (slice) <|tile_x_separator|> ... <|tile_y_separator|>
|
||||
// ... <|tile_y_separator|> <-- trailing end-of-row token
|
||||
// <|image|> (overview) <-- overview image is last
|
||||
// <|image_end|>
|
||||
slice_tmpl = MTMD_SLICE_TMPL_LLAMA4;
|
||||
tok_ov_img_start = {lookup_token("<|image|>")};
|
||||
tok_sli_img_mid = {lookup_token("<|tile_x_separator|>")};
|
||||
tok_row_end = {lookup_token("<|tile_y_separator|>")};
|
||||
tok_row_end_trail = true; // add trailing end-of-row token
|
||||
ov_img_first = false; // overview image is last
|
||||
} else if (minicpmv_version == 3 || minicpmv_version == 4 || minicpmv_version == 5 || minicpmv_version == 6 || minicpmv_version == 100045) {
|
||||
// minicpmv 2.6 format:
|
||||
// <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
|
||||
slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_6;
|
||||
tok_ov_img_start = {lookup_token("<image>")};
|
||||
tok_ov_img_end = {lookup_token("</image>")};
|
||||
tok_sli_img_start = {lookup_token("<slice>")};
|
||||
tok_sli_img_end = {lookup_token("</slice>")};
|
||||
tok_row_end = {lookup_token("\n")};
|
||||
tok_row_end_trail = false; // no trailing end-of-row token
|
||||
ov_img_first = true;
|
||||
|
||||
} else if (minicpmv_version != 0) {
|
||||
throw std::runtime_error(string_format("unsupported minicpmv version: %d\n", minicpmv_version));
|
||||
}
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_llava_uhd>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_QWEN2VL:
|
||||
case PROJECTOR_TYPE_QWEN25VL:
|
||||
case PROJECTOR_TYPE_QWEN3VL:
|
||||
{
|
||||
// <|vision_start|> ... (image embeddings) ... <|vision_end|>
|
||||
img_beg = "<|vision_start|>";
|
||||
img_end = "<|vision_end|>";
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_YOUTUVL:
|
||||
{
|
||||
// <|vision_start|> ... (image embeddings) ... <|vision_end|>
|
||||
img_beg = "<|vision_start|>";
|
||||
img_end = "<|vision_end|>";
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_youtuvl>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GEMMA3:
|
||||
case PROJECTOR_TYPE_GEMMA3NV:
|
||||
{
|
||||
// <start_of_image> ... (image embeddings) ... <end_of_image>
|
||||
img_beg = "<start_of_image>";
|
||||
img_end = "<end_of_image>";
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_fixed_size>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_IDEFICS3:
|
||||
{
|
||||
// https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
|
||||
slice_tmpl = MTMD_SLICE_TMPL_IDEFICS3;
|
||||
tok_ov_img_start = {lookup_token("\n\n"), lookup_token("<fake_token_around_image>"), lookup_token("<global-img>")};
|
||||
tok_ov_img_end = {lookup_token("<fake_token_around_image>")};
|
||||
tok_row_end = {lookup_token("\n")};
|
||||
sli_img_start_tmpl = "<fake_token_around_image><row_%d_col_%d>";
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_idefics3>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_PIXTRAL:
|
||||
{
|
||||
// https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
|
||||
img_end = "[IMG_END]";
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_PHI4:
|
||||
{
|
||||
// Phi-4 uses media marker insertion only. Keep image boundary text empty.
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_LLAMA4:
|
||||
{
|
||||
// (more details in mtmd_context constructor)
|
||||
img_beg = "<|image_start|>";
|
||||
img_end = "<|image_end|>";
|
||||
LOG_WRN("%s: llama 4 vision is known to have degraded quality:\n"
|
||||
" https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_llava_uhd>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_INTERNVL:
|
||||
{
|
||||
// <img> ... (image embeddings) ... </img>
|
||||
img_beg = "<img>";
|
||||
img_end = "</img>";
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_internvl>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_KIMIVL:
|
||||
{
|
||||
// <|media_start|> ... (image embeddings) ... <|media_end|>
|
||||
img_beg = "<|media_start|>";
|
||||
img_end = "<|media_end|>";
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_KIMIK25:
|
||||
{
|
||||
// <|media_begin|> ... (image embeddings) ... <|media_end|>
|
||||
img_beg = "<|media_begin|>";
|
||||
img_end = "<|media_end|>";
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_LIGHTONOCR:
|
||||
{
|
||||
// <|im_start|> ... (image embeddings) ... <|im_end|>
|
||||
img_beg = "<|im_start|>";
|
||||
img_end = "<|im_end|>";
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_longest_edge>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_NEMOTRON_V2_VL:
|
||||
{
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_fixed_size>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_LFM2:
|
||||
{
|
||||
// multi-tile:
|
||||
// <|image_start|>
|
||||
// <|img_row_1_col_1|> (tile) <|img_row_1_col_2|> (tile) ...
|
||||
// <|img_thumbnail|> (thumbnail)
|
||||
// <|image_end|>
|
||||
// single-tile:
|
||||
// <|image_start|> (image) <|image_end|>
|
||||
img_beg = "<|image_start|>";
|
||||
img_end = "<|image_end|>";
|
||||
slice_tmpl = MTMD_SLICE_TMPL_LFM2;
|
||||
sli_img_start_tmpl = "<|img_row_%d_col_%d|>";
|
||||
tok_ov_img_start = {lookup_token("<|img_thumbnail|>")};
|
||||
ov_img_first = false;
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_lfm2>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GLM4V:
|
||||
{
|
||||
// <|begin_of_image|> ... (image embeddings) ... <|end_of_image|>
|
||||
img_beg = "<|begin_of_image|>";
|
||||
img_end = "<|end_of_image|>";
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_PADDLEOCR:
|
||||
{
|
||||
// <|IMAGE_START|> ... (image embeddings) ... <|IMAGE_END|>
|
||||
img_beg = "<|IMAGE_START|>";
|
||||
img_end = "<|IMAGE_END|>";
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_DEEPSEEKOCR:
|
||||
{
|
||||
img_end = "\n"; // prevent empty batch on llama-server
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr>(ctx_v);
|
||||
} break;
|
||||
default:
|
||||
throw std::runtime_error(string_format("%s: unexpected vision projector type %d\n", __func__, proj));
|
||||
}
|
||||
|
||||
// set boi/eoi
|
||||
if (proj == PROJECTOR_TYPE_GEMMA3 || proj == PROJECTOR_TYPE_GEMMA3NV) {
|
||||
// <start_of_image> ... (image embeddings) ... <end_of_image>
|
||||
img_beg = "<start_of_image>";
|
||||
img_end = "<end_of_image>";
|
||||
|
||||
} else if (proj == PROJECTOR_TYPE_IDEFICS3) {
|
||||
// https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
|
||||
slice_tmpl = MTMD_SLICE_TMPL_IDEFICS3;
|
||||
tok_ov_img_start = {lookup_token("\n\n"), lookup_token("<fake_token_around_image>"), lookup_token("<global-img>")};
|
||||
tok_ov_img_end = {lookup_token("<fake_token_around_image>")};
|
||||
tok_row_end = {lookup_token("\n")};
|
||||
sli_img_start_tmpl = "<fake_token_around_image><row_%d_col_%d>";
|
||||
|
||||
} else if (proj == PROJECTOR_TYPE_PIXTRAL) {
|
||||
// https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
|
||||
img_end = "[IMG_END]";
|
||||
|
||||
} else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL || proj == PROJECTOR_TYPE_QWEN3VL || proj == PROJECTOR_TYPE_YOUTUVL) {
|
||||
// <|vision_start|> ... (image embeddings) ... <|vision_end|>
|
||||
img_beg = "<|vision_start|>";
|
||||
img_end = "<|vision_end|>";
|
||||
|
||||
} else if (proj == PROJECTOR_TYPE_PHI4) {
|
||||
// Phi-4 uses media marker insertion only. Keep image boundary text empty.
|
||||
|
||||
} else if (proj == PROJECTOR_TYPE_LLAMA4) {
|
||||
// (more details in mtmd_context constructor)
|
||||
img_beg = "<|image_start|>";
|
||||
img_end = "<|image_end|>";
|
||||
LOG_WRN("%s: llama 4 vision is known to have degraded quality:\n"
|
||||
" https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
|
||||
|
||||
} else if (proj == PROJECTOR_TYPE_INTERNVL) {
|
||||
// <img> ... (image embeddings) ... </img>
|
||||
img_beg = "<img>";
|
||||
img_end = "</img>";
|
||||
|
||||
} else if (proj == PROJECTOR_TYPE_LIGHTONOCR) {
|
||||
// <|im_start|> ... (image embeddings) ... <|im_end|>
|
||||
img_beg = "<|im_start|>";
|
||||
img_end = "<|im_end|>";
|
||||
|
||||
} else if (proj == PROJECTOR_TYPE_LFM2) {
|
||||
// multi-tile:
|
||||
// <|image_start|>
|
||||
// <|img_row_1_col_1|> (tile) <|img_row_1_col_2|> (tile) ...
|
||||
// <|img_thumbnail|> (thumbnail)
|
||||
// <|image_end|>
|
||||
// single-tile:
|
||||
// <|image_start|> (image) <|image_end|>
|
||||
img_beg = "<|image_start|>";
|
||||
img_end = "<|image_end|>";
|
||||
slice_tmpl = MTMD_SLICE_TMPL_LFM2;
|
||||
sli_img_start_tmpl = "<|img_row_%d_col_%d|>";
|
||||
tok_ov_img_start = {lookup_token("<|img_thumbnail|>")};
|
||||
ov_img_first = false;
|
||||
} else if (proj == PROJECTOR_TYPE_GLM4V) {
|
||||
img_beg = "<|begin_of_image|>";
|
||||
img_end = "<|end_of_image|>";
|
||||
|
||||
} else if (proj == PROJECTOR_TYPE_PADDLEOCR) {
|
||||
// <|IMAGE_START|> ... (image embeddings) ... <|IMAGE_END|>
|
||||
img_beg = "<|IMAGE_START|>";
|
||||
img_end = "<|IMAGE_END|>";
|
||||
}
|
||||
GGML_ASSERT(image_preproc != nullptr);
|
||||
}
|
||||
|
||||
void init_audio() {
|
||||
GGML_ASSERT(ctx_a != nullptr);
|
||||
audio_preproc.reset();
|
||||
|
||||
projector_type proj = clip_get_projector_type(ctx_a);
|
||||
|
||||
LOG_WRN("%s: audio input is in experimental stage and may have reduced quality:\n"
|
||||
|
|
@ -347,36 +419,40 @@ struct mtmd_context {
|
|||
switch (proj) {
|
||||
case PROJECTOR_TYPE_QWEN2A:
|
||||
case PROJECTOR_TYPE_QWEN25O:
|
||||
case PROJECTOR_TYPE_ULTRAVOX:
|
||||
{
|
||||
// <|audio_bos|> ... (embeddings) ... <|audio_eos|>
|
||||
aud_beg = "<|audio_bos|>";
|
||||
aud_end = "<|audio_eos|>";
|
||||
audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_VOXTRAL:
|
||||
case PROJECTOR_TYPE_GLMA:
|
||||
{
|
||||
// [BEGIN_AUDIO] ... (embeddings) ...
|
||||
aud_beg = "[BEGIN_AUDIO]";
|
||||
audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
|
||||
audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
|
||||
break;
|
||||
{
|
||||
// <sound> ... (embeddings) ...
|
||||
aud_beg = "<sound>";
|
||||
audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_ULTRAVOX:
|
||||
case PROJECTOR_TYPE_GLMA:
|
||||
{
|
||||
audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_LFM2A:
|
||||
audio_preproc = std::make_unique<mtmd_audio_preprocessor_conformer>(ctx_a);
|
||||
break;
|
||||
{
|
||||
audio_preproc = std::make_unique<mtmd_audio_preprocessor_conformer>(ctx_a);
|
||||
} break;
|
||||
default:
|
||||
GGML_ABORT("unsupported audio projector type");
|
||||
throw std::runtime_error(string_format("%s: unexpected audio projector type %d\n", __func__, proj));
|
||||
}
|
||||
|
||||
// initialize audio preprocessor
|
||||
GGML_ASSERT(audio_preproc != nullptr);
|
||||
audio_preproc->initialize();
|
||||
|
||||
// set special tokens
|
||||
if (proj == PROJECTOR_TYPE_QWEN2A) {
|
||||
// <|audio_bos|> ... (embeddings) ... <|audio_eos|>
|
||||
aud_beg = "<|audio_bos|>";
|
||||
aud_end = "<|audio_eos|>";
|
||||
|
||||
} else if (proj == PROJECTOR_TYPE_ULTRAVOX) {
|
||||
// [BEGIN_AUDIO] ... (embeddings) ...
|
||||
aud_beg = "[BEGIN_AUDIO]";
|
||||
|
||||
} else if (proj == PROJECTOR_TYPE_MUSIC_FLAMINGO) {
|
||||
// <sound> ... (embeddings) ...
|
||||
aud_beg = "<sound>";
|
||||
}
|
||||
}
|
||||
|
||||
// get clip ctx based on chunk type
|
||||
|
|
@ -573,8 +649,9 @@ struct mtmd_tokenizer {
|
|||
std::memcpy(img_u8->buf.data(), bitmap->data.data(), img_u8->nx * img_u8->ny * 3);
|
||||
|
||||
// preprocess image
|
||||
GGML_ASSERT(ctx->image_preproc != nullptr);
|
||||
clip_image_f32_batch batch_f32;
|
||||
bool ok = clip_image_preprocess(ctx->ctx_v, img_u8.get(), &batch_f32);
|
||||
bool ok = ctx->image_preproc->preprocess(*img_u8, batch_f32);
|
||||
if (!ok) {
|
||||
LOG_ERR("Unable to preprocess image\n");
|
||||
return 2;
|
||||
|
|
@ -1225,7 +1302,8 @@ void mtmd_debug_preprocess_image(mtmd_context * ctx, const std::vector<uint8_t>
|
|||
img_u8.ny = ny;
|
||||
img_u8.buf = rgb_values;
|
||||
clip_image_f32_batch batch_f32;
|
||||
bool ok = clip_image_preprocess(ctx->ctx_v, &img_u8, &batch_f32);
|
||||
GGML_ASSERT(ctx->image_preproc != nullptr);
|
||||
bool ok = ctx->image_preproc->preprocess(img_u8, batch_f32);
|
||||
if (!ok) {
|
||||
LOG_ERR("%s: failed to preprocess image\n", __func__);
|
||||
return;
|
||||
|
|
|
|||
Loading…
Reference in New Issue