diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt index 71ad149ad3..b3cf15f9ec 100644 --- a/tools/mtmd/CMakeLists.txt +++ b/tools/mtmd/CMakeLists.txt @@ -5,6 +5,7 @@ find_package(Threads REQUIRED) add_library(mtmd mtmd.cpp mtmd-audio.cpp + mtmd-image.cpp mtmd.h mtmd-helper.cpp mtmd-helper.h diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index 4bf34e65bc..011d76bcf6 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -51,7 +51,6 @@ #define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type" #define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints" -#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution" #define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern" #define KEY_WIN_ATTN_LAYER_INDEXES "clip.vision.wa_layer_indexes" #define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size" diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h index e9c454fe69..a73e9ba38b 100644 --- a/tools/mtmd/clip-model.h +++ b/tools/mtmd/clip-model.h @@ -28,6 +28,13 @@ enum patch_merge_type { PATCH_MERGE_SPATIAL_UNPAD, }; +enum resize_algo { + RESIZE_ALGO_BILINEAR, // stretch to target resolution + RESIZE_ALGO_BICUBIC, // center-crop when aspect ratio doesn't match + RESIZE_ALGO_BICUBIC_PILLOW, + // RESIZE_ALGO_LANCZOS, // TODO +}; + struct clip_hparams { int32_t image_size = 0; int32_t patch_size = 0; @@ -37,13 +44,26 @@ struct clip_hparams { int32_t n_head = 0; int32_t n_layer = 0; // idefics3 + int32_t n_merge = 0; // number of patch merges **per-side** + + // for preprocessor int32_t image_longest_edge = 0; int32_t image_min_pixels = -1; int32_t image_max_pixels = -1; - int32_t n_merge = 0; // number of patch merges **per-side** + resize_algo image_resize_algo = RESIZE_ALGO_BICUBIC; + bool image_resize_pad = true; // if false, center-crop will be applied when resizing + std::array image_pad_color = {0, 0, 0}; + // (preprocessor) for llava-uhd style models + std::vector image_res_candidates; int32_t 
preproc_min_tiles = 0; int32_t preproc_max_tiles = 0; + resize_algo image_resize_algo_rf = RESIZE_ALGO_BICUBIC; + resize_algo image_resize_algo_ov = RESIZE_ALGO_BILINEAR; + bool image_pad_rf = true; // if true, refined image will be padded (e.g. llava-1.6) + bool image_pad_ov = false; // if true, overview image will be padded (e.g. llava-1.6) + std::array image_pad_color_rf = {0, 0, 0}; // padding color for refined image + std::array image_pad_color_ov = {0, 0, 0}; // padding color for overview image float image_mean[3]; float image_std[3]; @@ -60,8 +80,6 @@ struct clip_hparams { float eps = 1e-6; float rope_theta = 0.0; - std::vector image_res_candidates; // for llava-uhd style models - int32_t image_crop_resolution; std::unordered_set vision_feature_layer; int32_t attn_window_size = 0; int32_t n_wa_pattern = 0; diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index b7237d6616..fd1cb0dfea 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -1029,7 +1029,6 @@ struct clip_model_loader { if (is_vision) { get_u32(KEY_IMAGE_SIZE, hparams.image_size); get_u32(KEY_PATCH_SIZE, hparams.patch_size); - get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false); get_i32(KEY_MINICPMV_VERSION, hparams.minicpmv_version, false); // legacy get_u32(KEY_MINICPMV_QUERY_NUM, hparams.minicpmv_query_num, false); if (hparams.minicpmv_query_num == 0) { @@ -1075,11 +1074,6 @@ struct clip_model_loader { // default warmup value hparams.warmup_image_size = hparams.image_size; - hparams.has_llava_projector = model.proj_type == PROJECTOR_TYPE_MLP - || model.proj_type == PROJECTOR_TYPE_MLP_NORM - || model.proj_type == PROJECTOR_TYPE_LDP - || model.proj_type == PROJECTOR_TYPE_LDPV2; - { bool use_gelu = false; bool use_silu = false; @@ -1135,14 +1129,41 @@ struct clip_model_loader { // model-specific params switch (model.proj_type) { + case PROJECTOR_TYPE_MLP: + case PROJECTOR_TYPE_MLP_NORM: + case PROJECTOR_TYPE_LDP: + case PROJECTOR_TYPE_LDPV2: + case 
PROJECTOR_TYPE_COGVLM: + { + hparams.has_llava_projector = model.proj_type != PROJECTOR_TYPE_COGVLM; + hparams.image_pad_color = {122, 116, 104}; + if (!hparams.image_res_candidates.empty()) { + hparams.image_resize_pad = true; + hparams.image_resize_algo = RESIZE_ALGO_BILINEAR; + } else { + // llava-1.6 default params + hparams.image_pad_ov = false; + hparams.image_pad_rf = true; + hparams.image_pad_color_rf = {122, 116, 104}; + hparams.image_resize_algo_rf = RESIZE_ALGO_BICUBIC; + hparams.image_resize_algo_ov = RESIZE_ALGO_BILINEAR; + } + } break; + case PROJECTOR_TYPE_GLM_EDGE: + { + hparams.image_resize_pad = true; + hparams.image_resize_algo = RESIZE_ALGO_BILINEAR; + } break; case PROJECTOR_TYPE_MINICPMV: { + // use default llava-uhd preprocessing params if (hparams.minicpmv_version == 0) { hparams.minicpmv_version = 2; // default to 2 if not set } } break; case PROJECTOR_TYPE_INTERNVL: { + // use default llava-uhd preprocessing params // older version of internvl doesn't have min/max tiles, we need to provide default values for them to avoid issues hparams.preproc_min_tiles = 1; hparams.preproc_max_tiles = 12; @@ -1158,11 +1179,15 @@ struct clip_model_loader { } break; case PROJECTOR_TYPE_IDEFICS3: { + // use default llava-uhd preprocessing params get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); get_u32(KEY_PREPROC_IMAGE_SIZE, hparams.image_longest_edge, false); } break; case PROJECTOR_TYPE_LFM2: { + hparams.image_resize_algo = RESIZE_ALGO_BILINEAR; + hparams.image_resize_algo_rf = RESIZE_ALGO_BILINEAR; + hparams.image_resize_algo_ov = RESIZE_ALGO_BILINEAR; get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); // ref: https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B/blob/main/processor_config.json hparams.set_limit_image_tokens(64, 256); @@ -1170,6 +1195,7 @@ struct clip_model_loader { case PROJECTOR_TYPE_PHI4: { hparams.n_merge = 1; + hparams.image_resize_algo = RESIZE_ALGO_BILINEAR; get_u32(KEY_IMAGE_MIN_PIXELS, hparams.image_min_pixels); 
get_u32(KEY_IMAGE_MAX_PIXELS, hparams.image_max_pixels); hparams.set_warmup_n_tokens(16*16); @@ -1179,6 +1205,7 @@ struct clip_model_loader { // ref: https://huggingface.co/mistral-community/pixtral-12b/blob/main/preprocessor_config.json // TODO: verify the image_min_tokens hparams.n_merge = 1; // the original pixtral does not use patch merging + hparams.image_resize_algo = RESIZE_ALGO_BILINEAR; hparams.rope_theta = 10000.0f; get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false); hparams.set_limit_image_tokens(8, 1024); @@ -1187,6 +1214,7 @@ struct clip_model_loader { case PROJECTOR_TYPE_LIGHTONOCR: { hparams.n_merge = 1; + hparams.image_resize_algo = RESIZE_ALGO_BICUBIC; hparams.rope_theta = 10000.0f; get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false); hparams.image_longest_edge = hparams.image_size; @@ -1195,6 +1223,7 @@ struct clip_model_loader { } break; case PROJECTOR_TYPE_KIMIVL: { + hparams.image_resize_algo = RESIZE_ALGO_BILINEAR; hparams.rope_theta = 10000.0f; get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); // TODO: check kimivl preprocessor for exact values @@ -1203,6 +1232,7 @@ struct clip_model_loader { } break; case PROJECTOR_TYPE_KIMIK25: { + hparams.image_resize_algo = RESIZE_ALGO_BICUBIC; hparams.rope_theta = 10000.0f; get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); @@ -1222,6 +1252,7 @@ struct clip_model_loader { // default value (used by all model sizes in gemma 3 family) // number of patches for each **side** is reduced by a factor of 4 hparams.n_merge = 4; + hparams.image_resize_algo = RESIZE_ALGO_BILINEAR; // test model (tinygemma3) has a different value, we optionally read it get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); } break; @@ -1238,6 +1269,7 @@ struct clip_model_loader { case PROJECTOR_TYPE_QWEN3VL: { hparams.n_merge = 2; // default value for Qwen 2 and 2.5 + hparams.image_resize_algo = RESIZE_ALGO_BILINEAR; get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false); get_u32(KEY_WIN_ATTN_PATTERN, 
hparams.n_wa_pattern, model.proj_type == PROJECTOR_TYPE_QWEN25VL); // only 2.5 requires it // ref: https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/blob/main/preprocessor_config.json @@ -1253,6 +1285,8 @@ struct clip_model_loader { case PROJECTOR_TYPE_YOUTUVL: { hparams.n_merge = 2; + hparams.image_resize_algo = RESIZE_ALGO_BILINEAR; + hparams.image_resize_pad = false; get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false); get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true); std::vector wa_layer_indexes_vec; @@ -1268,6 +1302,7 @@ struct clip_model_loader { { hparams.rope_theta = 10000.0f; hparams.n_merge = 2; // default value for GLM4-V + hparams.image_resize_algo = RESIZE_ALGO_BILINEAR; get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false); hparams.set_limit_image_tokens(8, 4096); hparams.set_warmup_n_tokens(46*46); // avoid OOM on warmup @@ -1301,6 +1336,7 @@ struct clip_model_loader { case PROJECTOR_TYPE_PADDLEOCR: { hparams.n_merge = 2; + hparams.image_resize_algo = RESIZE_ALGO_BILINEAR; get_u32(KEY_IMAGE_MIN_PIXELS, hparams.image_min_pixels); get_u32(KEY_IMAGE_MAX_PIXELS, hparams.image_max_pixels); @@ -1311,6 +1347,10 @@ struct clip_model_loader { hparams.patch_size = 16; hparams.image_size = 1024; hparams.warmup_image_size = 1024; + hparams.image_resize_algo = RESIZE_ALGO_BICUBIC_PILLOW; + hparams.image_pad_color[0] = hparams.image_mean[0]; + hparams.image_pad_color[1] = hparams.image_mean[1]; + hparams.image_pad_color[2] = hparams.image_mean[2]; get_u32(KEY_SAM_N_BLOCK, hparams.sam_n_layer, true); get_u32(KEY_SAM_N_HEAD, hparams.sam_n_head, true); @@ -1326,8 +1366,13 @@ struct clip_model_loader { hparams.audio_window_len = 400; hparams.audio_hop_len = 160; } break; + case PROJECTOR_TYPE_JANUS_PRO: + { + hparams.image_pad_color = {127, 127, 127}; + hparams.image_resize_algo = RESIZE_ALGO_BILINEAR; + } break; default: - break; + throw std::runtime_error(string_format("%s: unknown vision projector type %s\n", __func__, proj_type.c_str())); } // 
sanity check @@ -2385,1397 +2430,6 @@ void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny memcpy(img->buf.data(), rgb_pixels, img->buf.size()); } -// Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not -static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) { - dst.nx = src.nx; - dst.ny = src.ny; - dst.buf.resize(src.buf.size()); - - // TODO @ngxson : seems like this could be done more efficiently on cgraph - for (size_t i = 0; i < src.buf.size(); ++i) { - int c = i % 3; // rgb - dst.buf[i] = (static_cast(src.buf[i]) / 255.0f - mean[c]) / std[c]; - } -} - -// set of tools to manipulate images -// in the future, we can have HW acceleration by allowing this struct to access 3rd party lib like imagick or opencv -struct img_tool { - enum resize_algo { - RESIZE_ALGO_BILINEAR, - RESIZE_ALGO_BICUBIC, - RESIZE_ALGO_BICUBIC_PILLOW, - // RESIZE_ALGO_LANCZOS, // TODO - }; - - static void resize( - const clip_image_u8 & src, - clip_image_u8 & dst, - const clip_image_size & target_resolution, - resize_algo algo, - bool add_padding = true, // TODO: define the behavior for add_padding = false - std::array pad_color = {0, 0, 0}) { - dst.nx = target_resolution.width; - dst.ny = target_resolution.height; - dst.buf.resize(3 * dst.nx * dst.ny); - - if (dst.nx == src.nx && dst.ny == src.ny) { - // no resize needed, simple copy - dst.buf = src.buf; - return; - } - - if (!add_padding) { - // direct resize - switch (algo) { - case RESIZE_ALGO_BILINEAR: - resize_bilinear(src, dst, target_resolution.width, target_resolution.height); - break; - case RESIZE_ALGO_BICUBIC: - resize_bicubic(src, dst, target_resolution.width, target_resolution.height); - break; - case RESIZE_ALGO_BICUBIC_PILLOW: - resize_bicubic_pillow(src, dst, target_resolution.width, target_resolution.height); - break; - default: 
- throw std::runtime_error("Unsupported resize algorithm"); - } - } else { - // resize with padding - clip_image_u8 resized_image; - float scale_w = static_cast(target_resolution.width) / src.nx; - float scale_h = static_cast(target_resolution.height) / src.ny; - float scale = std::min(scale_w, scale_h); - int new_width = std::min(static_cast(std::ceil(src.nx * scale)), target_resolution.width); - int new_height = std::min(static_cast(std::ceil(src.ny * scale)), target_resolution.height); - - switch (algo) { - case RESIZE_ALGO_BILINEAR: - resize_bilinear(src, resized_image, new_width, new_height); - break; - case RESIZE_ALGO_BICUBIC: - resize_bicubic(src, resized_image, new_width, new_height); - break; - case RESIZE_ALGO_BICUBIC_PILLOW: - resize_bicubic_pillow(src, resized_image, new_width, new_height); - break; - default: - throw std::runtime_error("Unsupported resize algorithm"); - } - - // fill dst with pad_color - fill(dst, pad_color); - - int offset_x = (target_resolution.width - new_width) / 2; - int offset_y = (target_resolution.height - new_height) / 2; - - composite(dst, resized_image, offset_x, offset_y); - } - } - - static void crop(const clip_image_u8 & image, clip_image_u8 & dst, int x, int y, int w, int h) { - dst.nx = w; - dst.ny = h; - dst.buf.resize(3 * w * h); - - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; ++j) { - int src_idx = 3 * ((y + i)*image.nx + (x + j)); - int dst_idx = 3 * (i*w + j); - dst.buf[dst_idx] = image.buf[src_idx]; - dst.buf[dst_idx + 1] = image.buf[src_idx + 1]; - dst.buf[dst_idx + 2] = image.buf[src_idx + 2]; - } - } - } - - // calculate the size of the **resized** image, while preserving the aspect ratio - // the calculated size will be aligned to the nearest multiple of align_size - // if H or W size is larger than longest_edge, it will be resized to longest_edge - static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int longest_edge) { - 
GGML_ASSERT(align_size > 0); - if (inp_size.width <= 0 || inp_size.height <= 0 || longest_edge <= 0) { - return {0, 0}; - } - - float scale = std::min(static_cast(longest_edge) / inp_size.width, - static_cast(longest_edge) / inp_size.height); - - float target_width_f = static_cast(inp_size.width) * scale; - float target_height_f = static_cast(inp_size.height) * scale; - - auto ceil_by_factor = [f = align_size](float x) { return static_cast(std::ceil(x / static_cast(f))) * f; }; - int aligned_width = ceil_by_factor(target_width_f); - int aligned_height = ceil_by_factor(target_height_f); - - return {aligned_width, aligned_height}; - } - - // calculate the size of the **resized** image, while preserving the aspect ratio - // the calculated size will have min_pixels <= W*H <= max_pixels - // this is referred as "smart_resize" in transformers code - static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int min_pixels, const int max_pixels) { - GGML_ASSERT(align_size > 0); - const int width = inp_size.width; - const int height = inp_size.height; - - auto round_by_factor = [f = align_size](float x) { return static_cast(std::round(x / static_cast(f))) * f; }; - auto ceil_by_factor = [f = align_size](float x) { return static_cast(std::ceil(x / static_cast(f))) * f; }; - auto floor_by_factor = [f = align_size](float x) { return static_cast(std::floor(x / static_cast(f))) * f; }; - - // always align up first - int h_bar = std::max(align_size, round_by_factor(height)); - int w_bar = std::max(align_size, round_by_factor(width)); - - if (h_bar * w_bar > max_pixels) { - const auto beta = std::sqrt(static_cast(height * width) / max_pixels); - h_bar = std::max(align_size, floor_by_factor(height / beta)); - w_bar = std::max(align_size, floor_by_factor(width / beta)); - } else if (h_bar * w_bar < min_pixels) { - const auto beta = std::sqrt(static_cast(min_pixels) / (height * width)); - h_bar = ceil_by_factor(height * beta); - 
w_bar = ceil_by_factor(width * beta); - } - - return {w_bar, h_bar}; - } - - // draw src image into dst image at offset (offset_x, offset_y) - static void composite(clip_image_u8 & dst, const clip_image_u8 & src, int offset_x, int offset_y) { - for (int y = 0; y < src.ny; ++y) { - for (int x = 0; x < src.nx; ++x) { - int dx = x + offset_x; - int dy = y + offset_y; - // skip pixels that would be out of bounds in the destination - if (dx < 0 || dy < 0 || dx >= dst.nx || dy >= dst.ny) { - continue; - } - size_t dst_idx = 3 * (static_cast(dy) * dst.nx + static_cast(dx)); - size_t src_idx = 3 * (static_cast(y) * src.nx + static_cast(x)); - dst.buf[dst_idx + 0] = src.buf[src_idx + 0]; - dst.buf[dst_idx + 1] = src.buf[src_idx + 1]; - dst.buf[dst_idx + 2] = src.buf[src_idx + 2]; - } - } - } - - // fill the image with a solid color - static void fill(clip_image_u8 & img, const std::array & color) { - for (size_t i = 0; i < img.buf.size(); i += 3) { - img.buf[i] = color[0]; - img.buf[i + 1] = color[1]; - img.buf[i + 2] = color[2]; - } - } - -private: - // Bilinear resize function - static void resize_bilinear(const clip_image_u8 & src, clip_image_u8 & dst, int target_width, int target_height) { - dst.nx = target_width; - dst.ny = target_height; - dst.buf.resize(3 * target_width * target_height); - - float x_ratio = static_cast(src.nx - 1) / target_width; - float y_ratio = static_cast(src.ny - 1) / target_height; - - for (int y = 0; y < target_height; y++) { - for (int x = 0; x < target_width; x++) { - float px = x_ratio * x; - float py = y_ratio * y; - int x_floor = static_cast(px); - int y_floor = static_cast(py); - float x_lerp = px - x_floor; - float y_lerp = py - y_floor; - - for (int c = 0; c < 3; c++) { - float top = lerp( - static_cast(src.buf[3 * (y_floor * src.nx + x_floor) + c]), - static_cast(src.buf[3 * (y_floor * src.nx + (x_floor + 1)) + c]), - x_lerp - ); - float bottom = lerp( - static_cast(src.buf[3 * ((y_floor + 1) * src.nx + x_floor) + c]), - 
static_cast(src.buf[3 * ((y_floor + 1) * src.nx + (x_floor + 1)) + c]), - x_lerp - ); - dst.buf[3 * (y * target_width + x) + c] = static_cast(lerp(top, bottom, y_lerp)); - } - } - } - } - - // Bicubic resize function - // part of image will be cropped if the aspect ratio is different - static bool resize_bicubic(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) { - const int nx = img.nx; - const int ny = img.ny; - - dst.nx = target_width; - dst.ny = target_height; - dst.buf.resize(3 * target_width * target_height); - - float Cc; - float C[5] = {}; - float d0, d2, d3, a0, a1, a2, a3; - int i, j, k, jj; - int x, y; - float dx, dy; - float tx, ty; - - tx = (float)nx / (float)target_width; - ty = (float)ny / (float)target_height; - - // Bicubic interpolation; adapted from ViT.cpp, inspired from : - // -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36 - // -> https://en.wikipedia.org/wiki/Bicubic_interpolation - - for (i = 0; i < target_height; i++) { - for (j = 0; j < target_width; j++) { - x = (int)(tx * j); - y = (int)(ty * i); - - dx = tx * j - x; - dy = ty * i - y; - - for (k = 0; k < 3; k++) { - for (jj = 0; jj <= 3; jj++) { - d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; - d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; - d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; - a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; - - a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3; - a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2; - a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3; - - C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx; - - d0 = C[0] - 
C[1]; - d2 = C[2] - C[1]; - d3 = C[3] - C[1]; - a0 = C[1]; - a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3; - a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2; - a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3; - Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy; - - const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f); - dst.buf[(i * target_width + j) * 3 + k] = float(Cc2); - } - } - } - } - - return true; - } - - // Bicubic resize function using Pillow's ImagingResample algorithm - // Adapted from https://github.com/python-pillow/Pillow/blob/main/src/libImaging/Resample.c - // - // Key Difference with resize_bicubic: - // 1. Uses separable filtering: horizontal pass followed by vertical pass - // 2. Pre-computes normalized filter coefficients for each output pixel - // 3. Applies convolution using fixed-point integer arithmetic for performance - static bool resize_bicubic_pillow(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) { - // Fixed-point precision: 22 bits = 32 (int32_t) - 8 (uint8_t pixels) - 2 (headroom for accumulation) - // This allows encoding fractional weights as integers: weight * 2^22 - const int PRECISION_BITS = 32 - 8 - 2; - - // Bicubic filter function with a = -0.5 (Note that GGML/PyTorch takes a = -0.75) - // Returns filter weight for distance x from pixel center - // Support: [-2, 2], meaning the filter influences pixels within 2 units of distance - auto bicubic_filter = [](double x) -> double { - constexpr double a = -0.5; - if (x < 0.0) { - x = -x; - } - if (x < 1.0) { - return ((a + 2.0) * x - (a + 3.0)) * x * x + 1; - } - if (x < 2.0) { - return (((x - 5) * x + 8) * x - 4) * a; - } - return 0.0; // Zero outside [-2, 2] - }; - - // Filter support radius: bicubic extends 2 pixels in each direction - constexpr double filter_support = 2.0; - - // Clipping function for 8-bit values - auto clip8 = [](int val) -> uint8_t { - if (val < 0) return 0; - if (val > 255) return 255; - return static_cast(val); - }; - - 
// Precompute filter coefficients for ONE dimension (horizontal or vertical) - // - // Parameters: - // inSize - Number of pixels in input dimension (e.g., src_width or src_height) - // outSize - Number of pixels in output dimension (e.g., target_width or target_height) - // bounds - [OUTPUT] Array of size outSize*2 storing input pixel ranges: - // bounds[xx*2+0] = first input pixel index for output pixel xx (xmin) - // bounds[xx*2+1] = number of input pixels for output pixel xx (xcnt) - // weights - [OUTPUT] Array of size outSize*ksize storing fixed-point filter weights: - // kk[xx*ksize + x] = weight for input pixel x contributing to output pixel xx - // - // Returns: kernel size (ksize) - number of input pixels that contribute to each output pixel - auto precompute_weights = [&](int inSize, int outSize, - std::vector & bounds, std::vector & weights) -> int { - double support, scale, filterscale; - double center, ww, ss; - int xx, x, ksize, xmin, xmax, xcnt; - - // Calculate scaling factor: ratio of input range to output size - filterscale = scale = (double)inSize / outSize; - // For upsampling (scale < 1), keep filterscale = 1 to maintain filter sharpness - // For downsampling (scale > 1), widen filter to prevent aliasing - if (filterscale < 1.0) { - filterscale = 1.0; - } - - // Determine filter support radius and kernel size - support = filter_support * filterscale; // Widen filter when downsampling - ksize = static_cast(std::ceil(support)) * 2 + 1; // Total pixels in kernel - - std::vector pre_weights(outSize * ksize); // Temporary weights - bounds.resize(outSize * 2); - - // For each output pixel, compute its filter coefficients - for (xx = 0; xx < outSize; xx++) { - // Calculate the center position in input space (pixel-center convention: +0.5) - center = (xx + 0.5) * scale; - ww = 0.0; // Sum of weights for normalization - ss = 1.0 / filterscale; // Scale factor for filter function - - // Determine the range of input pixels that contribute to this output 
pixel - xmin = static_cast(center - support + 0.5); - if (xmin < 0) { - xmin = 0; - } - - xmax = static_cast(center + support + 0.5); - if (xmax > inSize) { - xmax = inSize; - } - - xcnt = xmax - xmin; - - // Compute filter weights for each contributing input pixel - for (x = 0; x < xcnt; x++) { - // Distance from input pixel center to output pixel center in input space - double w = bicubic_filter((x + xmin - center + 0.5) * ss); - pre_weights[xx * ksize + x] = w; - ww += w; // Accumulate for normalization - } - - // Normalize weights to sum to 1.0 (preserves brightness) - for (x = 0; x < xcnt; x++) { - if (ww != 0.0) { - pre_weights[xx * ksize + x] /= ww; - } - } - - // Zero-pad remaining kernel positions - for (; x < ksize; x++) { - pre_weights[xx * ksize + x] = 0; - } - - // Store input pixel range for this output pixel - bounds[xx * 2 + 0] = xmin; - bounds[xx * 2 + 1] = xcnt; - } - - // Convert floating-point coefficients to fixed-point integers - // Formula: int32 = round(float * 2^PRECISION_BITS) - weights.resize(outSize * ksize); - for (int i = 0; i < outSize * ksize; i++) { - if (pre_weights[i] < 0) { - weights[i] = static_cast(-0.5 + pre_weights[i] * (1 << PRECISION_BITS)); - } else { - weights[i] = static_cast(0.5 + pre_weights[i] * (1 << PRECISION_BITS)); - } - } - - return ksize; - }; - - // Horizontal resampling pass - // Resizes width from imIn.nx to imOut.nx, preserving height - auto resample_horizontal = [&](const clip_image_u8 & imIn, clip_image_u8 & imOut, - int ksize, const std::vector & bounds, const std::vector & weights) { - imOut.ny = imIn.ny; - imOut.buf.resize(3 * imOut.nx * imOut.ny); - - // Process each row independently - for (int yy = 0; yy < imOut.ny; yy++) { - // For each output pixel in this row - for (int xx = 0; xx < imOut.nx; xx++) { - // Get the range of input pixels and filter coefficients - int xmin = bounds[xx * 2 + 0]; // First input pixel index - int xcnt = bounds[xx * 2 + 1]; // Number of input pixels - - // Initialize 
accumulators for RGB channels with rounding bias (0.5 in fixed-point) - int32_t ss0 = 1 << (PRECISION_BITS - 1); - int32_t ss1 = 1 << (PRECISION_BITS - 1); - int32_t ss2 = 1 << (PRECISION_BITS - 1); - - // Convolve: sum weighted input pixels - for (int x = 0; x < xcnt; x++) { - int src_idx = ((yy * imIn.nx) + (x + xmin)) * 3; - ss0 += static_cast(imIn.buf[src_idx + 0]) * weights[xx * ksize + x]; // R channel - ss1 += static_cast(imIn.buf[src_idx + 1]) * weights[xx * ksize + x]; // G channel - ss2 += static_cast(imIn.buf[src_idx + 2]) * weights[xx * ksize + x]; // B channel - } - - // Convert back from fixed-point (divide by 2^PRECISION_BITS) and clamp to [0,255] - int dst_idx = (yy * imOut.nx + xx) * 3; - imOut.buf[dst_idx + 0] = clip8(ss0 >> PRECISION_BITS); - imOut.buf[dst_idx + 1] = clip8(ss1 >> PRECISION_BITS); - imOut.buf[dst_idx + 2] = clip8(ss2 >> PRECISION_BITS); - } - } - }; - - // Vertical resampling pass - // Resizes height from imIn.ny to imOut.ny, preserving width - auto resample_vertical = [&](const clip_image_u8 & imIn, clip_image_u8 & imOut, - int ksize, const std::vector & bounds, const std::vector & weight) { - imOut.nx = imIn.nx; - imOut.buf.resize(3 * imOut.nx * imOut.ny); - - // For each output row - for (int yy = 0; yy < imOut.ny; yy++) { - // Get the range of input rows and filter coefficients - int ymin = bounds[yy * 2 + 0]; // First input row index - int ycnt = bounds[yy * 2 + 1]; // Number of input rows - - // Process each column in this output row - for (int xx = 0; xx < imOut.nx; xx++) { - // Initialize accumulators for RGB channels with rounding bias - int32_t ss0 = 1 << (PRECISION_BITS - 1); - int32_t ss1 = 1 << (PRECISION_BITS - 1); - int32_t ss2 = 1 << (PRECISION_BITS - 1); - - // Convolve: sum weighted input pixels vertically - for (int y = 0; y < ycnt; y++) { - int src_idx = ((y + ymin) * imIn.nx + xx) * 3; - ss0 += static_cast(imIn.buf[src_idx + 0]) * weight[yy * ksize + y]; // R channel - ss1 += static_cast(imIn.buf[src_idx + 1]) 
* weight[yy * ksize + y]; // G channel - ss2 += static_cast(imIn.buf[src_idx + 2]) * weight[yy * ksize + y]; // B channel - } - - // Convert back from fixed-point and clamp to [0,255] - int dst_idx = (yy * imOut.nx + xx) * 3; - imOut.buf[dst_idx + 0] = clip8(ss0 >> PRECISION_BITS); - imOut.buf[dst_idx + 1] = clip8(ss1 >> PRECISION_BITS); - imOut.buf[dst_idx + 2] = clip8(ss2 >> PRECISION_BITS); - } - } - }; - - // Main resampling logic using separable two-pass approach - const int src_width = img.nx; - const int src_height = img.ny; - - dst.nx = target_width; - dst.ny = target_height; - - bool need_horizontal = (target_width != src_width); - bool need_vertical = (target_height != src_height); - - // Precompute filter coefficients for both dimensions - std::vector bounds_horiz, bounds_vert; - std::vector weights_horiz, weights_vert; - int ksize_horiz = 0, ksize_vert = 0; - - if (need_horizontal) { - ksize_horiz = precompute_weights(src_width, target_width, bounds_horiz, weights_horiz); - } - - if (need_vertical) { - ksize_vert = precompute_weights(src_height, target_height, bounds_vert, weights_vert); - } - - // Perform two-pass resampling - if (need_horizontal && need_vertical) { - // Both horizontal and vertical - clip_image_u8 temp; - temp.nx = target_width; - resample_horizontal(img, temp, ksize_horiz, bounds_horiz, weights_horiz); - resample_vertical(temp, dst, ksize_vert, bounds_vert, weights_vert); - } else if (need_horizontal) { - // Only horizontal - resample_horizontal(img, dst, ksize_horiz, bounds_horiz, weights_horiz); - } else if (need_vertical) { - // Only vertical - resample_vertical(img, dst, ksize_vert, bounds_vert, weights_vert); - } else { - // No resizing needed - direct copy - dst.buf = img.buf; - } - - return true; - } - - static inline int clip(int x, int lower, int upper) { - return std::max(lower, std::min(x, upper)); - } - - // Linear interpolation between two points - static inline float lerp(float s, float e, float t) { - return s + (e - 
s) * t; - } -}; - -/** - * implementation of LLaVA-UHD: - * - https://arxiv.org/pdf/2403.11703 - * - https://github.com/thunlp/LLaVA-UHD - * - https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118 - * - * overview: - * - an image always have a single overview (downscaled image) - * - an image can have 0 or multiple slices, depending on the image size - * - each slice can then be considered as a separate image - * - * for example: - * - * [overview] --> [slice 1] --> [slice 2] - * | | - * +--> [slice 3] --> [slice 4] - */ -struct llava_uhd { - struct slice_coordinates { - int x; - int y; - clip_image_size size; - }; - - struct slice_instructions { - clip_image_size overview_size; // size of downscaled image - clip_image_size refined_size; // size of image right before slicing (must be multiple of slice size) - clip_image_size grid_size; // grid_size.width * grid_size.height = number of slices - std::vector slices; - - img_tool::resize_algo interpolation_overview = img_tool::RESIZE_ALGO_BILINEAR; - bool padding_overview = false; // if true, refine image will be padded to the grid size (e.g. llava-1.6) - std::array pad_color_overview = {0, 0, 0}; - - img_tool::resize_algo interpolation_refined = img_tool::RESIZE_ALGO_BICUBIC; - bool padding_refined = false; // if true, refine image will be padded to the grid size (e.g. 
llava-1.6) - std::array pad_color_refined = {0, 0, 0}; - }; - - static slice_instructions get_slice_instructions(struct clip_ctx * ctx, const clip_image_size & original_size) { - slice_instructions res; - const int patch_size = clip_get_patch_size(ctx); - const int slice_size = clip_get_image_size(ctx); - const int original_width = original_size.width; - const int original_height = original_size.height; - - const bool has_slices = original_size.width > slice_size || original_size.height > slice_size; - const bool has_pinpoints = !ctx->model.hparams.image_res_candidates.empty(); - - if (!has_slices) { - // skip slicing logic - res.overview_size = clip_image_size{slice_size, slice_size}; - res.refined_size = clip_image_size{0, 0}; - res.grid_size = clip_image_size{0, 0}; - - return res; - } - - if (has_pinpoints) { - // has pinpoints, use them to calculate the grid size (e.g. llava-1.6) - auto refine_size = llava_uhd::select_best_resolution( - original_size, - ctx->model.hparams.image_res_candidates); - res.overview_size = clip_image_size{slice_size, slice_size}; - res.refined_size = refine_size; - res.grid_size = clip_image_size{0, 0}; - res.padding_refined = true; - res.interpolation_refined = img_tool::RESIZE_ALGO_BILINEAR; // preserve old behavior when padding - - LOG_DBG("%s: using pinpoints for slicing\n", __func__); - LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d\n", - __func__, original_width, original_height, - res.overview_size.width, res.overview_size.height, - res.refined_size.width, res.refined_size.height); - - for (int y = 0; y < refine_size.height; y += slice_size) { - for (int x = 0; x < refine_size.width; x += slice_size) { - slice_coordinates slice; - slice.x = x; - slice.y = y; - slice.size.width = std::min(slice_size, refine_size.width - x); - slice.size.height = std::min(slice_size, refine_size.height - y); - res.slices.push_back(slice); - LOG_DBG("%s: slice %d: x=%d, y=%d, size=%dx%d\n", - __func__, 
(int)res.slices.size() - 1, - slice.x, slice.y, slice.size.width, slice.size.height); - } - } - - res.grid_size.height = refine_size.height / slice_size; - res.grid_size.width = refine_size.width / slice_size; - LOG_DBG("%s: grid size: %d x %d\n", __func__, res.grid_size.width, res.grid_size.height); - - return res; - } - - // no pinpoints, dynamically calculate the grid size (e.g. minicpmv) - - auto best_size = get_best_resize(original_size, slice_size, patch_size, !has_slices); - res.overview_size = best_size; - - { - const int max_slice_nums = 9; // TODO: this is only used by minicpmv, maybe remove it - const float log_ratio = log((float)original_width / original_height); - const float ratio = (float)original_width * original_height / (slice_size * slice_size); - const int multiple = fmin(ceil(ratio), max_slice_nums); - - auto best_grid = get_best_grid(max_slice_nums, multiple, log_ratio); - auto refine_size = get_refine_size(original_size, best_grid, slice_size, patch_size, true); - res.grid_size = best_grid; - res.refined_size = refine_size; - - LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n", - __func__, original_width, original_height, - res.overview_size.width, res.overview_size.height, - res.refined_size.width, res.refined_size.height, - res.grid_size.width, res.grid_size.height); - - int width = refine_size.width; - int height = refine_size.height; - int grid_x = int(width / best_grid.width); - int grid_y = int(height / best_grid.height); - for (int patches_y = 0, ic = 0; - patches_y < refine_size.height && ic < best_grid.height; - patches_y += grid_y, ic += 1) { - for (int patches_x = 0, jc = 0; - patches_x < refine_size.width && jc < best_grid.width; - patches_x += grid_x, jc += 1) { - slice_coordinates slice; - slice.x = patches_x; - slice.y = patches_y; - slice.size.width = grid_x; - slice.size.height = grid_y; - res.slices.push_back(slice); - LOG_DBG("%s: slice %d: x=%d, y=%d, size=%dx%d\n", - 
__func__, (int)res.slices.size() - 1, - slice.x, slice.y, slice.size.width, slice.size.height); - } - } - } - - return res; - } - - static std::vector slice_image(const clip_image_u8 * img, const slice_instructions & inst, bool overview_first = true) { - std::vector output; - - // resize to overview size - clip_image_u8_ptr resized_img(clip_image_u8_init()); - img_tool::resize(*img, *resized_img, inst.overview_size, inst.interpolation_overview, - inst.padding_overview, inst.pad_color_overview); - if (overview_first) { - output.push_back(std::move(resized_img)); - } - - if (inst.slices.empty()) { - // no slices, just return the resized image - if (!overview_first) { - output.push_back(std::move(resized_img)); - } - return output; - } - - // resize to refined size - clip_image_u8_ptr refined_img(clip_image_u8_init()); - img_tool::resize(*img, *refined_img, inst.refined_size, inst.interpolation_refined, - inst.padding_refined, inst.pad_color_refined); - - // create slices - for (const auto & slice : inst.slices) { - int x = slice.x; - int y = slice.y; - int w = slice.size.width; - int h = slice.size.height; - - clip_image_u8_ptr img_slice(clip_image_u8_init()); - img_tool::crop(*refined_img, *img_slice, x, y, w, h); - output.push_back(std::move(img_slice)); - } - - if (!overview_first) { - output.push_back(std::move(resized_img)); - } - - return output; - } - -private: - static clip_image_size get_best_resize(const clip_image_size & original_size, int scale_resolution, int patch_size, bool allow_upscale = false) { - int width = original_size.width; - int height = original_size.height; - if ((width * height > scale_resolution * scale_resolution) || allow_upscale) { - float r = static_cast(width) / height; - height = static_cast(scale_resolution / std::sqrt(r)); - width = static_cast(height * r); - } - clip_image_size res; - res.width = ensure_divide(width, patch_size); - res.height = ensure_divide(height, patch_size); - return res; - } - - static clip_image_size 
resize_maintain_aspect_ratio(const clip_image_size & orig, const clip_image_size & target_max) { - float scale_width = static_cast(target_max.width) / orig.width; - float scale_height = static_cast(target_max.height) / orig.height; - float scale = std::min(scale_width, scale_height); - return clip_image_size{ - static_cast(orig.width * scale), - static_cast(orig.height * scale), - }; - } - - /** - * Selects the best resolution from a list of possible resolutions based on the original size. - * - * For example, when given a list of resolutions: - * - 100x100 - * - 200x100 - * - 100x200 - * - 200x200 - * - * And an input image of size 111x200, then 100x200 is the best fit (least wasted resolution). - * - * @param original_size The original size of the image - * @param possible_resolutions A list of possible resolutions - * @return The best fit resolution - */ - static clip_image_size select_best_resolution(const clip_image_size & original_size, const std::vector & possible_resolutions) { - clip_image_size best_fit; - int min_wasted_area = std::numeric_limits::max(); - int max_effective_resolution = 0; - - for (const clip_image_size & candidate : possible_resolutions) { - auto target_size = resize_maintain_aspect_ratio(original_size, candidate); - int effective_resolution = std::min( - target_size.width * target_size.height, - original_size.width * original_size.height); - int wasted_area = (candidate.width * candidate.height) - effective_resolution; - - if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_area < min_wasted_area)) { - max_effective_resolution = effective_resolution; - min_wasted_area = wasted_area; - best_fit = candidate; - } - - LOG_DBG("%s: candidate: %d x %d, target: %d x %d, wasted: %d, effective: %d\n", __func__, candidate.width, candidate.height, target_size.width, target_size.height, wasted_area, effective_resolution); - } - - return best_fit; - } - - static int ensure_divide(int 
length, int patch_size) { - return std::max(static_cast(std::round(static_cast(length) / patch_size) * patch_size), patch_size); - } - - static clip_image_size get_refine_size(const clip_image_size & original_size, const clip_image_size & grid, int scale_resolution, int patch_size, bool allow_upscale = false) { - int width = original_size.width; - int height = original_size.height; - int grid_x = grid.width; - int grid_y = grid.height; - - int refine_width = ensure_divide(width, grid_x); - int refine_height = ensure_divide(height, grid_y); - - clip_image_size grid_size; - grid_size.width = refine_width / grid_x; - grid_size.height = refine_height / grid_y; - - auto best_grid_size = get_best_resize(grid_size, scale_resolution, patch_size, allow_upscale); - int best_grid_width = best_grid_size.width; - int best_grid_height = best_grid_size.height; - - clip_image_size refine_size; - refine_size.width = best_grid_width * grid_x; - refine_size.height = best_grid_height * grid_y; - return refine_size; - } - - static clip_image_size get_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) { - std::vector candidate_split_grids_nums; - for (int i : {multiple - 1, multiple, multiple + 1}) { - if (i == 1 || i > max_slice_nums) { - continue; - } - candidate_split_grids_nums.push_back(i); - } - - std::vector candidate_grids; - for (int split_grids_nums : candidate_split_grids_nums) { - int m = 1; - while (m <= split_grids_nums) { - if (split_grids_nums % m == 0) { - candidate_grids.push_back(clip_image_size{m, split_grids_nums / m}); - } - ++m; - } - } - - clip_image_size best_grid{1, 1}; - float min_error = std::numeric_limits::infinity(); - for (const auto& grid : candidate_grids) { - float error = std::abs(log_ratio - std::log(1.0 * grid.width / grid.height)); - if (error < min_error) { - best_grid = grid; - min_error = error; - } - } - return best_grid; - } -}; - -// ref: 
https://github.com/huggingface/transformers/blob/v5.1.0/src/transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py -// some of the logic is similar to llava_uhd, but with different hyperparameters and some logic is unique (e.g. grid layout) -struct lfm2_vl_image_processor { - // ref: https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B/blob/main/processor_config.json - static constexpr int min_tiles = 2; - static constexpr int max_tiles = 10; - static constexpr float max_pixels_tolerance = 2.0f; - static constexpr int tile_size = 512; - - static llava_uhd::slice_instructions get_slice_instructions(struct clip_ctx * ctx, const clip_image_size & original_size) { - llava_uhd::slice_instructions inst; - const auto & params = ctx->model.hparams; - const int align_size = params.patch_size * params.n_merge; - - inst.interpolation_overview = img_tool::RESIZE_ALGO_BILINEAR; - inst.interpolation_refined = img_tool::RESIZE_ALGO_BILINEAR; - inst.overview_size = img_tool::calc_size_preserved_ratio(original_size, align_size, params.image_min_pixels, params.image_max_pixels); - - // tile if either dimension exceeds tile_size with tolerance - const bool needs_tiling = original_size.width > tile_size * max_pixels_tolerance || original_size.height > tile_size * max_pixels_tolerance; - - if (!needs_tiling) { - inst.refined_size = clip_image_size{0, 0}; - inst.grid_size = clip_image_size{0, 0}; - return inst; - } - - const clip_image_size grid = get_grid_layout(original_size.height, original_size.width); - - inst.grid_size = grid; - inst.refined_size = clip_image_size{tile_size * grid.width, tile_size * grid.height}; - - LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n", - __func__, - original_size.width, original_size.height, - inst.overview_size.width, inst.overview_size.height, - inst.refined_size.width, inst.refined_size.height, - grid.width, grid.height); - - for (int row = 0; row < grid.height; row++) { - for (int col = 0; 
col < grid.width; col++) { - llava_uhd::slice_coordinates slice; - slice.x = col * tile_size; - slice.y = row * tile_size; - slice.size = clip_image_size{tile_size, tile_size}; - inst.slices.push_back(slice); - LOG_DBG("%s: slice %d: x=%d, y=%d, size=%d x %d\n", - __func__, (int)inst.slices.size() - 1, - slice.x, slice.y, slice.size.width, slice.size.height); - } - } - - return inst; - } - -private: - static clip_image_size find_closest_aspect_ratio( - float aspect_ratio, - const std::vector & target_ratios, - int width, int height) { - float best_ratio_diff = std::numeric_limits::max(); - clip_image_size best_ratio = {1, 1}; - const float area = static_cast(width * height); - - for (const auto & ratio : target_ratios) { - const float target_aspect_ratio = static_cast(ratio.width) / ratio.height; - const float ratio_diff = std::abs(aspect_ratio - target_aspect_ratio); - if (ratio_diff < best_ratio_diff) { - best_ratio_diff = ratio_diff; - best_ratio = ratio; - } else if (ratio_diff == best_ratio_diff) { - const float target_area = static_cast(tile_size * tile_size * ratio.width * ratio.height); - if (area > 0.5f * target_area) { - best_ratio = ratio; - } - } - } - return best_ratio; - } - - static std::vector get_target_ratios() { - std::vector ratios; - for (int n = min_tiles; n <= max_tiles; n++) { - for (int w = 1; w <= n; w++) { - for (int h = 1; h <= n; h++) { - if (w * h >= min_tiles && w * h <= max_tiles) { - bool found = false; - for (const auto & r : ratios) { - if (r.width == w && r.height == h) { - found = true; - break; - } - } - if (!found) { - ratios.push_back({w, h}); - } - } - } - } - } - std::sort(ratios.begin(), ratios.end(), [](const clip_image_size & a, const clip_image_size & b) { - return a.width * a.height < b.width * b.height; - }); - return ratios; - } - - static clip_image_size get_grid_layout(int height, int width) { - const float aspect_ratio = static_cast(width) / height; - const auto ratios = get_target_ratios(); - return 
find_closest_aspect_ratio(aspect_ratio, ratios, width, height); - } -}; - -// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector -// res_imgs memory is being allocated here, previous allocations will be freed if found -bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, struct clip_image_f32_batch * res_imgs) { - clip_image_size original_size{img->nx, img->ny}; - auto & params = ctx->model.hparams; - - switch (ctx->proj_type()) { - case PROJECTOR_TYPE_MINICPMV: - { - auto const inst = llava_uhd::get_slice_instructions(ctx, original_size); - std::vector imgs = llava_uhd::slice_image(img, inst); - - for (size_t i = 0; i < imgs.size(); ++i) { - // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp"); - clip_image_f32_ptr res(clip_image_f32_init()); - normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std); - res_imgs->entries.push_back(std::move(res)); - } - - res_imgs->grid_x = inst.grid_size.width; - res_imgs->grid_y = inst.grid_size.height; - } break; - - case PROJECTOR_TYPE_QWEN2VL: - case PROJECTOR_TYPE_QWEN25VL: - case PROJECTOR_TYPE_QWEN3VL: - case PROJECTOR_TYPE_GLM4V: - case PROJECTOR_TYPE_PADDLEOCR: - { - GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0); - clip_image_u8 resized; - const clip_image_size new_size = img_tool::calc_size_preserved_ratio( - original_size, - params.patch_size * 2, - params.image_min_pixels, - params.image_max_pixels); - img_tool::resize(*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR, false); - // clip_image_save_to_bmp(resized, "preproc.bmp"); - clip_image_f32_ptr img_f32(clip_image_f32_init()); - // clip_image_f32_ptr res(clip_image_f32_init()); - normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std); - // res_imgs->data[0] = *res; - res_imgs->entries.push_back(std::move(img_f32)); - } break; - 
case PROJECTOR_TYPE_YOUTUVL: - { - const int patch_size = params.patch_size; // typically 16 - const int merge_size = params.n_merge; // typically 2 - const int align_size = patch_size * merge_size; // 32 - - const int max_num_patches = params.image_max_pixels > 0 ? - params.image_max_pixels / (patch_size * patch_size) : 256; - - // Linear search for optimal scale to fit within max_num_patches - float scale = 1.0f; - int target_height = original_size.height; - int target_width = original_size.width; - - auto get_scaled_image_size = [align_size](float scale, int size) -> int { - float scaled_size = size * scale; - // Round up to nearest multiple of align_size - int aligned = static_cast(std::ceil(scaled_size / align_size)) * align_size; - // Ensure at least one patch - return std::max(align_size, aligned); - }; - - // Linear search with 0.02 step size - while (scale > 0.0f) { - target_height = get_scaled_image_size(scale, original_size.height); - target_width = get_scaled_image_size(scale, original_size.width); - - int num_patches_h = target_height / patch_size; - int num_patches_w = target_width / patch_size; - int num_patches = num_patches_h * num_patches_w; - - if (num_patches > max_num_patches) { - scale -= 0.02f; - } else { - break; - } - } - - clip_image_size new_size = {target_width, target_height}; - - // Resize the image - clip_image_u8 resized; - img_tool::resize(*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR, false); - - // Normalize to float32 - clip_image_f32_ptr img_f32(clip_image_f32_init()); - normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std); - - // Add to results - res_imgs->entries.push_back(std::move(img_f32)); - } break; - - case PROJECTOR_TYPE_IDEFICS3: - { - // The refined size has two steps: - // 1. Resize w/ aspect-ratio preserving such that the longer side is - // the preprocessor longest size - // 2. 
Resize w/out preserving aspect ratio such that both sides are - // multiples of image_size (always rounding up) - // - // CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/image_processing_idefics3.py#L737 - const clip_image_size refined_size = img_tool::calc_size_preserved_ratio( - original_size, params.image_size, params.image_longest_edge); - // LOG_INF("%s: original size: %d x %d, refined size: %d x %d\n", - // __func__, original_size.width, original_size.height, - // refined_size.width, refined_size.height); - - llava_uhd::slice_instructions instructions; - instructions.overview_size = clip_image_size{params.image_size, params.image_size}; - instructions.refined_size = refined_size; - instructions.grid_size = clip_image_size{ - static_cast(std::ceil(static_cast(refined_size.width) / params.image_size)), - static_cast(std::ceil(static_cast(refined_size.height) / params.image_size)), - }; - for (int y = 0; y < refined_size.height; y += params.image_size) { - for (int x = 0; x < refined_size.width; x += params.image_size) { - // LOG_INF("%s: adding slice at x=%d, y=%d\n", __func__, x, y); - instructions.slices.push_back(llava_uhd::slice_coordinates{ - /* x */x, - /* y */y, - /* size */clip_image_size{ - std::min(params.image_size, refined_size.width - x), - std::min(params.image_size, refined_size.height - y) - } - }); - } - } - auto imgs = llava_uhd::slice_image(img, instructions); - - // cast and normalize to f32 - for (size_t i = 0; i < imgs.size(); ++i) { - // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp"); - clip_image_f32_ptr res(clip_image_f32_init()); - normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std); - res_imgs->entries.push_back(std::move(res)); - } - - res_imgs->grid_x = instructions.grid_size.width; - res_imgs->grid_y = instructions.grid_size.height; - } break; - case PROJECTOR_TYPE_INTERNVL: // support dynamic high-resolution - { - 
GGML_ASSERT(!params.image_res_candidates.empty()); - auto const inst = llava_uhd::get_slice_instructions(ctx, original_size); - std::vector imgs = llava_uhd::slice_image(img, inst, false); - - for (size_t i = 0; i < imgs.size(); ++i) { - clip_image_f32_ptr res(clip_image_f32_init()); - normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std); - res_imgs->entries.push_back(std::move(res)); - } - } break; - case PROJECTOR_TYPE_GLM_EDGE: - case PROJECTOR_TYPE_GEMMA3: - case PROJECTOR_TYPE_NEMOTRON_V2_VL: - { - clip_image_u8 resized_image; - int sz = params.image_size; - img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR); - clip_image_f32_ptr img_f32(clip_image_f32_init()); - //clip_image_save_to_bmp(resized_image, "resized.bmp"); - normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std); - res_imgs->entries.push_back(std::move(img_f32)); - } break; - - case PROJECTOR_TYPE_GEMMA3NV: - { - clip_image_u8 resized_image; - int sz = params.image_size; - img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR, false); - clip_image_f32_ptr img_f32(clip_image_f32_init()); - normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std); - res_imgs->entries.push_back(std::move(img_f32)); - } break; - - case PROJECTOR_TYPE_JANUS_PRO: - { - // Janus Pro preprocessing: pad to square with gray(127), resize to 384x384 - const std::array pad_color = {127, 127, 127}; - clip_image_u8 resized_image; - int sz = params.image_size; - img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color); - clip_image_f32_ptr img_f32(clip_image_f32_init()); - normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std); - res_imgs->entries.push_back(std::move(img_f32)); - } break; - - case PROJECTOR_TYPE_PHI4: - case PROJECTOR_TYPE_PIXTRAL: - { - GGML_ASSERT(params.image_min_pixels > 0 && 
params.image_max_pixels > 0); - clip_image_u8 resized_image; - // the original pixtral model doesn't have n_merge - const int cur_merge = params.n_merge == 0 ? 1 : params.n_merge; - const clip_image_size target_size = img_tool::calc_size_preserved_ratio( - original_size, - params.patch_size * cur_merge, - params.image_min_pixels, - params.image_max_pixels); - img_tool::resize(*img, resized_image, target_size, img_tool::RESIZE_ALGO_BILINEAR); - clip_image_f32_ptr img_f32(clip_image_f32_init()); - normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std); - res_imgs->entries.push_back(std::move(img_f32)); - } break; - case PROJECTOR_TYPE_LIGHTONOCR: - { - GGML_ASSERT(params.image_longest_edge > 0); - clip_image_u8 resized_image; - const clip_image_size target_size = img_tool::calc_size_preserved_ratio( - original_size, - params.patch_size * params.n_merge, - params.image_longest_edge); - img_tool::resize(*img, resized_image, target_size, img_tool::RESIZE_ALGO_BICUBIC); - clip_image_f32_ptr img_f32(clip_image_f32_init()); - normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std); - res_imgs->entries.push_back(std::move(img_f32)); - } break; - - case PROJECTOR_TYPE_LLAMA4: - { - GGML_ASSERT(!params.image_res_candidates.empty()); - auto const inst = llava_uhd::get_slice_instructions(ctx, original_size); - std::vector imgs = llava_uhd::slice_image(img, inst); - - for (size_t i = 0; i < imgs.size(); ++i) { - clip_image_f32_ptr res(clip_image_f32_init()); - normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std); - res_imgs->entries.push_back(std::move(res)); - } - - res_imgs->grid_x = inst.grid_size.width; - res_imgs->grid_y = inst.grid_size.height; - } break; - - case PROJECTOR_TYPE_LFM2: - { - auto const inst = lfm2_vl_image_processor::get_slice_instructions(ctx, original_size); - std::vector imgs = llava_uhd::slice_image(img, inst); - - for (size_t i = 0; i < imgs.size(); ++i) { - 
clip_image_f32_ptr res(clip_image_f32_init()); - normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std); - res_imgs->entries.push_back(std::move(res)); - } - - res_imgs->grid_x = inst.grid_size.width; - res_imgs->grid_y = inst.grid_size.height; - } break; - - case PROJECTOR_TYPE_KIMIVL: - { - GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0); - const clip_image_size target_size = img_tool::calc_size_preserved_ratio( - original_size, - params.patch_size * params.n_merge, - params.image_min_pixels, - params.image_max_pixels); - const std::array pad_color = {122, 116, 104}; - - clip_image_u8 resized_img; - img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color); - clip_image_f32_ptr res(clip_image_f32_init()); - normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std); - res_imgs->entries.push_back(std::move(res)); - } break; - - case PROJECTOR_TYPE_KIMIK25: - { - GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0); - const clip_image_size target_size = img_tool::calc_size_preserved_ratio( - original_size, - params.patch_size * params.n_merge, - params.image_min_pixels, - params.image_max_pixels); - const std::array pad_color = {0, 0, 0}; - - clip_image_u8 resized_img; - img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BICUBIC, true, pad_color); - clip_image_f32_ptr res(clip_image_f32_init()); - normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std); - res_imgs->entries.push_back(std::move(res)); - } break; - - case PROJECTOR_TYPE_MLP: - case PROJECTOR_TYPE_MLP_NORM: - case PROJECTOR_TYPE_LDP: - case PROJECTOR_TYPE_LDPV2: - case PROJECTOR_TYPE_COGVLM: // TODO @ngxson : is this correct for cogvlm? 
- { - // TODO @ngxson : refactor the code below to avoid duplicated logic - - // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104) - // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156 - - clip_image_u8_ptr temp(clip_image_u8_init()); // we will keep the input image data here temporarily - - // The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing - if (params.image_res_candidates.empty()) { // pad_to_square - // for llava-1.5, we resize image to a square, and pad the shorter side with a background color - // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156 - const int longer_side = std::max(img->nx, img->ny); - temp->nx = longer_side; - temp->ny = longer_side; - temp->buf.resize(3 * longer_side * longer_side); - - // background color in RGB from LLaVA (this is the mean rgb color * 255) - const std::array pad_color = {122, 116, 104}; - - // resize the image to the target_size - img_tool::resize(*img, *temp, clip_image_size{params.image_size, params.image_size}, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color); - - clip_image_f32_ptr res(clip_image_f32_init()); - normalize_image_u8_to_f32(*temp, *res, params.image_mean, params.image_std); - res_imgs->entries.push_back(std::move(res)); - - } else { - // "spatial_unpad" with "anyres" processing for llava-1.6 - auto const inst = llava_uhd::get_slice_instructions(ctx, original_size); - std::vector imgs = llava_uhd::slice_image(img, inst); - - for (size_t i = 0; i < imgs.size(); ++i) { - // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp"); - clip_image_f32_ptr res(clip_image_f32_init()); - normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std); - res_imgs->entries.push_back(std::move(res)); - } 
- } - } break; - case PROJECTOR_TYPE_DEEPSEEKOCR: - { - const std::vector native_resolutions = { - /*512 tiny , 640 small, */ 1024 /* base */, 1280 /* large */ - }; - // original image size - const int orig_w = original_size.width; - const int orig_h = original_size.height; - const int orig_area = orig_h * orig_w; - std::array color; - - for (int i = 0; i < 3; i++) { - color[i] = static_cast(params.image_mean[i] * 255.0f); - } - - size_t mode_i = 0; - int min_diff = orig_area; - - for (size_t i = 0; i < native_resolutions.size(); i++) { - int r = native_resolutions[i]; - if (std::abs(orig_area - r * r) < min_diff) { - mode_i = i; - min_diff = std::abs(orig_area - r * r); - } - } - - /* Native Resolution (Base/Large) */ - const int image_size = native_resolutions[mode_i]; - - // Resize maintaining an aspect ratio, then pad to square - float scale = std::min( - static_cast(image_size) / orig_w, - static_cast(image_size) / orig_h - ); - int new_w = static_cast(orig_w * scale); - int new_h = static_cast(orig_h * scale); - - clip_image_u8_ptr scaled_img(clip_image_u8_init()); - img_tool::resize(*img, *scaled_img, clip_image_size{new_w, new_h}, - img_tool::RESIZE_ALGO_BICUBIC_PILLOW, true, color); - - // Use mean color for padding - unsigned char pad_r = static_cast(params.image_mean[0] * 255.0f); - unsigned char pad_g = static_cast(params.image_mean[1] * 255.0f); - unsigned char pad_b = static_cast(params.image_mean[2] * 255.0f); - - // Pad to image_size × image_size (center padding) - clip_image_u8_ptr padded_img(clip_image_u8_init()); - padded_img->nx = image_size; - padded_img->ny = image_size; - padded_img->buf.resize(image_size * image_size * 3); // black padding - - // Fill with mean color - for (int i = 0; i < image_size * image_size; ++i) - { - padded_img->buf[i * 3 + 0] = pad_r; - padded_img->buf[i * 3 + 1] = pad_g; - padded_img->buf[i * 3 + 2] = pad_b; - } - - // Calculate padding offsets (center the image) - int pad_x = (image_size - new_w) / 2; - int pad_y = 
(image_size - new_h) / 2; - - // Copy scaled image into padded canvas - for (int y = 0; y < new_h; ++y){ - for (int x = 0; x < new_w; ++x){ - int src_idx = (y * new_w + x) * 3; - int dst_idx = ((y + pad_y) * image_size + (x + pad_x)) * 3; - padded_img->buf[dst_idx + 0] = scaled_img->buf[src_idx + 0]; - padded_img->buf[dst_idx + 1] = scaled_img->buf[src_idx + 1]; - padded_img->buf[dst_idx + 2] = scaled_img->buf[src_idx + 2]; - } - } - - // Normalize and output - clip_image_f32_ptr res(clip_image_f32_init()); - normalize_image_u8_to_f32(*padded_img, *res, params.image_mean, params.image_std); - res_imgs->entries.push_back(std::move(res)); - - res_imgs->grid_x = 1; - res_imgs->grid_y = 1; - } break; - - default: - LOG_ERR("%s: unsupported projector type %d\n", __func__, ctx->proj_type()); - return false; - } - - return true; -} - ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) { return ctx->model.image_newline; } diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h index 71b58484d6..a859b38658 100644 --- a/tools/mtmd/clip.h +++ b/tools/mtmd/clip.h @@ -97,9 +97,6 @@ struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch */ void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img); -/** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */ -bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs ); - struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx); bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec); diff --git a/tools/mtmd/mtmd-image.cpp b/tools/mtmd/mtmd-image.cpp new file mode 100644 index 0000000000..b446437192 --- /dev/null +++ b/tools/mtmd/mtmd-image.cpp @@ -0,0 +1,1166 @@ +#include "mtmd-image.h" + +#include +#include +#include + +// +// base implementation 
+// + +void mtmd_image_preprocessor::img_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) { + dst.nx = src.nx; + dst.ny = src.ny; + dst.buf.resize(src.buf.size()); + + // TODO @ngxson : seems like this could be done more efficiently on cgraph + for (size_t i = 0; i < src.buf.size(); ++i) { + int c = i % 3; // rgb + dst.buf[i] = (static_cast(src.buf[i]) / 255.0f - mean[c]) / std[c]; + } +} + +void mtmd_image_preprocessor::img_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst) { + dst.nx = src.nx; + dst.ny = src.ny; + dst.buf.resize(src.buf.size()); + + for (size_t i = 0; i < src.buf.size(); ++i) { + dst.buf[i] = static_cast(src.buf[i]); + } +} + +// set of tools to manipulate images +// in the future, we can have HW acceleration by allowing this struct to access 3rd party lib like imagick or opencv +struct img_tool { + static void resize( + const clip_image_u8 & src, + clip_image_u8 & dst, + const clip_image_size & target_resolution, + resize_algo algo, + bool add_padding = true, // TODO: define the behavior for add_padding = false + std::array pad_color = {0, 0, 0}) { + dst.nx = target_resolution.width; + dst.ny = target_resolution.height; + dst.buf.resize(3 * dst.nx * dst.ny); + + if (dst.nx == src.nx && dst.ny == src.ny) { + // no resize needed, simple copy + dst.buf = src.buf; + return; + } + + if (!add_padding) { + // direct resize + switch (algo) { + case RESIZE_ALGO_BILINEAR: + resize_bilinear(src, dst, target_resolution.width, target_resolution.height); + break; + case RESIZE_ALGO_BICUBIC: + resize_bicubic(src, dst, target_resolution.width, target_resolution.height); + break; + case RESIZE_ALGO_BICUBIC_PILLOW: + resize_bicubic_pillow(src, dst, target_resolution.width, target_resolution.height); + break; + default: + throw std::runtime_error("Unsupported resize algorithm"); + } + } else { + // resize with padding + clip_image_u8 resized_image; + float scale_w = static_cast(target_resolution.width) / 
src.nx; + float scale_h = static_cast(target_resolution.height) / src.ny; + float scale = std::min(scale_w, scale_h); + int new_width = std::min(static_cast(std::ceil(src.nx * scale)), target_resolution.width); + int new_height = std::min(static_cast(std::ceil(src.ny * scale)), target_resolution.height); + + switch (algo) { + case RESIZE_ALGO_BILINEAR: + resize_bilinear(src, resized_image, new_width, new_height); + break; + case RESIZE_ALGO_BICUBIC: + resize_bicubic(src, resized_image, new_width, new_height); + break; + case RESIZE_ALGO_BICUBIC_PILLOW: + resize_bicubic_pillow(src, resized_image, new_width, new_height); + break; + default: + throw std::runtime_error("Unsupported resize algorithm"); + } + + // fill dst with pad_color + fill(dst, pad_color); + + int offset_x = (target_resolution.width - new_width) / 2; + int offset_y = (target_resolution.height - new_height) / 2; + + composite(dst, resized_image, offset_x, offset_y); + } + } + + static void crop(const clip_image_u8 & image, clip_image_u8 & dst, int x, int y, int w, int h) { + dst.nx = w; + dst.ny = h; + dst.buf.resize(3 * w * h); + + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int src_idx = 3 * ((y + i)*image.nx + (x + j)); + int dst_idx = 3 * (i*w + j); + dst.buf[dst_idx] = image.buf[src_idx]; + dst.buf[dst_idx + 1] = image.buf[src_idx + 1]; + dst.buf[dst_idx + 2] = image.buf[src_idx + 2]; + } + } + } + + // calculate the size of the **resized** image, while preserving the aspect ratio + // the calculated size will be aligned to the nearest multiple of align_size + // if H or W size is larger than longest_edge, it will be resized to longest_edge + static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int longest_edge) { + GGML_ASSERT(align_size > 0); + if (inp_size.width <= 0 || inp_size.height <= 0 || longest_edge <= 0) { + return {0, 0}; + } + + float scale = std::min(static_cast(longest_edge) / inp_size.width, + 
static_cast(longest_edge) / inp_size.height); + + float target_width_f = static_cast(inp_size.width) * scale; + float target_height_f = static_cast(inp_size.height) * scale; + + auto ceil_by_factor = [f = align_size](float x) { return static_cast(std::ceil(x / static_cast(f))) * f; }; + int aligned_width = ceil_by_factor(target_width_f); + int aligned_height = ceil_by_factor(target_height_f); + + return {aligned_width, aligned_height}; + } + + // calculate the size of the **resized** image, while preserving the aspect ratio + // the calculated size will have min_pixels <= W*H <= max_pixels + // this is referred as "smart_resize" in transformers code + static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int min_pixels, const int max_pixels) { + GGML_ASSERT(align_size > 0); + const int width = inp_size.width; + const int height = inp_size.height; + + auto round_by_factor = [f = align_size](float x) { return static_cast(std::round(x / static_cast(f))) * f; }; + auto ceil_by_factor = [f = align_size](float x) { return static_cast(std::ceil(x / static_cast(f))) * f; }; + auto floor_by_factor = [f = align_size](float x) { return static_cast(std::floor(x / static_cast(f))) * f; }; + + // always align up first + int h_bar = std::max(align_size, round_by_factor(height)); + int w_bar = std::max(align_size, round_by_factor(width)); + + if (h_bar * w_bar > max_pixels) { + const auto beta = std::sqrt(static_cast(height * width) / max_pixels); + h_bar = std::max(align_size, floor_by_factor(height / beta)); + w_bar = std::max(align_size, floor_by_factor(width / beta)); + } else if (h_bar * w_bar < min_pixels) { + const auto beta = std::sqrt(static_cast(min_pixels) / (height * width)); + h_bar = ceil_by_factor(height * beta); + w_bar = ceil_by_factor(width * beta); + } + + return {w_bar, h_bar}; + } + + // draw src image into dst image at offset (offset_x, offset_y) + static void composite(clip_image_u8 & dst, const 
clip_image_u8 & src, int offset_x, int offset_y) { + for (int y = 0; y < src.ny; ++y) { + for (int x = 0; x < src.nx; ++x) { + int dx = x + offset_x; + int dy = y + offset_y; + // skip pixels that would be out of bounds in the destination + if (dx < 0 || dy < 0 || dx >= dst.nx || dy >= dst.ny) { + continue; + } + size_t dst_idx = 3 * (static_cast(dy) * dst.nx + static_cast(dx)); + size_t src_idx = 3 * (static_cast(y) * src.nx + static_cast(x)); + dst.buf[dst_idx + 0] = src.buf[src_idx + 0]; + dst.buf[dst_idx + 1] = src.buf[src_idx + 1]; + dst.buf[dst_idx + 2] = src.buf[src_idx + 2]; + } + } + } + + // fill the image with a solid color + static void fill(clip_image_u8 & img, const std::array & color) { + for (size_t i = 0; i < img.buf.size(); i += 3) { + img.buf[i] = color[0]; + img.buf[i + 1] = color[1]; + img.buf[i + 2] = color[2]; + } + } + +private: + // Bilinear resize function + static void resize_bilinear(const clip_image_u8 & src, clip_image_u8 & dst, int target_width, int target_height) { + dst.nx = target_width; + dst.ny = target_height; + dst.buf.resize(3 * target_width * target_height); + + float x_ratio = static_cast(src.nx - 1) / target_width; + float y_ratio = static_cast(src.ny - 1) / target_height; + + for (int y = 0; y < target_height; y++) { + for (int x = 0; x < target_width; x++) { + float px = x_ratio * x; + float py = y_ratio * y; + int x_floor = static_cast(px); + int y_floor = static_cast(py); + float x_lerp = px - x_floor; + float y_lerp = py - y_floor; + + for (int c = 0; c < 3; c++) { + float top = lerp( + static_cast(src.buf[3 * (y_floor * src.nx + x_floor) + c]), + static_cast(src.buf[3 * (y_floor * src.nx + (x_floor + 1)) + c]), + x_lerp + ); + float bottom = lerp( + static_cast(src.buf[3 * ((y_floor + 1) * src.nx + x_floor) + c]), + static_cast(src.buf[3 * ((y_floor + 1) * src.nx + (x_floor + 1)) + c]), + x_lerp + ); + dst.buf[3 * (y * target_width + x) + c] = static_cast(lerp(top, bottom, y_lerp)); + } + } + } + } + + // Bicubic 
resize function + // part of image will be cropped if the aspect ratio is different + static bool resize_bicubic(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) { + const int nx = img.nx; + const int ny = img.ny; + + dst.nx = target_width; + dst.ny = target_height; + dst.buf.resize(3 * target_width * target_height); + + float Cc; + float C[5] = {}; + float d0, d2, d3, a0, a1, a2, a3; + int i, j, k, jj; + int x, y; + float dx, dy; + float tx, ty; + + tx = (float)nx / (float)target_width; + ty = (float)ny / (float)target_height; + + // Bicubic interpolation; adapted from ViT.cpp, inspired from : + // -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36 + // -> https://en.wikipedia.org/wiki/Bicubic_interpolation + + for (i = 0; i < target_height; i++) { + for (j = 0; j < target_width; j++) { + x = (int)(tx * j); + y = (int)(ty * i); + + dx = tx * j - x; + dy = ty * i - y; + + for (k = 0; k < 3; k++) { + for (jj = 0; jj <= 3; jj++) { + d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + + a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3; + a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2; + a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3; + + C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx; + + d0 = C[0] - C[1]; + d2 = C[2] - C[1]; + d3 = C[3] - C[1]; + a0 = C[1]; + a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3; + a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2; + a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3; + Cc = a0 + a1 
* dy + a2 * dy * dy + a3 * dy * dy * dy; + + const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f); + dst.buf[(i * target_width + j) * 3 + k] = float(Cc2); + } + } + } + } + + return true; + } + + // Bicubic resize function using Pillow's ImagingResample algorithm + // Adapted from https://github.com/python-pillow/Pillow/blob/main/src/libImaging/Resample.c + // + // Key Difference with resize_bicubic: + // 1. Uses separable filtering: horizontal pass followed by vertical pass + // 2. Pre-computes normalized filter coefficients for each output pixel + // 3. Applies convolution using fixed-point integer arithmetic for performance + static bool resize_bicubic_pillow(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) { + // Fixed-point precision: 22 bits = 32 (int32_t) - 8 (uint8_t pixels) - 2 (headroom for accumulation) + // This allows encoding fractional weights as integers: weight * 2^22 + const int PRECISION_BITS = 32 - 8 - 2; + + // Bicubic filter function with a = -0.5 (Note that GGML/PyTorch takes a = -0.75) + // Returns filter weight for distance x from pixel center + // Support: [-2, 2], meaning the filter influences pixels within 2 units of distance + auto bicubic_filter = [](double x) -> double { + constexpr double a = -0.5; + if (x < 0.0) { + x = -x; + } + if (x < 1.0) { + return ((a + 2.0) * x - (a + 3.0)) * x * x + 1; + } + if (x < 2.0) { + return (((x - 5) * x + 8) * x - 4) * a; + } + return 0.0; // Zero outside [-2, 2] + }; + + // Filter support radius: bicubic extends 2 pixels in each direction + constexpr double filter_support = 2.0; + + // Clipping function for 8-bit values + auto clip8 = [](int val) -> uint8_t { + if (val < 0) return 0; + if (val > 255) return 255; + return static_cast(val); + }; + + // Precompute filter coefficients for ONE dimension (horizontal or vertical) + // + // Parameters: + // inSize - Number of pixels in input dimension (e.g., src_width or src_height) + // outSize - Number 
of pixels in output dimension (e.g., target_width or target_height) + // bounds - [OUTPUT] Array of size outSize*2 storing input pixel ranges: + // bounds[xx*2+0] = first input pixel index for output pixel xx (xmin) + // bounds[xx*2+1] = number of input pixels for output pixel xx (xcnt) + // weights - [OUTPUT] Array of size outSize*ksize storing fixed-point filter weights: + // kk[xx*ksize + x] = weight for input pixel x contributing to output pixel xx + // + // Returns: kernel size (ksize) - number of input pixels that contribute to each output pixel + auto precompute_weights = [&](int inSize, int outSize, + std::vector & bounds, std::vector & weights) -> int { + double support, scale, filterscale; + double center, ww, ss; + int xx, x, ksize, xmin, xmax, xcnt; + + // Calculate scaling factor: ratio of input range to output size + filterscale = scale = (double)inSize / outSize; + // For upsampling (scale < 1), keep filterscale = 1 to maintain filter sharpness + // For downsampling (scale > 1), widen filter to prevent aliasing + if (filterscale < 1.0) { + filterscale = 1.0; + } + + // Determine filter support radius and kernel size + support = filter_support * filterscale; // Widen filter when downsampling + ksize = static_cast(std::ceil(support)) * 2 + 1; // Total pixels in kernel + + std::vector pre_weights(outSize * ksize); // Temporary weights + bounds.resize(outSize * 2); + + // For each output pixel, compute its filter coefficients + for (xx = 0; xx < outSize; xx++) { + // Calculate the center position in input space (pixel-center convention: +0.5) + center = (xx + 0.5) * scale; + ww = 0.0; // Sum of weights for normalization + ss = 1.0 / filterscale; // Scale factor for filter function + + // Determine the range of input pixels that contribute to this output pixel + xmin = static_cast(center - support + 0.5); + if (xmin < 0) { + xmin = 0; + } + + xmax = static_cast(center + support + 0.5); + if (xmax > inSize) { + xmax = inSize; + } + + xcnt = xmax - xmin; + 
+ // Compute filter weights for each contributing input pixel + for (x = 0; x < xcnt; x++) { + // Distance from input pixel center to output pixel center in input space + double w = bicubic_filter((x + xmin - center + 0.5) * ss); + pre_weights[xx * ksize + x] = w; + ww += w; // Accumulate for normalization + } + + // Normalize weights to sum to 1.0 (preserves brightness) + for (x = 0; x < xcnt; x++) { + if (ww != 0.0) { + pre_weights[xx * ksize + x] /= ww; + } + } + + // Zero-pad remaining kernel positions + for (; x < ksize; x++) { + pre_weights[xx * ksize + x] = 0; + } + + // Store input pixel range for this output pixel + bounds[xx * 2 + 0] = xmin; + bounds[xx * 2 + 1] = xcnt; + } + + // Convert floating-point coefficients to fixed-point integers + // Formula: int32 = round(float * 2^PRECISION_BITS) + weights.resize(outSize * ksize); + for (int i = 0; i < outSize * ksize; i++) { + if (pre_weights[i] < 0) { + weights[i] = static_cast(-0.5 + pre_weights[i] * (1 << PRECISION_BITS)); + } else { + weights[i] = static_cast(0.5 + pre_weights[i] * (1 << PRECISION_BITS)); + } + } + + return ksize; + }; + + // Horizontal resampling pass + // Resizes width from imIn.nx to imOut.nx, preserving height + auto resample_horizontal = [&](const clip_image_u8 & imIn, clip_image_u8 & imOut, + int ksize, const std::vector & bounds, const std::vector & weights) { + imOut.ny = imIn.ny; + imOut.buf.resize(3 * imOut.nx * imOut.ny); + + // Process each row independently + for (int yy = 0; yy < imOut.ny; yy++) { + // For each output pixel in this row + for (int xx = 0; xx < imOut.nx; xx++) { + // Get the range of input pixels and filter coefficients + int xmin = bounds[xx * 2 + 0]; // First input pixel index + int xcnt = bounds[xx * 2 + 1]; // Number of input pixels + + // Initialize accumulators for RGB channels with rounding bias (0.5 in fixed-point) + int32_t ss0 = 1 << (PRECISION_BITS - 1); + int32_t ss1 = 1 << (PRECISION_BITS - 1); + int32_t ss2 = 1 << (PRECISION_BITS - 1); + + // 
Convolve: sum weighted input pixels + for (int x = 0; x < xcnt; x++) { + int src_idx = ((yy * imIn.nx) + (x + xmin)) * 3; + ss0 += static_cast(imIn.buf[src_idx + 0]) * weights[xx * ksize + x]; // R channel + ss1 += static_cast(imIn.buf[src_idx + 1]) * weights[xx * ksize + x]; // G channel + ss2 += static_cast(imIn.buf[src_idx + 2]) * weights[xx * ksize + x]; // B channel + } + + // Convert back from fixed-point (divide by 2^PRECISION_BITS) and clamp to [0,255] + int dst_idx = (yy * imOut.nx + xx) * 3; + imOut.buf[dst_idx + 0] = clip8(ss0 >> PRECISION_BITS); + imOut.buf[dst_idx + 1] = clip8(ss1 >> PRECISION_BITS); + imOut.buf[dst_idx + 2] = clip8(ss2 >> PRECISION_BITS); + } + } + }; + + // Vertical resampling pass + // Resizes height from imIn.ny to imOut.ny, preserving width + auto resample_vertical = [&](const clip_image_u8 & imIn, clip_image_u8 & imOut, + int ksize, const std::vector & bounds, const std::vector & weight) { + imOut.nx = imIn.nx; + imOut.buf.resize(3 * imOut.nx * imOut.ny); + + // For each output row + for (int yy = 0; yy < imOut.ny; yy++) { + // Get the range of input rows and filter coefficients + int ymin = bounds[yy * 2 + 0]; // First input row index + int ycnt = bounds[yy * 2 + 1]; // Number of input rows + + // Process each column in this output row + for (int xx = 0; xx < imOut.nx; xx++) { + // Initialize accumulators for RGB channels with rounding bias + int32_t ss0 = 1 << (PRECISION_BITS - 1); + int32_t ss1 = 1 << (PRECISION_BITS - 1); + int32_t ss2 = 1 << (PRECISION_BITS - 1); + + // Convolve: sum weighted input pixels vertically + for (int y = 0; y < ycnt; y++) { + int src_idx = ((y + ymin) * imIn.nx + xx) * 3; + ss0 += static_cast(imIn.buf[src_idx + 0]) * weight[yy * ksize + y]; // R channel + ss1 += static_cast(imIn.buf[src_idx + 1]) * weight[yy * ksize + y]; // G channel + ss2 += static_cast(imIn.buf[src_idx + 2]) * weight[yy * ksize + y]; // B channel + } + + // Convert back from fixed-point and clamp to [0,255] + int dst_idx = (yy * 
imOut.nx + xx) * 3; + imOut.buf[dst_idx + 0] = clip8(ss0 >> PRECISION_BITS); + imOut.buf[dst_idx + 1] = clip8(ss1 >> PRECISION_BITS); + imOut.buf[dst_idx + 2] = clip8(ss2 >> PRECISION_BITS); + } + } + }; + + // Main resampling logic using separable two-pass approach + const int src_width = img.nx; + const int src_height = img.ny; + + dst.nx = target_width; + dst.ny = target_height; + + bool need_horizontal = (target_width != src_width); + bool need_vertical = (target_height != src_height); + + // Precompute filter coefficients for both dimensions + std::vector bounds_horiz, bounds_vert; + std::vector weights_horiz, weights_vert; + int ksize_horiz = 0, ksize_vert = 0; + + if (need_horizontal) { + ksize_horiz = precompute_weights(src_width, target_width, bounds_horiz, weights_horiz); + } + + if (need_vertical) { + ksize_vert = precompute_weights(src_height, target_height, bounds_vert, weights_vert); + } + + // Perform two-pass resampling + if (need_horizontal && need_vertical) { + // Both horizontal and vertical + clip_image_u8 temp; + temp.nx = target_width; + resample_horizontal(img, temp, ksize_horiz, bounds_horiz, weights_horiz); + resample_vertical(temp, dst, ksize_vert, bounds_vert, weights_vert); + } else if (need_horizontal) { + // Only horizontal + resample_horizontal(img, dst, ksize_horiz, bounds_horiz, weights_horiz); + } else if (need_vertical) { + // Only vertical + resample_vertical(img, dst, ksize_vert, bounds_vert, weights_vert); + } else { + // No resizing needed - direct copy + dst.buf = img.buf; + } + + return true; + } + + static inline int clip(int x, int lower, int upper) { + return std::max(lower, std::min(x, upper)); + } + + // Linear interpolation between two points + static inline float lerp(float s, float e, float t) { + return s + (e - s) * t; + } +}; + + +// +// mtmd_image_preprocessor_llava_uhd +// + +bool mtmd_image_preprocessor_llava_uhd::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) { + const clip_image_size 
original_size{img.nx, img.ny}; + auto const inst = get_slice_instructions(original_size); + std::vector imgs = slice_image(img, inst); + + for (size_t i = 0; i < imgs.size(); ++i) { + // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp"); + clip_image_f32_ptr res(clip_image_f32_init()); + img_u8_to_f32(*imgs[i], *res, hparams.image_mean, hparams.image_std); + output.entries.push_back(std::move(res)); + } + + output.grid_x = inst.grid_size.width; + output.grid_y = inst.grid_size.height; + return true; +} + +mtmd_image_preprocessor_llava_uhd::slice_instructions mtmd_image_preprocessor_llava_uhd::get_slice_instructions(const clip_image_size & original_size) { + mtmd_image_preprocessor_llava_uhd::slice_instructions res; + const int patch_size = hparams.patch_size; + const int slice_size = hparams.image_size; + const int original_width = original_size.width; + const int original_height = original_size.height; + + const bool has_slices = original_size.width > slice_size || original_size.height > slice_size; + const bool has_pinpoints = !hparams.image_res_candidates.empty(); + + if (!has_slices) { + // skip slicing logic + res.overview_size = clip_image_size{slice_size, slice_size}; + res.refined_size = clip_image_size{0, 0}; + res.grid_size = clip_image_size{0, 0}; + + return res; + } + + if (has_pinpoints) { + // has pinpoints, use them to calculate the grid size (e.g. 
llava-1.6) + auto refine_size = select_best_resolution( + original_size, + hparams.image_res_candidates); + res.overview_size = clip_image_size{slice_size, slice_size}; + res.refined_size = refine_size; + res.grid_size = clip_image_size{0, 0}; + + LOG_DBG("%s: using pinpoints for slicing\n", __func__); + LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d\n", + __func__, original_width, original_height, + res.overview_size.width, res.overview_size.height, + res.refined_size.width, res.refined_size.height); + + for (int y = 0; y < refine_size.height; y += slice_size) { + for (int x = 0; x < refine_size.width; x += slice_size) { + slice_coordinates slice; + slice.x = x; + slice.y = y; + slice.size.width = std::min(slice_size, refine_size.width - x); + slice.size.height = std::min(slice_size, refine_size.height - y); + res.slices.push_back(slice); + LOG_DBG("%s: slice %d: x=%d, y=%d, size=%dx%d\n", + __func__, (int)res.slices.size() - 1, + slice.x, slice.y, slice.size.width, slice.size.height); + } + } + + res.grid_size.height = refine_size.height / slice_size; + res.grid_size.width = refine_size.width / slice_size; + LOG_DBG("%s: grid size: %d x %d\n", __func__, res.grid_size.width, res.grid_size.height); + + return res; + } + + // no pinpoints, dynamically calculate the grid size (e.g. 
minicpmv) + + auto best_size = get_best_resize(original_size, slice_size, patch_size, !has_slices); + res.overview_size = best_size; + + { + const int max_slice_nums = 9; // TODO: this is only used by minicpmv, maybe remove it + const float log_ratio = log((float)original_width / original_height); + const float ratio = (float)original_width * original_height / (slice_size * slice_size); + const int multiple = fmin(ceil(ratio), max_slice_nums); + + auto best_grid = get_best_grid(max_slice_nums, multiple, log_ratio); + auto refine_size = get_refine_size(original_size, best_grid, slice_size, patch_size, true); + res.grid_size = best_grid; + res.refined_size = refine_size; + + LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n", + __func__, original_width, original_height, + res.overview_size.width, res.overview_size.height, + res.refined_size.width, res.refined_size.height, + res.grid_size.width, res.grid_size.height); + + int width = refine_size.width; + int height = refine_size.height; + int grid_x = int(width / best_grid.width); + int grid_y = int(height / best_grid.height); + for (int patches_y = 0, ic = 0; + patches_y < refine_size.height && ic < best_grid.height; + patches_y += grid_y, ic += 1) { + for (int patches_x = 0, jc = 0; + patches_x < refine_size.width && jc < best_grid.width; + patches_x += grid_x, jc += 1) { + slice_coordinates slice; + slice.x = patches_x; + slice.y = patches_y; + slice.size.width = grid_x; + slice.size.height = grid_y; + res.slices.push_back(slice); + LOG_DBG("%s: slice %d: x=%d, y=%d, size=%dx%d\n", + __func__, (int)res.slices.size() - 1, + slice.x, slice.y, slice.size.width, slice.size.height); + } + } + } + + return res; +} + +std::vector mtmd_image_preprocessor_llava_uhd::slice_image(const clip_image_u8 & img, const mtmd_image_preprocessor_llava_uhd::slice_instructions & inst, bool overview_first) { + std::vector output; + + // resize to overview size + clip_image_u8_ptr 
resized_img(clip_image_u8_init()); + img_tool::resize(img, *resized_img, inst.overview_size, hparams.image_resize_algo_ov, + hparams.image_pad_ov, hparams.image_pad_color_ov); + if (overview_first) { + output.push_back(std::move(resized_img)); + } + + if (inst.slices.empty()) { + // no slices, just return the resized image + if (!overview_first) { + output.push_back(std::move(resized_img)); + } + return output; + } + + // resize to refined size + clip_image_u8_ptr refined_img(clip_image_u8_init()); + img_tool::resize(img, *refined_img, inst.refined_size, hparams.image_resize_algo_rf, + hparams.image_pad_rf, hparams.image_pad_color_rf); + + // create slices + for (const auto & slice : inst.slices) { + int x = slice.x; + int y = slice.y; + int w = slice.size.width; + int h = slice.size.height; + + clip_image_u8_ptr img_slice(clip_image_u8_init()); + img_tool::crop(*refined_img, *img_slice, x, y, w, h); + output.push_back(std::move(img_slice)); + } + + if (!overview_first) { + output.push_back(std::move(resized_img)); + } + + return output; +} + +clip_image_size mtmd_image_preprocessor_llava_uhd::get_best_resize(const clip_image_size & original_size, int scale_resolution, int patch_size, bool allow_upscale) { + int width = original_size.width; + int height = original_size.height; + if ((width * height > scale_resolution * scale_resolution) || allow_upscale) { + float r = static_cast(width) / height; + height = static_cast(scale_resolution / std::sqrt(r)); + width = static_cast(height * r); + } + clip_image_size res; + res.width = ensure_divide(width, patch_size); + res.height = ensure_divide(height, patch_size); + return res; +} + +clip_image_size mtmd_image_preprocessor_llava_uhd::resize_maintain_aspect_ratio(const clip_image_size & orig, const clip_image_size & target_max) { + float scale_width = static_cast(target_max.width) / orig.width; + float scale_height = static_cast(target_max.height) / orig.height; + float scale = std::min(scale_width, scale_height); + 
return clip_image_size{ + static_cast(orig.width * scale), + static_cast(orig.height * scale), + }; +} + +clip_image_size mtmd_image_preprocessor_llava_uhd::select_best_resolution(const clip_image_size & original_size, const std::vector & possible_resolutions) { + clip_image_size best_fit; + int min_wasted_area = std::numeric_limits::max(); + int max_effective_resolution = 0; + + for (const clip_image_size & candidate : possible_resolutions) { + auto target_size = resize_maintain_aspect_ratio(original_size, candidate); + int effective_resolution = std::min( + target_size.width * target_size.height, + original_size.width * original_size.height); + int wasted_area = (candidate.width * candidate.height) - effective_resolution; + + if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_area < min_wasted_area)) { + max_effective_resolution = effective_resolution; + min_wasted_area = wasted_area; + best_fit = candidate; + } + + LOG_DBG("%s: candidate: %d x %d, target: %d x %d, wasted: %d, effective: %d\n", __func__, candidate.width, candidate.height, target_size.width, target_size.height, wasted_area, effective_resolution); + } + + return best_fit; +} + +int mtmd_image_preprocessor_llava_uhd::ensure_divide(int length, int patch_size) { + return std::max(static_cast(std::round(static_cast(length) / patch_size) * patch_size), patch_size); +} + +clip_image_size mtmd_image_preprocessor_llava_uhd::get_refine_size(const clip_image_size & original_size, const clip_image_size & grid, int scale_resolution, int patch_size, bool allow_upscale) { + int width = original_size.width; + int height = original_size.height; + int grid_x = grid.width; + int grid_y = grid.height; + + int refine_width = ensure_divide(width, grid_x); + int refine_height = ensure_divide(height, grid_y); + + clip_image_size grid_size; + grid_size.width = refine_width / grid_x; + grid_size.height = refine_height / grid_y; + + auto best_grid_size = 
get_best_resize(grid_size, scale_resolution, patch_size, allow_upscale); + int best_grid_width = best_grid_size.width; + int best_grid_height = best_grid_size.height; + + clip_image_size refine_size; + refine_size.width = best_grid_width * grid_x; + refine_size.height = best_grid_height * grid_y; + return refine_size; +} + +clip_image_size mtmd_image_preprocessor_llava_uhd::get_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) { + std::vector candidate_split_grids_nums; + for (int i : {multiple - 1, multiple, multiple + 1}) { + if (i == 1 || i > max_slice_nums) { + continue; + } + candidate_split_grids_nums.push_back(i); + } + + std::vector candidate_grids; + for (int split_grids_nums : candidate_split_grids_nums) { + int m = 1; + while (m <= split_grids_nums) { + if (split_grids_nums % m == 0) { + candidate_grids.push_back(clip_image_size{m, split_grids_nums / m}); + } + ++m; + } + } + + clip_image_size best_grid{1, 1}; + float min_error = std::numeric_limits::infinity(); + for (const auto& grid : candidate_grids) { + float error = std::abs(log_ratio - std::log(1.0 * grid.width / grid.height)); + if (error < min_error) { + best_grid = grid; + min_error = error; + } + } + return best_grid; +} + +// +// mtmd_image_preprocessor_fixed_size +// + +bool mtmd_image_preprocessor_fixed_size::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) { + clip_image_u8 resized_image; + int sz = hparams.image_size; + img_tool::resize(img, resized_image, {sz, sz}, + hparams.image_resize_algo, + hparams.image_resize_pad, + hparams.image_pad_color); + clip_image_f32_ptr img_f32(clip_image_f32_init()); + img_u8_to_f32(resized_image, *img_f32, hparams.image_mean, hparams.image_std); + output.entries.push_back(std::move(img_f32)); + return true; +} + +// +// mtmd_image_preprocessor_dyn_size +// + +bool mtmd_image_preprocessor_dyn_size::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) { + GGML_ASSERT(hparams.image_min_pixels 
> 0 && hparams.image_max_pixels > 0); + clip_image_u8 resized_image; + const clip_image_size original_size{img.nx, img.ny}; + // the original pixtral model doesn't have n_merge + const int cur_merge = hparams.n_merge == 0 ? 1 : hparams.n_merge; + const clip_image_size target_size = img_tool::calc_size_preserved_ratio( + original_size, + hparams.patch_size * cur_merge, + hparams.image_min_pixels, + hparams.image_max_pixels); + img_tool::resize(img, resized_image, target_size, + hparams.image_resize_algo, + hparams.image_resize_pad, + hparams.image_pad_color); + clip_image_f32_ptr img_f32(clip_image_f32_init()); + img_u8_to_f32(resized_image, *img_f32, hparams.image_mean, hparams.image_std); + output.entries.push_back(std::move(img_f32)); + return true; +} + +// +// mtmd_image_preprocessor_longest_edge +// + +bool mtmd_image_preprocessor_longest_edge::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) { + GGML_ASSERT(hparams.image_longest_edge > 0); + clip_image_u8 resized_image; + const clip_image_size original_size{img.nx, img.ny}; + // the original pixtral model doesn't have n_merge + const int cur_merge = hparams.n_merge == 0 ? 
1 : hparams.n_merge; + const clip_image_size target_size = img_tool::calc_size_preserved_ratio( + original_size, + hparams.patch_size * cur_merge, + hparams.image_longest_edge); + img_tool::resize(img, resized_image, target_size, + hparams.image_resize_algo, + hparams.image_resize_pad, + hparams.image_pad_color); + clip_image_f32_ptr img_f32(clip_image_f32_init()); + img_u8_to_f32(resized_image, *img_f32, hparams.image_mean, hparams.image_std); + output.entries.push_back(std::move(img_f32)); + return true; +} + +// +// mtmd_image_preprocessor_lfm2 +// + +mtmd_image_preprocessor_llava_uhd::slice_instructions mtmd_image_preprocessor_lfm2::get_slice_instructions(const clip_image_size & original_size) { + mtmd_image_preprocessor_llava_uhd::slice_instructions inst; + const int align_size = hparams.patch_size * hparams.n_merge; + inst.overview_size = img_tool::calc_size_preserved_ratio( + original_size, align_size, + hparams.image_min_pixels, hparams.image_max_pixels); + // tile if either dimension exceeds tile_size with tolerance + const bool needs_tiling = original_size.width > tile_size * max_pixels_tolerance || original_size.height > tile_size * max_pixels_tolerance; + + if (!needs_tiling) { + inst.refined_size = clip_image_size{0, 0}; + inst.grid_size = clip_image_size{0, 0}; + return inst; + } + + const clip_image_size grid = get_grid_layout(original_size.height, original_size.width); + + inst.grid_size = grid; + inst.refined_size = clip_image_size{tile_size * grid.width, tile_size * grid.height}; + + LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n", + __func__, + original_size.width, original_size.height, + inst.overview_size.width, inst.overview_size.height, + inst.refined_size.width, inst.refined_size.height, + grid.width, grid.height); + + for (int row = 0; row < grid.height; row++) { + for (int col = 0; col < grid.width; col++) { + mtmd_image_preprocessor_llava_uhd::slice_coordinates slice; + slice.x = 
col * tile_size; + slice.y = row * tile_size; + slice.size = clip_image_size{tile_size, tile_size}; + inst.slices.push_back(slice); + LOG_DBG("%s: slice %d: x=%d, y=%d, size=%d x %d\n", + __func__, (int)inst.slices.size() - 1, + slice.x, slice.y, slice.size.width, slice.size.height); + } + } + + return inst; +} + +clip_image_size mtmd_image_preprocessor_lfm2::find_closest_aspect_ratio( + float aspect_ratio, + const std::vector & target_ratios, + int width, int height) { + float best_ratio_diff = std::numeric_limits::max(); + clip_image_size best_ratio = {1, 1}; + const float area = static_cast(width * height); + + for (const auto & ratio : target_ratios) { + const float target_aspect_ratio = static_cast(ratio.width) / ratio.height; + const float ratio_diff = std::abs(aspect_ratio - target_aspect_ratio); + if (ratio_diff < best_ratio_diff) { + best_ratio_diff = ratio_diff; + best_ratio = ratio; + } else if (ratio_diff == best_ratio_diff) { + const float target_area = static_cast(tile_size * tile_size * ratio.width * ratio.height); + if (area > 0.5f * target_area) { + best_ratio = ratio; + } + } + } + return best_ratio; +} + +std::vector mtmd_image_preprocessor_lfm2::get_target_ratios() { + std::vector ratios; + for (int n = min_tiles; n <= max_tiles; n++) { + for (int w = 1; w <= n; w++) { + for (int h = 1; h <= n; h++) { + if (w * h >= min_tiles && w * h <= max_tiles) { + bool found = false; + for (const auto & r : ratios) { + if (r.width == w && r.height == h) { + found = true; + break; + } + } + if (!found) { + ratios.push_back({w, h}); + } + } + } + } + } + std::sort(ratios.begin(), ratios.end(), [](const clip_image_size & a, const clip_image_size & b) { + return a.width * a.height < b.width * b.height; + }); + return ratios; +} + +clip_image_size mtmd_image_preprocessor_lfm2::get_grid_layout(int height, int width) { + const float aspect_ratio = static_cast(width) / height; + const auto ratios = get_target_ratios(); + return 
find_closest_aspect_ratio(aspect_ratio, ratios, width, height); +} + +// +// mtmd_image_preprocessor_idefics3 +// + +bool mtmd_image_preprocessor_idefics3::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) { + // The refined size has two steps: + // 1. Resize w/ aspect-ratio preserving such that the longer side is + // the preprocessor longest size + // 2. Resize w/out preserving aspect ratio such that both sides are + // multiples of image_size (always rounding up) + // + // CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/image_processing_idefics3.py#L737 + const clip_image_size original_size{img.nx, img.ny}; + const clip_image_size refined_size = img_tool::calc_size_preserved_ratio( + original_size, hparams.image_size, hparams.image_longest_edge); + // LOG_INF("%s: original size: %d x %d, refined size: %d x %d\n", + // __func__, original_size.width, original_size.height, + // refined_size.width, refined_size.height); + + mtmd_image_preprocessor_llava_uhd::slice_instructions instructions; + instructions.overview_size = clip_image_size{hparams.image_size, hparams.image_size}; + instructions.refined_size = refined_size; + instructions.grid_size = clip_image_size{ + static_cast(std::ceil(static_cast(refined_size.width) / hparams.image_size)), + static_cast(std::ceil(static_cast(refined_size.height) / hparams.image_size)), + }; + for (int y = 0; y < refined_size.height; y += hparams.image_size) { + for (int x = 0; x < refined_size.width; x += hparams.image_size) { + // LOG_INF("%s: adding slice at x=%d, y=%d\n", __func__, x, y); + instructions.slices.push_back(mtmd_image_preprocessor_llava_uhd::slice_coordinates{ + /* x */x, + /* y */y, + /* size */clip_image_size{ + std::min(hparams.image_size, refined_size.width - x), + std::min(hparams.image_size, refined_size.height - y) + } + }); + } + } + auto imgs = slice_image(img, instructions); + + // cast and normalize to f32 + for (size_t i = 0; i < 
imgs.size(); ++i) { + // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp"); + clip_image_f32_ptr res(clip_image_f32_init()); + img_u8_to_f32(*imgs[i], *res, hparams.image_mean, hparams.image_std); + output.entries.push_back(std::move(res)); + } + + output.grid_x = instructions.grid_size.width; + output.grid_y = instructions.grid_size.height; + return true; +} + +// +// mtmd_image_preprocessor_internvl +// + +bool mtmd_image_preprocessor_internvl::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) { + GGML_ASSERT(!hparams.image_res_candidates.empty()); + const clip_image_size original_size{img.nx, img.ny}; + auto const inst = get_slice_instructions(original_size); + std::vector imgs = slice_image(img, inst, false); + + for (size_t i = 0; i < imgs.size(); ++i) { + clip_image_f32_ptr res(clip_image_f32_init()); + img_u8_to_f32(*imgs[i], *res, hparams.image_mean, hparams.image_std); + output.entries.push_back(std::move(res)); + } + return true; +} + +// +// mtmd_image_preprocessor_deepseekocr +// + +bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) { + const std::vector native_resolutions = { + /*512 tiny , 640 small, */ 1024 /* base */, 1280 /* large */ + }; + // original image size + const clip_image_size original_size{img.nx, img.ny}; + const int orig_w = original_size.width; + const int orig_h = original_size.height; + const int orig_area = orig_h * orig_w; + + size_t mode_i = 0; + int min_diff = orig_area; + + for (size_t i = 0; i < native_resolutions.size(); i++) { + int r = native_resolutions[i]; + if (std::abs(orig_area - r * r) < min_diff) { + mode_i = i; + min_diff = std::abs(orig_area - r * r); + } + } + + /* Native Resolution (Base/Large) */ + const int image_size = native_resolutions[mode_i]; + + // scaled and padded image + clip_image_u8_ptr scaled_img(clip_image_u8_init()); + img_tool::resize(img, *scaled_img, clip_image_size{image_size, image_size}, 
hparams.image_resize_algo); + + clip_image_f32_ptr res(clip_image_f32_init()); + img_u8_to_f32(*scaled_img, *res, hparams.image_mean, hparams.image_std); + output.entries.push_back(std::move(res)); + + output.grid_x = 1; + output.grid_y = 1; + return true; +} + +// +// mtmd_image_preprocessor_youtuvl +// + +bool mtmd_image_preprocessor_youtuvl::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) { + const int patch_size = hparams.patch_size; // typically 16 + const int merge_size = hparams.n_merge; // typically 2 + const int align_size = patch_size * merge_size; // 32 + + const int max_num_patches = hparams.image_max_pixels > 0 ? + hparams.image_max_pixels / (patch_size * patch_size) : 256; + + // Linear search for optimal scale to fit within max_num_patches + float scale = 1.0f; + int target_height = img.ny; + int target_width = img.nx; + + auto get_scaled_image_size = [align_size](float scale, int size) -> int { + float scaled_size = size * scale; + // Round up to nearest multiple of align_size + int aligned = static_cast(std::ceil(scaled_size / align_size)) * align_size; + // Ensure at least one patch + return std::max(align_size, aligned); + }; + + // Linear search with 0.02 step size + while (scale > 0.0f) { + target_height = get_scaled_image_size(scale, img.ny); + target_width = get_scaled_image_size(scale, img.nx); + + int num_patches_h = target_height / patch_size; + int num_patches_w = target_width / patch_size; + int num_patches = num_patches_h * num_patches_w; + + if (num_patches > max_num_patches) { + scale -= 0.02f; + } else { + break; + } + } + + clip_image_size new_size = {target_width, target_height}; + + // Resize the image + clip_image_u8 resized; + img_tool::resize(img, resized, new_size, hparams.image_resize_algo, hparams.image_resize_pad); + + // Normalize to float32 + clip_image_f32_ptr img_f32(clip_image_f32_init()); + img_u8_to_f32(resized, *img_f32, hparams.image_mean, hparams.image_std); + // Add to results + 
output.entries.push_back(std::move(img_f32)); + return true; +} diff --git a/tools/mtmd/mtmd-image.h b/tools/mtmd/mtmd-image.h new file mode 100644 index 0000000000..065b937d61 --- /dev/null +++ b/tools/mtmd/mtmd-image.h @@ -0,0 +1,150 @@ +#pragma once + +#include "ggml.h" +#include "clip-model.h" + +#include +#include + +#define MTMD_INTERNAL_HEADER + +// base class, models must inherit from this class +struct mtmd_image_preprocessor { + const clip_hparams & hparams; + + mtmd_image_preprocessor(const clip_ctx * ctx): hparams(*clip_get_hparams(ctx)) {} + + virtual ~mtmd_image_preprocessor() = default; + virtual bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) = 0; + + void img_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]); + void img_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst); +}; + +/** + * implementation of LLaVA-UHD: + * - https://arxiv.org/pdf/2403.11703 + * - https://github.com/thunlp/LLaVA-UHD + * - https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118 + * + * overview: + * - an image always have a single overview (downscaled image) + * - an image can have 0 or multiple slices, depending on the image size + * - each slice can then be considered as a separate image + * + * note: the term "slice" and "tile" are used interchangeably + * + * for example: + * + * [overview] --> [slice 1] --> [slice 2] + * | | + * +--> [slice 3] --> [slice 4] + */ +struct mtmd_image_preprocessor_llava_uhd : mtmd_image_preprocessor { + mtmd_image_preprocessor_llava_uhd(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {} + bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override; + + struct slice_coordinates { + int x; + int y; + clip_image_size size; + }; + + struct slice_instructions { + clip_image_size overview_size; // size of downscaled image + clip_image_size refined_size; // size 
of image right before slicing (must be multiple of slice size) + clip_image_size grid_size; // grid_size.width * grid_size.height = number of slices + std::vector slices; + }; + + // LFM2 override this function to implement its custom slicing logic + virtual slice_instructions get_slice_instructions(const clip_image_size & original_size); + + std::vector slice_image(const clip_image_u8 & img, const slice_instructions & inst, bool overview_first = true); + +private: + clip_image_size get_best_resize(const clip_image_size & original_size, int scale_resolution, int patch_size, bool allow_upscale = false); + + clip_image_size resize_maintain_aspect_ratio(const clip_image_size & orig, const clip_image_size & target_max); + + /** + * Selects the best resolution from a list of possible resolutions based on the original size. + * + * For example, when given a list of resolutions: + * - 100x100 + * - 200x100 + * - 100x200 + * - 200x200 + * + * And an input image of size 111x200, then 100x200 is the best fit (least wasted resolution). 
 + * + * @param original_size The original size of the image + * @param possible_resolutions A list of possible resolutions + * @return The best fit resolution + */ + clip_image_size select_best_resolution(const clip_image_size & original_size, const std::vector & possible_resolutions); + int ensure_divide(int length, int patch_size); + clip_image_size get_refine_size(const clip_image_size & original_size, const clip_image_size & grid, int scale_resolution, int patch_size, bool allow_upscale = false); + clip_image_size get_best_grid(const int max_slice_nums, const int multiple, const float log_ratio); +}; + +// downscale or upscale the input image to fixed size +struct mtmd_image_preprocessor_fixed_size : mtmd_image_preprocessor { + mtmd_image_preprocessor_fixed_size(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {} + bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override; +}; + +// resize image to multiple of patch_size*n_merge, while preserving aspect ratio +// if image_resize_pad is true, the resized image will be padded, otherwise it will be either stretched or center-cropped depending on the resize algorithm +// this is used by models with native support for dynamic image size, for example: Qwen-VL, Pixtral, Kimi-VL, etc +struct mtmd_image_preprocessor_dyn_size : mtmd_image_preprocessor { + mtmd_image_preprocessor_dyn_size(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {} + bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override; +}; + +// similar to mtmd_image_preprocessor_dyn_size, but resize the image to have longest edge equal to hparams.image_longest_edge, while preserving aspect ratio +struct mtmd_image_preprocessor_longest_edge : mtmd_image_preprocessor { + mtmd_image_preprocessor_longest_edge(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {} + bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override; +}; + +// custom llava-uhd slicing logic for LFM2 +// ref:
https://github.com/huggingface/transformers/blob/v5.1.0/src/transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +struct mtmd_image_preprocessor_lfm2 : mtmd_image_preprocessor_llava_uhd { + // ref: https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B/blob/main/processor_config.json + static constexpr int min_tiles = 2; + static constexpr int max_tiles = 10; + static constexpr float max_pixels_tolerance = 2.0f; + static constexpr int tile_size = 512; + + using mtmd_image_preprocessor_llava_uhd::mtmd_image_preprocessor_llava_uhd; + slice_instructions get_slice_instructions(const clip_image_size & original_size) override; + +private: + clip_image_size find_closest_aspect_ratio( + float aspect_ratio, + const std::vector & target_ratios, + int width, int height); + std::vector get_target_ratios(); + clip_image_size get_grid_layout(int height, int width); +}; + +struct mtmd_image_preprocessor_idefics3 : mtmd_image_preprocessor_llava_uhd { + mtmd_image_preprocessor_idefics3(const clip_ctx * ctx) : mtmd_image_preprocessor_llava_uhd(ctx) {} + bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override; +}; + +struct mtmd_image_preprocessor_internvl : mtmd_image_preprocessor_llava_uhd { + mtmd_image_preprocessor_internvl(const clip_ctx * ctx) : mtmd_image_preprocessor_llava_uhd(ctx) {} + bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override; +}; + +struct mtmd_image_preprocessor_deepseekocr : mtmd_image_preprocessor { + mtmd_image_preprocessor_deepseekocr(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {} + bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override; +}; + +struct mtmd_image_preprocessor_youtuvl : mtmd_image_preprocessor { + mtmd_image_preprocessor_youtuvl(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {} + bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override; +}; diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 
456ce7b73c..d078120f76 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -2,6 +2,7 @@ #include "clip-impl.h" #include "mtmd.h" #include "mtmd-audio.h" +#include "mtmd-image.h" #include "debug/mtmd-debug.h" #include "llama.h" @@ -138,7 +139,7 @@ struct mtmd_context { // for llava-uhd style models, we need special tokens in-between slices // minicpmv calls them "slices", llama 4 calls them "tiles" - mtmd_slice_tmpl slice_tmpl = MTMD_SLICE_TMPL_NONE; + mtmd_slice_tmpl slice_tmpl = MTMD_SLICE_TMPL_NONE; std::vector tok_ov_img_start; // overview image std::vector tok_ov_img_end; // overview image std::vector tok_slices_start; // start of all slices @@ -147,13 +148,14 @@ struct mtmd_context { std::vector tok_sli_img_end; // single slice end std::vector tok_sli_img_mid; // between 2 slices std::vector tok_row_end; // end of row - bool tok_row_end_trail = false; - bool ov_img_first = false; + bool tok_row_end_trail = false; + bool ov_img_first = false; // string template for slice image delimiters with row/col (idefics3) std::string sli_img_start_tmpl; std::unique_ptr audio_preproc; + std::unique_ptr image_preproc; // TODO @ngxson : add timings @@ -221,123 +223,193 @@ struct mtmd_context { void init_vision() { GGML_ASSERT(ctx_v != nullptr); + image_preproc.reset(); projector_type proj = clip_get_projector_type(ctx_v); - int minicpmv_version = clip_is_minicpmv(ctx_v); - if (minicpmv_version == 2) { - // minicpmv 2.5 format: - // (overview) (slice) (slice) \n ... 
- slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_5; - tok_ov_img_start = {lookup_token("")}; - tok_ov_img_end = {lookup_token("")}; - tok_slices_start = {lookup_token("")}; - tok_slices_end = {lookup_token("")}; - tok_sli_img_start = tok_ov_img_start; - tok_sli_img_end = tok_ov_img_end; - tok_row_end = {lookup_token("\n")}; - tok_row_end_trail = false; // no trailing end-of-row token - ov_img_first = true; - } else if (minicpmv_version == 3 || minicpmv_version == 4 || minicpmv_version == 5 || minicpmv_version == 6 || minicpmv_version == 100045) { - // minicpmv 2.6 format: - // (overview) (slice) (slice) \n ... - slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_6; - tok_ov_img_start = {lookup_token("")}; - tok_ov_img_end = {lookup_token("")}; - tok_sli_img_start = {lookup_token("")}; - tok_sli_img_end = {lookup_token("")}; - tok_row_end = {lookup_token("\n")}; - tok_row_end_trail = false; // no trailing end-of-row token - ov_img_first = true; + switch (proj) { + case PROJECTOR_TYPE_MLP: + case PROJECTOR_TYPE_MLP_NORM: + case PROJECTOR_TYPE_LDP: + case PROJECTOR_TYPE_LDPV2: + case PROJECTOR_TYPE_COGVLM: + case PROJECTOR_TYPE_JANUS_PRO: + case PROJECTOR_TYPE_GLM_EDGE: + { + bool has_pinpoints = !clip_get_hparams(ctx_v)->image_res_candidates.empty(); + if (has_pinpoints) { + image_preproc = std::make_unique(ctx_v); + } else { + image_preproc = std::make_unique(ctx_v); + } + } break; + case PROJECTOR_TYPE_MINICPMV: + { + int minicpmv_version = clip_is_minicpmv(ctx_v); + if (minicpmv_version == 2) { + // minicpmv 2.5 format: + // (overview) (slice) (slice) \n ... 
+ slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_5; + tok_ov_img_start = {lookup_token("")}; + tok_ov_img_end = {lookup_token("")}; + tok_slices_start = {lookup_token("")}; + tok_slices_end = {lookup_token("")}; + tok_sli_img_start = tok_ov_img_start; + tok_sli_img_end = tok_ov_img_end; + tok_row_end = {lookup_token("\n")}; + tok_row_end_trail = false; // no trailing end-of-row token + ov_img_first = true; - } else if (minicpmv_version != 0) { - GGML_ASSERT(false && "unsupported minicpmv version"); - } else if (proj == PROJECTOR_TYPE_LLAMA4) { - // llama 4 format: - // <|image_start|> - // (slice) <|tile_x_separator|> (slice) <|tile_x_separator|> ... <|tile_y_separator|> - // (slice) <|tile_x_separator|> (slice) <|tile_x_separator|> ... <|tile_y_separator|> - // ... <|tile_y_separator|> <-- trailing end-of-row token - // <|image|> (overview) <-- overview image is last - // <|image_end|> - slice_tmpl = MTMD_SLICE_TMPL_LLAMA4; - tok_ov_img_start = {lookup_token("<|image|>")}; - tok_sli_img_mid = {lookup_token("<|tile_x_separator|>")}; - tok_row_end = {lookup_token("<|tile_y_separator|>")}; - tok_row_end_trail = true; // add trailing end-of-row token - ov_img_first = false; // overview image is last + } else if (minicpmv_version == 3 || minicpmv_version == 4 || minicpmv_version == 5 || minicpmv_version == 6 || minicpmv_version == 100045) { + // minicpmv 2.6 format: + // (overview) (slice) (slice) \n ... 
+ slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_6; + tok_ov_img_start = {lookup_token("")}; + tok_ov_img_end = {lookup_token("")}; + tok_sli_img_start = {lookup_token("")}; + tok_sli_img_end = {lookup_token("")}; + tok_row_end = {lookup_token("\n")}; + tok_row_end_trail = false; // no trailing end-of-row token + ov_img_first = true; + + } else if (minicpmv_version != 0) { + throw std::runtime_error(string_format("unsupported minicpmv version: %d\n", minicpmv_version)); + } + image_preproc = std::make_unique(ctx_v); + } break; + case PROJECTOR_TYPE_QWEN2VL: + case PROJECTOR_TYPE_QWEN25VL: + case PROJECTOR_TYPE_QWEN3VL: + { + // <|vision_start|> ... (image embeddings) ... <|vision_end|> + img_beg = "<|vision_start|>"; + img_end = "<|vision_end|>"; + image_preproc = std::make_unique(ctx_v); + } break; + case PROJECTOR_TYPE_YOUTUVL: + { + // <|vision_start|> ... (image embeddings) ... <|vision_end|> + img_beg = "<|vision_start|>"; + img_end = "<|vision_end|>"; + image_preproc = std::make_unique(ctx_v); + } break; + case PROJECTOR_TYPE_GEMMA3: + case PROJECTOR_TYPE_GEMMA3NV: + { + // ... (image embeddings) ... 
+ img_beg = ""; + img_end = ""; + image_preproc = std::make_unique(ctx_v); + } break; + case PROJECTOR_TYPE_IDEFICS3: + { + // https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215 + slice_tmpl = MTMD_SLICE_TMPL_IDEFICS3; + tok_ov_img_start = {lookup_token("\n\n"), lookup_token(""), lookup_token("")}; + tok_ov_img_end = {lookup_token("")}; + tok_row_end = {lookup_token("\n")}; + sli_img_start_tmpl = ""; + image_preproc = std::make_unique(ctx_v); + } break; + case PROJECTOR_TYPE_PIXTRAL: + { + // https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md + img_end = "[IMG_END]"; + image_preproc = std::make_unique(ctx_v); + } break; + case PROJECTOR_TYPE_PHI4: + { + // Phi-4 uses media marker insertion only. Keep image boundary text empty. + image_preproc = std::make_unique(ctx_v); + } break; + case PROJECTOR_TYPE_LLAMA4: + { + // (more details in mtmd_context constructor) + img_beg = "<|image_start|>"; + img_end = "<|image_end|>"; + LOG_WRN("%s: llama 4 vision is known to have degraded quality:\n" + " https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__); + image_preproc = std::make_unique(ctx_v); + } break; + case PROJECTOR_TYPE_INTERNVL: + { + // ... (image embeddings) ... + img_beg = ""; + img_end = ""; + image_preproc = std::make_unique(ctx_v); + } break; + case PROJECTOR_TYPE_KIMIVL: + { + // <|media_start|> ... (image embeddings) ... <|media_end|> + img_beg = "<|media_start|>"; + img_end = "<|media_end|>"; + image_preproc = std::make_unique(ctx_v); + } break; + case PROJECTOR_TYPE_KIMIK25: + { + // <|media_begin|> ... (image embeddings) ... <|media_end|> + img_beg = "<|media_begin|>"; + img_end = "<|media_end|>"; + image_preproc = std::make_unique(ctx_v); + } break; + case PROJECTOR_TYPE_LIGHTONOCR: + { + // <|im_start|> ... (image embeddings) ... 
<|im_end|> + img_beg = "<|im_start|>"; + img_end = "<|im_end|>"; + image_preproc = std::make_unique(ctx_v); + } break; + case PROJECTOR_TYPE_NEMOTRON_V2_VL: + { + image_preproc = std::make_unique(ctx_v); + } break; + case PROJECTOR_TYPE_LFM2: + { + // multi-tile: + // <|image_start|> + // <|img_row_1_col_1|> (tile) <|img_row_1_col_2|> (tile) ... + // <|img_thumbnail|> (thumbnail) + // <|image_end|> + // single-tile: + // <|image_start|> (image) <|image_end|> + img_beg = "<|image_start|>"; + img_end = "<|image_end|>"; + slice_tmpl = MTMD_SLICE_TMPL_LFM2; + sli_img_start_tmpl = "<|img_row_%d_col_%d|>"; + tok_ov_img_start = {lookup_token("<|img_thumbnail|>")}; + ov_img_first = false; + image_preproc = std::make_unique(ctx_v); + } break; + case PROJECTOR_TYPE_GLM4V: + { + // <|begin_of_image|> ... (image embeddings) ... <|end_of_image|> + img_beg = "<|begin_of_image|>"; + img_end = "<|end_of_image|>"; + image_preproc = std::make_unique(ctx_v); + } break; + case PROJECTOR_TYPE_PADDLEOCR: + { + // <|IMAGE_START|> ... (image embeddings) ... <|IMAGE_END|> + img_beg = "<|IMAGE_START|>"; + img_end = "<|IMAGE_END|>"; + image_preproc = std::make_unique(ctx_v); + } break; + case PROJECTOR_TYPE_DEEPSEEKOCR: + { + img_end = "\n"; // prevent empty batch on llama-server + image_preproc = std::make_unique(ctx_v); + } break; + default: + throw std::runtime_error(string_format("%s: unexpected vision projector type %d\n", __func__, proj)); } - // set boi/eoi - if (proj == PROJECTOR_TYPE_GEMMA3 || proj == PROJECTOR_TYPE_GEMMA3NV) { - // ... (image embeddings) ... 
- img_beg = ""; - img_end = ""; - - } else if (proj == PROJECTOR_TYPE_IDEFICS3) { - // https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215 - slice_tmpl = MTMD_SLICE_TMPL_IDEFICS3; - tok_ov_img_start = {lookup_token("\n\n"), lookup_token(""), lookup_token("")}; - tok_ov_img_end = {lookup_token("")}; - tok_row_end = {lookup_token("\n")}; - sli_img_start_tmpl = ""; - - } else if (proj == PROJECTOR_TYPE_PIXTRAL) { - // https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md - img_end = "[IMG_END]"; - - } else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL || proj == PROJECTOR_TYPE_QWEN3VL || proj == PROJECTOR_TYPE_YOUTUVL) { - // <|vision_start|> ... (image embeddings) ... <|vision_end|> - img_beg = "<|vision_start|>"; - img_end = "<|vision_end|>"; - - } else if (proj == PROJECTOR_TYPE_PHI4) { - // Phi-4 uses media marker insertion only. Keep image boundary text empty. - - } else if (proj == PROJECTOR_TYPE_LLAMA4) { - // (more details in mtmd_context constructor) - img_beg = "<|image_start|>"; - img_end = "<|image_end|>"; - LOG_WRN("%s: llama 4 vision is known to have degraded quality:\n" - " https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__); - - } else if (proj == PROJECTOR_TYPE_INTERNVL) { - // ... (image embeddings) ... - img_beg = ""; - img_end = ""; - - } else if (proj == PROJECTOR_TYPE_LIGHTONOCR) { - // <|im_start|> ... (image embeddings) ... <|im_end|> - img_beg = "<|im_start|>"; - img_end = "<|im_end|>"; - - } else if (proj == PROJECTOR_TYPE_LFM2) { - // multi-tile: - // <|image_start|> - // <|img_row_1_col_1|> (tile) <|img_row_1_col_2|> (tile) ... 
- // <|img_thumbnail|> (thumbnail) - // <|image_end|> - // single-tile: - // <|image_start|> (image) <|image_end|> - img_beg = "<|image_start|>"; - img_end = "<|image_end|>"; - slice_tmpl = MTMD_SLICE_TMPL_LFM2; - sli_img_start_tmpl = "<|img_row_%d_col_%d|>"; - tok_ov_img_start = {lookup_token("<|img_thumbnail|>")}; - ov_img_first = false; - } else if (proj == PROJECTOR_TYPE_GLM4V) { - img_beg = "<|begin_of_image|>"; - img_end = "<|end_of_image|>"; - - } else if (proj == PROJECTOR_TYPE_PADDLEOCR) { - // <|IMAGE_START|> ... (image embeddings) ... <|IMAGE_END|> - img_beg = "<|IMAGE_START|>"; - img_end = "<|IMAGE_END|>"; - } + GGML_ASSERT(image_preproc != nullptr); } void init_audio() { GGML_ASSERT(ctx_a != nullptr); + audio_preproc.reset(); + projector_type proj = clip_get_projector_type(ctx_a); LOG_WRN("%s: audio input is in experimental stage and may have reduced quality:\n" @@ -347,36 +419,40 @@ struct mtmd_context { switch (proj) { case PROJECTOR_TYPE_QWEN2A: case PROJECTOR_TYPE_QWEN25O: - case PROJECTOR_TYPE_ULTRAVOX: + { + // <|audio_bos|> ... (embeddings) ... <|audio_eos|> + aud_beg = "<|audio_bos|>"; + aud_end = "<|audio_eos|>"; + audio_preproc = std::make_unique(ctx_a); + } break; case PROJECTOR_TYPE_VOXTRAL: - case PROJECTOR_TYPE_GLMA: + { + // [BEGIN_AUDIO] ... (embeddings) ... + aud_beg = "[BEGIN_AUDIO]"; + audio_preproc = std::make_unique(ctx_a); + } break; case PROJECTOR_TYPE_MUSIC_FLAMINGO: - audio_preproc = std::make_unique(ctx_a); - break; + { + // ... (embeddings) ... 
+ aud_beg = ""; + audio_preproc = std::make_unique(ctx_a); + } break; + case PROJECTOR_TYPE_ULTRAVOX: + case PROJECTOR_TYPE_GLMA: + { + audio_preproc = std::make_unique(ctx_a); + } break; case PROJECTOR_TYPE_LFM2A: - audio_preproc = std::make_unique(ctx_a); - break; + { + audio_preproc = std::make_unique(ctx_a); + } break; default: - GGML_ABORT("unsupported audio projector type"); + throw std::runtime_error(string_format("%s: unexpected audio projector type %d\n", __func__, proj)); } // initialize audio preprocessor + GGML_ASSERT(audio_preproc != nullptr); audio_preproc->initialize(); - - // set special tokens - if (proj == PROJECTOR_TYPE_QWEN2A) { - // <|audio_bos|> ... (embeddings) ... <|audio_eos|> - aud_beg = "<|audio_bos|>"; - aud_end = "<|audio_eos|>"; - - } else if (proj == PROJECTOR_TYPE_ULTRAVOX) { - // [BEGIN_AUDIO] ... (embeddings) ... - aud_beg = "[BEGIN_AUDIO]"; - - } else if (proj == PROJECTOR_TYPE_MUSIC_FLAMINGO) { - // ... (embeddings) ... - aud_beg = ""; - } } // get clip ctx based on chunk type @@ -573,8 +649,9 @@ struct mtmd_tokenizer { std::memcpy(img_u8->buf.data(), bitmap->data.data(), img_u8->nx * img_u8->ny * 3); // preprocess image + GGML_ASSERT(ctx->image_preproc != nullptr); clip_image_f32_batch batch_f32; - bool ok = clip_image_preprocess(ctx->ctx_v, img_u8.get(), &batch_f32); + bool ok = ctx->image_preproc->preprocess(*img_u8, batch_f32); if (!ok) { LOG_ERR("Unable to preprocess image\n"); return 2; @@ -1225,7 +1302,8 @@ void mtmd_debug_preprocess_image(mtmd_context * ctx, const std::vector img_u8.ny = ny; img_u8.buf = rgb_values; clip_image_f32_batch batch_f32; - bool ok = clip_image_preprocess(ctx->ctx_v, &img_u8, &batch_f32); + GGML_ASSERT(ctx->image_preproc != nullptr); + bool ok = ctx->image_preproc->preprocess(img_u8, batch_f32); if (!ok) { LOG_ERR("%s: failed to preprocess image\n", __func__); return;