mtmd : fix LightOnOCR image preprocessing (#20877)

This commit is contained in:
DorianRudolph 2026-03-23 01:04:14 +01:00 committed by GitHub
parent 49bfddeca1
commit d3ac030a5d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
1 changed files with 22 additions and 2 deletions

View File

@ -1161,7 +1161,6 @@ struct clip_model_loader {
hparams.set_warmup_n_tokens(16*16);
} break;
case PROJECTOR_TYPE_PIXTRAL:
case PROJECTOR_TYPE_LIGHTONOCR:
{
// ref: https://huggingface.co/mistral-community/pixtral-12b/blob/main/preprocessor_config.json
// TODO: verify the image_min_tokens
@ -1171,6 +1170,15 @@ struct clip_model_loader {
hparams.set_limit_image_tokens(8, 1024);
hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
} break;
case PROJECTOR_TYPE_LIGHTONOCR:
{
hparams.n_merge = 1;
hparams.rope_theta = 10000.0f;
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
hparams.image_longest_edge = hparams.image_size;
get_u32(KEY_PREPROC_IMAGE_SIZE, hparams.image_longest_edge, false);
hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
} break;
case PROJECTOR_TYPE_KIMIVL:
{
hparams.rope_theta = 10000.0f;
@ -3180,7 +3188,6 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
case PROJECTOR_TYPE_PHI4:
case PROJECTOR_TYPE_PIXTRAL:
case PROJECTOR_TYPE_LIGHTONOCR:
{
GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
clip_image_u8 resized_image;
@ -3196,6 +3203,19 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
res_imgs->entries.push_back(std::move(img_f32));
} break;
case PROJECTOR_TYPE_LIGHTONOCR:
{
GGML_ASSERT(params.image_longest_edge > 0);
clip_image_u8 resized_image;
const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
original_size,
params.patch_size * params.n_merge,
params.image_longest_edge);
img_tool::resize(*img, resized_image, target_size, img_tool::RESIZE_ALGO_BICUBIC);
clip_image_f32_ptr img_f32(clip_image_f32_init());
normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
res_imgs->entries.push_back(std::move(img_f32));
} break;
case PROJECTOR_TYPE_LLAMA4:
{