mtmd : fix LightOnOCR image preprocessing (#20877)
This commit is contained in:
parent
49bfddeca1
commit
d3ac030a5d
|
|
@ -1161,7 +1161,6 @@ struct clip_model_loader {
|
||||||
hparams.set_warmup_n_tokens(16*16);
|
hparams.set_warmup_n_tokens(16*16);
|
||||||
} break;
|
} break;
|
||||||
case PROJECTOR_TYPE_PIXTRAL:
|
case PROJECTOR_TYPE_PIXTRAL:
|
||||||
case PROJECTOR_TYPE_LIGHTONOCR:
|
|
||||||
{
|
{
|
||||||
// ref: https://huggingface.co/mistral-community/pixtral-12b/blob/main/preprocessor_config.json
|
// ref: https://huggingface.co/mistral-community/pixtral-12b/blob/main/preprocessor_config.json
|
||||||
// TODO: verify the image_min_tokens
|
// TODO: verify the image_min_tokens
|
||||||
|
|
@ -1171,6 +1170,15 @@ struct clip_model_loader {
|
||||||
hparams.set_limit_image_tokens(8, 1024);
|
hparams.set_limit_image_tokens(8, 1024);
|
||||||
hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
|
hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
|
||||||
} break;
|
} break;
|
||||||
|
case PROJECTOR_TYPE_LIGHTONOCR:
|
||||||
|
{
|
||||||
|
hparams.n_merge = 1;
|
||||||
|
hparams.rope_theta = 10000.0f;
|
||||||
|
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
|
||||||
|
hparams.image_longest_edge = hparams.image_size;
|
||||||
|
get_u32(KEY_PREPROC_IMAGE_SIZE, hparams.image_longest_edge, false);
|
||||||
|
hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
|
||||||
|
} break;
|
||||||
case PROJECTOR_TYPE_KIMIVL:
|
case PROJECTOR_TYPE_KIMIVL:
|
||||||
{
|
{
|
||||||
hparams.rope_theta = 10000.0f;
|
hparams.rope_theta = 10000.0f;
|
||||||
|
|
@ -3180,7 +3188,6 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
|
||||||
|
|
||||||
case PROJECTOR_TYPE_PHI4:
|
case PROJECTOR_TYPE_PHI4:
|
||||||
case PROJECTOR_TYPE_PIXTRAL:
|
case PROJECTOR_TYPE_PIXTRAL:
|
||||||
case PROJECTOR_TYPE_LIGHTONOCR:
|
|
||||||
{
|
{
|
||||||
GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
|
GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
|
||||||
clip_image_u8 resized_image;
|
clip_image_u8 resized_image;
|
||||||
|
|
@ -3196,6 +3203,19 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
|
||||||
normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
|
normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
|
||||||
res_imgs->entries.push_back(std::move(img_f32));
|
res_imgs->entries.push_back(std::move(img_f32));
|
||||||
} break;
|
} break;
|
||||||
|
case PROJECTOR_TYPE_LIGHTONOCR:
|
||||||
|
{
|
||||||
|
GGML_ASSERT(params.image_longest_edge > 0);
|
||||||
|
clip_image_u8 resized_image;
|
||||||
|
const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
|
||||||
|
original_size,
|
||||||
|
params.patch_size * params.n_merge,
|
||||||
|
params.image_longest_edge);
|
||||||
|
img_tool::resize(*img, resized_image, target_size, img_tool::RESIZE_ALGO_BICUBIC);
|
||||||
|
clip_image_f32_ptr img_f32(clip_image_f32_init());
|
||||||
|
normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
|
||||||
|
res_imgs->entries.push_back(std::move(img_f32));
|
||||||
|
} break;
|
||||||
|
|
||||||
case PROJECTOR_TYPE_LLAMA4:
|
case PROJECTOR_TYPE_LLAMA4:
|
||||||
{
|
{
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue