fix: test-1.jpg OCR issue with small (640) resolution

Restrict dynamic-resolution selection to base (1024) as the minimum and large (1280) as the maximum; the tiny (512) and small (640) modes are no longer selected.
2025-12-10 20:20:55 +01:00 · 2025-12-10 20:20:55 +01:00 · ed944cd25b
parent 016140699f
commit ed944cd25b
1 changed files with 85 additions and 160 deletions
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@ -5213,7 +5213,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
    case PROJECTOR_TYPE_DEEPSEEKOCR:
        {
            const int native_resolutions[] = {
-                    512 /* tiny */, 640 /* small */, 1024 /* base */, 1280 /* large */
+                /* 512 tiny ,640  small ,*/ 1024 /* base */, 1280 /* large */
            };
            // original image size
            const int orig_w = original_size.width;
@ -5228,7 +5228,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
            int mode_i = 0;
            int min_diff = orig_area;

-                for (int i = 0; i < 4; i++) {
+            for (int i = 0; i < 2; i++) {
                int r = native_resolutions[i];
                if (std::abs(orig_area - r * r) < min_diff) {
                    mode_i = i;
@ -5236,24 +5236,6 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                }
            }

-                if (mode_i < 2) {
-                    /* Native Resolution (Tiny/Small) */
-                    const int image_size = native_resolutions[mode_i];
-
-                    // Just resize the image to image_size × image_size
-                    clip_image_u8_ptr resized_img(clip_image_u8_init());
-                    img_tool::resize(*img, *resized_img,
-                                    clip_image_size{image_size, image_size},
-                                    img_tool::RESIZE_ALGO_BICUBIC_PILLOW, false, color);  // Match PIL default
-
-                    clip_image_f32_ptr res(clip_image_f32_init());
-                    normalize_image_u8_to_f32(*resized_img, *res, params.image_mean, params.image_std);
-                    res_imgs->entries.push_back(std::move(res));
-
-                    res_imgs->grid_x = 1;
-                    res_imgs->grid_y = 1;
-                }
-                else if (mode_i < 4) {
            /* Native Resolution (Base/Large) */
            const int image_size = native_resolutions[mode_i];

@ -5281,7 +5263,8 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
            padded_img->buf.resize(image_size * image_size * 3); // black padding

            // Fill with mean color
-                    for (int i = 0; i < image_size * image_size; ++i) {
+            for (int i = 0; i < image_size * image_size; ++i)
+            {
                padded_img->buf[i * 3 + 0] = pad_r;
                padded_img->buf[i * 3 + 1] = pad_g;
                padded_img->buf[i * 3 + 2] = pad_b;
@ -5310,66 +5293,8 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
            res_imgs->grid_x = 1;
            res_imgs->grid_y = 1;
        }
-                else {
-                    GGML_ABORT("DeepSeek-OCR hasn't supported Gundam/Gundam-Master yet");
-                    /* Dynamic Resolution (Gundam/Gundam-Master) */
-
-                    // configurable, or read from params
-                    const int min_num    = 2;
-                    const int max_num    = 9;
-                    const int image_size = (mode_i == 4) ? 640 : 1024;
-
-                    // original image size
-                    const int orig_w = original_size.width;
-                    const int orig_h = original_size.height;
-
-                    // create overview image (thumbnail)
-                    clip_image_u8_ptr overview_img(clip_image_u8_init());
-                    img_tool::resize(*img, *overview_img, { image_size, image_size },
-                                     img_tool::RESIZE_ALGO_BICUBIC_PILLOW, true, color);
-                    clip_image_f32_ptr overview_f32(clip_image_f32_init());
-                    normalize_image_u8_to_f32(*overview_img, *overview_f32, params.image_mean, params.image_std);
-                    res_imgs->entries.push_back(std::move(overview_f32));
-
-                    // build candidate grids (cols, rows)
-                    auto target_ratios = ds_build_target_ratios(min_num, max_num);
-
-                    // pick the grid that best matches the original aspect ratio
-                    const float aspect_ratio = static_cast<float>(orig_w) / static_cast<float>(orig_h);
-                    auto best = ds_find_closest_ratio(aspect_ratio, target_ratios, orig_w, orig_h, image_size);
-                    const int grid_cols = best.first;   // how many tiles horizontally
-                    const int grid_rows = best.second;  // how many tiles vertically
-
-                    // resize to refined size (no padding, direct resize)
-                    clip_image_u8_ptr refined_img(clip_image_u8_init());
-                    img_tool::resize(*img, *refined_img, { image_size * grid_cols, image_size * grid_rows },
-                                     img_tool::RESIZE_ALGO_BICUBIC_PILLOW, false);
-
-                    // crop slices from the refined image
-                    for (int r = 0; r < grid_rows; ++r) {
-                        for (int c = 0; c < grid_cols; ++c) {
-                            const int x = c * image_size;
-                            const int y = r * image_size;
-
-                            // crop the slice
-                            clip_image_u8_ptr slice_img(clip_image_u8_init());
-                            img_tool::crop(*refined_img, *slice_img, x, y, image_size, image_size);
-
-                            // normalize and add to results
-                            clip_image_f32_ptr slice_f32(clip_image_f32_init());
-                            normalize_image_u8_to_f32(*slice_img, *slice_f32, params.image_mean, params.image_std);
-                            res_imgs->entries.push_back(std::move(slice_f32));
-                        }
-                    }
-
-                    // keep the grid info — the model may need to know how to reassemble / attend
-                    res_imgs->grid_x = grid_cols;
-                    res_imgs->grid_y = grid_rows;
-                }
-            }
        break;

-
        default:
            LOG_ERR("%s: unsupported projector type %d\n", __func__, ctx->proj_type());
            return false;