diff --git a/common/arg.cpp b/common/arg.cpp
index de80f293f4..52094e3f10 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1829,21 +1829,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.image_max_tokens = value;
         }
     ).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_MAX_TOKENS"));
-    add_opt(common_arg(
-        {"--dsocr-mode"}, "MODE",
-        "DeepSeek-OCR resolution mode, one of:\n"
-        "- auto (default): automatically select resolution\n"
-        "- tiny, small, base, large: native resolution\n"
-        "- gundam, gundam-master: dynamic resolution",
-        [](common_params & params, const std::string & value) {
-            if (value == "auto" || value == "tiny" || value == "small" || value == "base" ||
-                value == "large" || value == "gundam" || value == "gundam-master") {
-                params.dsocr_mode = value;
-            } else {
-                throw std::invalid_argument("invalid value");
-            }
-        }
-    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_DSOCR_MODE"));
     if (llama_supports_rpc()) {
         add_opt(common_arg(
             {"--rpc"}, "SERVERS",
diff --git a/common/common.h b/common/common.h
index ce1137c5bb..cdca5e26a2 100644
--- a/common/common.h
+++ b/common/common.h
@@ -432,7 +432,6 @@ struct common_params {
     std::vector<std::string> image; // path to image file(s)
     int image_min_tokens = -1;
    int image_max_tokens = -1;
-    std::string dsocr_mode = "auto"; // DeepSeek-OCR resolution mode: auto, tiny, small, base, large, gundam, gundam-master
 
     // finetune
     struct lr_opt lr;
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 69c20ee0bc..ded8721199 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -206,9 +206,6 @@ struct clip_hparams {
     int32_t custom_image_min_tokens = -1;
     int32_t custom_image_max_tokens = -1;
 
-    // DeepSeek-OCR resolution mode
-    enum clip_dsocr_mode dsocr_mode = clip_dsocr_mode::CLIP_DSOCR_MODE_AUTO;
-
     void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) {
         const int cur_merge = n_merge == 0 ? 1 : n_merge;
         const int patch_area = patch_size * patch_size * cur_merge * cur_merge;
@@ -513,7 +510,6 @@ struct clip_ctx {
         if (ctx_params.image_max_tokens > 0) {
             model.hparams.custom_image_max_tokens = ctx_params.image_max_tokens;
         }
-        model.hparams.dsocr_mode = ctx_params.dsocr_mode;
 
         backend_ptrs.push_back(backend_cpu);
         backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu));
@@ -5291,30 +5287,13 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
     }
 
     int mode_i = 0;
-
-    if (params.dsocr_mode == clip_dsocr_mode::CLIP_DSOCR_MODE_TINY) {
-        mode_i = 0;
-    } else if (params.dsocr_mode == clip_dsocr_mode::CLIP_DSOCR_MODE_SMALL) {
-        mode_i = 1;
-    } else if (params.dsocr_mode == clip_dsocr_mode::CLIP_DSOCR_MODE_BASE) {
-        mode_i = 2;
-    } else if (params.dsocr_mode == clip_dsocr_mode::CLIP_DSOCR_MODE_LARGE) {
-        mode_i = 3;
-    } else if (params.dsocr_mode == clip_dsocr_mode::CLIP_DSOCR_MODE_GUNDAM) {
-        mode_i = 4;
-    } else if (params.dsocr_mode == clip_dsocr_mode::CLIP_DSOCR_MODE_GUNDAM_MASTER) {
-        mode_i = 5;
-    } else {
-        if (params.dsocr_mode != clip_dsocr_mode::CLIP_DSOCR_MODE_AUTO) {
-            LOG_WRN("%s: unknown dsocr_mode, using auto mode\n", __func__);
-        }
-        int min_diff = orig_area;
-        for (int i = 0; i < 4; i++) {
-            int r = native_resolutions[i];
-            if (std::abs(orig_area - r*r) < min_diff) {
-                mode_i = i;
-                min_diff = std::abs(orig_area - r*r);
-            }
+    int min_diff = orig_area;
+
+    for (int i = 0; i < 4; i++) {
+        int r = native_resolutions[i];
+        if (std::abs(orig_area - r*r) < min_diff) {
+            mode_i = i;
+            min_diff = std::abs(orig_area - r*r);
         }
     }
 
@@ -5393,7 +5372,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         res_imgs->grid_y = 1;
 
     } else {
-        GGML_ABORT("DeepSeek-OCR: Gundam/Gundam-Master haven't been tested yet.\n");
+        GGML_ABORT("DeepSeek-OCR: Gundam/Gundam-Master modes are not supported yet");
 
         /* Dynamic Resolution (Gundam/Gundam-Master) */
         // configurable, or read from params
diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h
index 33689b76de..e4f6566e15 100644
--- a/tools/mtmd/clip.h
+++ b/tools/mtmd/clip.h
@@ -29,23 +29,12 @@ enum clip_flash_attn_type {
     CLIP_FLASH_ATTN_TYPE_ENABLED = 1,
 };
 
-enum clip_dsocr_mode {
-    CLIP_DSOCR_MODE_AUTO,
-    CLIP_DSOCR_MODE_TINY,
-    CLIP_DSOCR_MODE_SMALL,
-    CLIP_DSOCR_MODE_BASE,
-    CLIP_DSOCR_MODE_LARGE,
-    CLIP_DSOCR_MODE_GUNDAM,
-    CLIP_DSOCR_MODE_GUNDAM_MASTER,
-};
-
 struct clip_context_params {
     bool use_gpu;
     enum clip_flash_attn_type flash_attn_type;
     int image_min_tokens;
     int image_max_tokens;
     bool warmup;
-    enum clip_dsocr_mode dsocr_mode;
 };
 
 struct clip_init_result {
diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp
index b9470d5c5c..ad38edc053 100644
--- a/tools/mtmd/mtmd-cli.cpp
+++ b/tools/mtmd/mtmd-cli.cpp
@@ -139,7 +139,6 @@ struct mtmd_cli_context {
         mparams.warmup = params.warmup;
         mparams.image_min_tokens = params.image_min_tokens;
         mparams.image_max_tokens = params.image_max_tokens;
-        mparams.dsocr_mode = params.dsocr_mode.c_str();
         ctx_vision.reset(mtmd_init_from_file(clip_path, model, mparams));
         if (!ctx_vision.get()) {
             LOG_ERR("Failed to load vision model from %s\n", clip_path);
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index 57c2970967..2c20af099b 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -111,7 +111,6 @@ mtmd_context_params mtmd_context_params_default() {
         /* warmup */ true,
         /* image_min_tokens */ -1,
         /* image_max_tokens */ -1,
-        /* dsocr_mode */ "auto",
     };
     return params;
 }
@@ -174,33 +173,12 @@ struct mtmd_context {
         throw std::runtime_error("media_marker must not be empty");
     }
 
-    enum clip_dsocr_mode dsocr_mode;
-
-    if (std::string(ctx_params.dsocr_mode) == "auto") {
-        dsocr_mode = clip_dsocr_mode::CLIP_DSOCR_MODE_AUTO;
-    } else if (std::string(ctx_params.dsocr_mode) == "tiny") {
-        dsocr_mode = clip_dsocr_mode::CLIP_DSOCR_MODE_TINY;
-    } else if (std::string(ctx_params.dsocr_mode) == "small") {
-        dsocr_mode = clip_dsocr_mode::CLIP_DSOCR_MODE_SMALL;
-    } else if (std::string(ctx_params.dsocr_mode) == "base") {
-        dsocr_mode = clip_dsocr_mode::CLIP_DSOCR_MODE_BASE;
-    } else if (std::string(ctx_params.dsocr_mode) == "large") {
-        dsocr_mode = clip_dsocr_mode::CLIP_DSOCR_MODE_LARGE;
-    } else if (std::string(ctx_params.dsocr_mode) == "gundam") {
-        dsocr_mode = clip_dsocr_mode::CLIP_DSOCR_MODE_GUNDAM;
-    } else if (std::string(ctx_params.dsocr_mode) == "gundam-master") {
-        dsocr_mode = clip_dsocr_mode::CLIP_DSOCR_MODE_GUNDAM_MASTER;
-    } else {
-        throw std::invalid_argument("invalid value");
-    }
-
     clip_context_params ctx_clip_params {
         /* use_gpu */ ctx_params.use_gpu,
         /* flash_attn_type */ CLIP_FLASH_ATTN_TYPE_AUTO,
         /* image_min_tokens */ ctx_params.image_min_tokens,
         /* image_max_tokens */ ctx_params.image_max_tokens,
         /* warmup */ ctx_params.warmup,
-        /* dsocr_mode */ dsocr_mode,
     };
 
     auto res = clip_init(mmproj_fname, ctx_clip_params);
diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h
index 007c0e9a45..0c2d001db6 100644
--- a/tools/mtmd/mtmd.h
+++ b/tools/mtmd/mtmd.h
@@ -87,9 +87,6 @@ struct mtmd_context_params {
     // limit number of image tokens, only for vision models with dynamic resolution
     int image_min_tokens; // minimum number of tokens for image input (default: read from metadata)
     int image_max_tokens; // maximum number of tokens for image input (default: read from metadata)
-
-    // DeepSeek-OCR resolution mode
-    const char * dsocr_mode; // one of: auto, tiny, small, base, large, gundam, gundam-master
 };
 
 MTMD_API const char * mtmd_default_marker(void);
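With the `--dsocr-mode` flag removed, DeepSeek-OCR always uses the automatic selection kept in `clip_image_preprocess`: it picks the native mode whose square area is closest to the input image's area. Below is a minimal standalone sketch of that selection; the resolution values and the `select_mode` helper are illustrative assumptions, the real `native_resolutions` table lives in clip.cpp.

```cpp
#include <cstdio>
#include <cstdlib>

// Illustrative values only; the actual native_resolutions table is defined in clip.cpp.
static const int native_resolutions[4] = {512, 640, 1024, 1280};

// Hypothetical helper mirroring the always-auto selection kept by this patch:
// choose the mode whose r*r area is closest to the original image area.
static int select_mode(int orig_w, int orig_h) {
    const int orig_area = orig_w * orig_h;
    int mode_i   = 0;
    int min_diff = orig_area;
    for (int i = 0; i < 4; i++) {
        const int r = native_resolutions[i];
        if (std::abs(orig_area - r*r) < min_diff) {
            mode_i   = i;
            min_diff = std::abs(orig_area - r*r);
        }
    }
    return mode_i;
}

int main() {
    printf("800x600   -> mode %d\n", select_mode(800, 600));    // nearest to 640^2
    printf("1600x1200 -> mode %d\n", select_mode(1600, 1200));  // nearest to 1280^2
    return 0;
}
```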