mtmd: remove --dsocr-mode argument
This commit is contained in:
parent
43dfc0c8d6
commit
b696c54756
|
|
@ -1829,21 +1829,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||
params.image_max_tokens = value;
|
||||
}
|
||||
).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_MAX_TOKENS"));
|
||||
add_opt(common_arg(
|
||||
{"--dsocr-mode"}, "MODE",
|
||||
"DeepSeek-OCR resolution mode, one of:\n"
|
||||
"- auto (default): automatically select resolution\n"
|
||||
"- tiny, small, base, large: native resolution\n"
|
||||
"- gundam, gundam-master: dynamic resolution",
|
||||
[](common_params & params, const std::string & value) {
|
||||
if (value == "auto" || value == "tiny" || value == "small" || value == "base" ||
|
||||
value == "large" || value == "gundam" || value == "gundam-master") {
|
||||
params.dsocr_mode = value;
|
||||
} else {
|
||||
throw std::invalid_argument("invalid value");
|
||||
}
|
||||
}
|
||||
).set_examples(mmproj_examples).set_env("LLAMA_ARG_DSOCR_MODE"));
|
||||
if (llama_supports_rpc()) {
|
||||
add_opt(common_arg(
|
||||
{"--rpc"}, "SERVERS",
|
||||
|
|
|
|||
|
|
@ -432,7 +432,6 @@ struct common_params {
|
|||
std::vector<std::string> image; // path to image file(s)
|
||||
int image_min_tokens = -1;
|
||||
int image_max_tokens = -1;
|
||||
std::string dsocr_mode = "auto"; // DeepSeek-OCR resolution mode: auto, tiny, small, base, large, gundam, gundam-master
|
||||
|
||||
// finetune
|
||||
struct lr_opt lr;
|
||||
|
|
|
|||
|
|
@ -206,9 +206,6 @@ struct clip_hparams {
|
|||
int32_t custom_image_min_tokens = -1;
|
||||
int32_t custom_image_max_tokens = -1;
|
||||
|
||||
// DeepSeek-OCR resolution mode
|
||||
enum clip_dsocr_mode dsocr_mode = clip_dsocr_mode::CLIP_DSOCR_MODE_AUTO;
|
||||
|
||||
void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) {
|
||||
const int cur_merge = n_merge == 0 ? 1 : n_merge;
|
||||
const int patch_area = patch_size * patch_size * cur_merge * cur_merge;
|
||||
|
|
@ -513,7 +510,6 @@ struct clip_ctx {
|
|||
if (ctx_params.image_max_tokens > 0) {
|
||||
model.hparams.custom_image_max_tokens = ctx_params.image_max_tokens;
|
||||
}
|
||||
model.hparams.dsocr_mode = ctx_params.dsocr_mode;
|
||||
|
||||
backend_ptrs.push_back(backend_cpu);
|
||||
backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu));
|
||||
|
|
@ -5291,30 +5287,13 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
|
|||
}
|
||||
|
||||
int mode_i = 0;
|
||||
|
||||
if (params.dsocr_mode == clip_dsocr_mode::CLIP_DSOCR_MODE_TINY) {
|
||||
mode_i = 0;
|
||||
} else if (params.dsocr_mode == clip_dsocr_mode::CLIP_DSOCR_MODE_SMALL) {
|
||||
mode_i = 1;
|
||||
} else if (params.dsocr_mode == clip_dsocr_mode::CLIP_DSOCR_MODE_BASE) {
|
||||
mode_i = 2;
|
||||
} else if (params.dsocr_mode == clip_dsocr_mode::CLIP_DSOCR_MODE_LARGE) {
|
||||
mode_i = 3;
|
||||
} else if (params.dsocr_mode == clip_dsocr_mode::CLIP_DSOCR_MODE_GUNDAM) {
|
||||
mode_i = 4;
|
||||
} else if (params.dsocr_mode == clip_dsocr_mode::CLIP_DSOCR_MODE_GUNDAM_MASTER) {
|
||||
mode_i = 5;
|
||||
} else {
|
||||
if (params.dsocr_mode != clip_dsocr_mode::CLIP_DSOCR_MODE_AUTO) {
|
||||
LOG_WRN("%s: unknown dsocr_mode, using auto mode\n", __func__);
|
||||
}
|
||||
int min_diff = orig_area;
|
||||
for (int i = 0; i < 4; i++) {
|
||||
int r = native_resolutions[i];
|
||||
if (std::abs(orig_area - r*r) < min_diff) {
|
||||
mode_i = i;
|
||||
min_diff = std::abs(orig_area - r*r);
|
||||
}
|
||||
int min_diff = orig_area;
|
||||
|
||||
for (int i = 0; i < 4; i++) {
|
||||
int r = native_resolutions[i];
|
||||
if (std::abs(orig_area - r*r) < min_diff) {
|
||||
mode_i = i;
|
||||
min_diff = std::abs(orig_area - r*r);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -5393,7 +5372,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
|
|||
res_imgs->grid_y = 1;
|
||||
}
|
||||
else {
|
||||
GGML_ABORT("DeepSeek-OCR: Gundam/Gundam-Master haven't been tested yet.\n");
|
||||
GGML_ABORT("DeepSeek-OCR hasn't supported Gundam/Gundam-Master yet");
|
||||
/* Dynamic Resolution (Gundam/Gundam-Master) */
|
||||
|
||||
// configurable, or read from params
|
||||
|
|
|
|||
|
|
@ -29,23 +29,12 @@ enum clip_flash_attn_type {
|
|||
CLIP_FLASH_ATTN_TYPE_ENABLED = 1,
|
||||
};
|
||||
|
||||
// DeepSeek-OCR resolution mode, carried in clip_context_params::dsocr_mode.
// The CLI help text for --dsocr-mode groups the values as:
//   auto                      - automatically select resolution
//   tiny, small, base, large  - native resolution
//   gundam, gundam-master     - dynamic resolution
enum clip_dsocr_mode {
|
||||
CLIP_DSOCR_MODE_AUTO, // "auto": resolution is selected automatically
|
||||
CLIP_DSOCR_MODE_TINY, // "tiny": native resolution
|
||||
CLIP_DSOCR_MODE_SMALL, // "small": native resolution
|
||||
CLIP_DSOCR_MODE_BASE, // "base": native resolution
|
||||
CLIP_DSOCR_MODE_LARGE, // "large": native resolution
|
||||
CLIP_DSOCR_MODE_GUNDAM, // "gundam": dynamic resolution; NOTE(review): preprocessing appears to abort for this mode - confirm
|
||||
CLIP_DSOCR_MODE_GUNDAM_MASTER, // "gundam-master": dynamic resolution; NOTE(review): same caveat as GUNDAM - confirm
|
||||
};
|
||||
|
||||
// Options handed to clip_init() when creating a clip context.
// Callers fill this positionally (aggregate initialization), so the field
// order below is part of the contract.
struct clip_context_params {
|
||||
bool use_gpu;
|
||||
enum clip_flash_attn_type flash_attn_type; // one of the clip_flash_attn_type values
|
||||
int image_min_tokens; // NOTE(review): presumably handled like image_max_tokens below - confirm
|
||||
int image_max_tokens; // when > 0, copied into hparams.custom_image_max_tokens
|
||||
bool warmup;
|
||||
enum clip_dsocr_mode dsocr_mode; // DeepSeek-OCR resolution mode; copied into hparams.dsocr_mode
|
||||
};
|
||||
|
||||
struct clip_init_result {
|
||||
|
|
|
|||
|
|
@ -139,7 +139,6 @@ struct mtmd_cli_context {
|
|||
mparams.warmup = params.warmup;
|
||||
mparams.image_min_tokens = params.image_min_tokens;
|
||||
mparams.image_max_tokens = params.image_max_tokens;
|
||||
mparams.dsocr_mode = params.dsocr_mode.c_str();
|
||||
ctx_vision.reset(mtmd_init_from_file(clip_path, model, mparams));
|
||||
if (!ctx_vision.get()) {
|
||||
LOG_ERR("Failed to load vision model from %s\n", clip_path);
|
||||
|
|
|
|||
|
|
@ -111,7 +111,6 @@ mtmd_context_params mtmd_context_params_default() {
|
|||
/* warmup */ true,
|
||||
/* image_min_tokens */ -1,
|
||||
/* image_max_tokens */ -1,
|
||||
/* dsocr_mode */ "auto",
|
||||
};
|
||||
return params;
|
||||
}
|
||||
|
|
@ -174,33 +173,12 @@ struct mtmd_context {
|
|||
throw std::runtime_error("media_marker must not be empty");
|
||||
}
|
||||
|
||||
enum clip_dsocr_mode dsocr_mode;
|
||||
|
||||
if (std::string(ctx_params.dsocr_mode) == "auto") {
|
||||
dsocr_mode = clip_dsocr_mode::CLIP_DSOCR_MODE_AUTO;
|
||||
} else if (std::string(ctx_params.dsocr_mode) == "tiny") {
|
||||
dsocr_mode = clip_dsocr_mode::CLIP_DSOCR_MODE_TINY;
|
||||
} else if (std::string(ctx_params.dsocr_mode) == "small") {
|
||||
dsocr_mode = clip_dsocr_mode::CLIP_DSOCR_MODE_SMALL;
|
||||
} else if (std::string(ctx_params.dsocr_mode) == "base") {
|
||||
dsocr_mode = clip_dsocr_mode::CLIP_DSOCR_MODE_BASE;
|
||||
} else if (std::string(ctx_params.dsocr_mode) == "large") {
|
||||
dsocr_mode = clip_dsocr_mode::CLIP_DSOCR_MODE_LARGE;
|
||||
} else if (std::string(ctx_params.dsocr_mode) == "gundam") {
|
||||
dsocr_mode = clip_dsocr_mode::CLIP_DSOCR_MODE_GUNDAM;
|
||||
} else if (std::string(ctx_params.dsocr_mode) == "gundam-master") {
|
||||
dsocr_mode = clip_dsocr_mode::CLIP_DSOCR_MODE_GUNDAM_MASTER;
|
||||
} else {
|
||||
throw std::invalid_argument("invalid value");
|
||||
}
|
||||
|
||||
clip_context_params ctx_clip_params {
|
||||
/* use_gpu */ ctx_params.use_gpu,
|
||||
/* flash_attn_type */ CLIP_FLASH_ATTN_TYPE_AUTO,
|
||||
/* image_min_tokens */ ctx_params.image_min_tokens,
|
||||
/* image_max_tokens */ ctx_params.image_max_tokens,
|
||||
/* warmup */ ctx_params.warmup,
|
||||
/* dsocr_mode */ dsocr_mode,
|
||||
};
|
||||
|
||||
auto res = clip_init(mmproj_fname, ctx_clip_params);
|
||||
|
|
|
|||
|
|
@ -87,9 +87,6 @@ struct mtmd_context_params {
|
|||
// limit number of image tokens, only for vision models with dynamic resolution
|
||||
int image_min_tokens; // minimum number of tokens for image input (default: read from metadata)
|
||||
int image_max_tokens; // maximum number of tokens for image input (default: read from metadata)
|
||||
|
||||
// DeepSeek-OCR resolution mode
|
||||
const char * dsocr_mode; // one of: auto, tiny, small, base, large, gundam, gundam-master
|
||||
};
|
||||
|
||||
MTMD_API const char * mtmd_default_marker(void);
|
||||
|
|
|
|||
Loading…
Reference in New Issue