mtmd : add --dsocr-mode CLI argument for DeepSeek-OCR resolution control; make all native resolution modes work

This commit is contained in:
bluebread 2025-11-30 16:57:19 +00:00
parent 55430945ef
commit c5f4c64fe4
9 changed files with 159 additions and 88 deletions

View File

@ -1824,6 +1824,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.image_max_tokens = value; params.image_max_tokens = value;
} }
).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_MAX_TOKENS")); ).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_MAX_TOKENS"));
// Register --dsocr-mode: selects the DeepSeek-OCR preprocessing resolution.
// The accepted values here must stay in sync with the string -> clip_dsocr_mode
// mapping in mtmd.cpp.
add_opt(common_arg(
    {"--dsocr-mode"}, "MODE",
    "DeepSeek-OCR resolution mode, one of:\n"
    "- auto (default): automatically select resolution\n"
    "- tiny, small, base, large: native resolution\n"
    "- gundam, gundam-master: dynamic resolution",
    [](common_params & params, const std::string & value) {
        if (value == "auto" || value == "tiny" || value == "small" || value == "base" ||
            value == "large" || value == "gundam" || value == "gundam-master") {
            params.dsocr_mode = value;
        } else {
            // include the bad value and the accepted set so the user can fix the invocation
            throw std::invalid_argument(
                "invalid --dsocr-mode value: '" + value +
                "' (expected one of: auto, tiny, small, base, large, gundam, gundam-master)");
        }
    }
).set_examples(mmproj_examples).set_env("LLAMA_ARG_DSOCR_MODE"));
if (llama_supports_rpc()) { if (llama_supports_rpc()) {
add_opt(common_arg( add_opt(common_arg(
{"--rpc"}, "SERVERS", {"--rpc"}, "SERVERS",

View File

@ -433,6 +433,7 @@ struct common_params {
std::vector<std::string> image; // path to image file(s) std::vector<std::string> image; // path to image file(s)
int image_min_tokens = -1; int image_min_tokens = -1;
int image_max_tokens = -1; int image_max_tokens = -1;
std::string dsocr_mode = "auto"; // DeepSeek-OCR resolution mode: auto, tiny, small, base, large, gundam, gundam-master
// finetune // finetune
struct lr_opt lr; struct lr_opt lr;

View File

@ -214,5 +214,7 @@ void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
upscale_f32_bicubic_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], upscale_f32_bicubic_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
src0->ne[0], src0->ne[1], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], src0->ne[0], src0->ne[1], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
sf0, sf1, sf2, sf3, pixel_offset, stream); sf0, sf1, sf2, sf3, pixel_offset, stream);
} else {
GGML_ABORT("fatal error");
} }
} }

View File

@ -569,7 +569,7 @@ static void print_tensor_data(ggml_tensor * t, uint8_t * data, int64_t n) {
printf(" ]\n"); printf(" ]\n");
} }
static void save_tensor_to_file(const struct ggml_tensor * tensor) { static void save_tensor_to_file(const struct ggml_tensor * tensor, const uint8_t * data_ptr) {
char filename[512]; char filename[512];
snprintf(filename, sizeof(filename), "%s_cpp.txt", tensor->name); snprintf(filename, sizeof(filename), "%s_cpp.txt", tensor->name);
@ -589,7 +589,7 @@ static void save_tensor_to_file(const struct ggml_tensor * tensor) {
(long long)total_elements); (long long)total_elements);
} }
uint8_t * data = (uint8_t *) tensor->data; const uint8_t * data = (data_ptr) ? data_ptr : (uint8_t *) tensor->data;
ggml_type type = tensor->type; ggml_type type = tensor->type;
const int64_t * ne = tensor->ne; const int64_t * ne = tensor->ne;
const size_t * nb = tensor->nb; const size_t * nb = tensor->nb;

View File

@ -193,8 +193,6 @@ struct clip_hparams {
int32_t attn_window_size = 0; int32_t attn_window_size = 0;
int32_t n_wa_pattern = 0; int32_t n_wa_pattern = 0;
bool crop_mode = false;
// audio // audio
int32_t n_mel_bins = 0; // whisper preprocessor int32_t n_mel_bins = 0; // whisper preprocessor
int32_t proj_stack_factor = 0; // ultravox int32_t proj_stack_factor = 0; // ultravox
@ -208,6 +206,9 @@ struct clip_hparams {
int32_t custom_image_min_tokens = -1; int32_t custom_image_min_tokens = -1;
int32_t custom_image_max_tokens = -1; int32_t custom_image_max_tokens = -1;
// DeepSeek-OCR resolution mode
enum clip_dsocr_mode dsocr_mode = clip_dsocr_mode::CLIP_DSOCR_MODE_AUTO;
void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) { void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) {
const int cur_merge = n_merge == 0 ? 1 : n_merge; const int cur_merge = n_merge == 0 ? 1 : n_merge;
const int patch_area = patch_size * patch_size * cur_merge * cur_merge; const int patch_area = patch_size * patch_size * cur_merge * cur_merge;
@ -512,6 +513,7 @@ struct clip_ctx {
if (ctx_params.image_max_tokens > 0) { if (ctx_params.image_max_tokens > 0) {
model.hparams.custom_image_max_tokens = ctx_params.image_max_tokens; model.hparams.custom_image_max_tokens = ctx_params.image_max_tokens;
} }
model.hparams.dsocr_mode = ctx_params.dsocr_mode;
backend_ptrs.push_back(backend_cpu); backend_ptrs.push_back(backend_cpu);
backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu)); backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu));
@ -3403,7 +3405,6 @@ struct clip_model_loader {
hparams.patch_size = 16; hparams.patch_size = 16;
hparams.image_size = 1024; hparams.image_size = 1024;
hparams.warmup_image_size = 1024; hparams.warmup_image_size = 1024;
hparams.crop_mode = false;
} break; } break;
default: default:
break; break;
@ -5054,9 +5055,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
} }
} break; } break;
case PROJECTOR_TYPE_DEEPSEEKOCR: case PROJECTOR_TYPE_DEEPSEEKOCR:
if (!params.crop_mode) { {
/* Native Resolution (Tiny/Small/Base/Large) */
const int native_resolutions[] = { const int native_resolutions[] = {
512 /* tiny */, 640 /* small */, 1024 /* base */, 1280 /* large */ 512 /* tiny */, 640 /* small */, 1024 /* base */, 1280 /* large */
}; };
@ -5070,10 +5069,25 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
color[i] = (int)(255 * params.image_mean[i]); color[i] = (int)(255 * params.image_mean[i]);
} }
// mode selection logic (find most suitable resolution)
int mode_i = 0; int mode_i = 0;
int min_diff = orig_area;
if (params.dsocr_mode == clip_dsocr_mode::CLIP_DSOCR_MODE_TINY) {
mode_i = 0;
} else if (params.dsocr_mode == clip_dsocr_mode::CLIP_DSOCR_MODE_SMALL) {
mode_i = 1;
} else if (params.dsocr_mode == clip_dsocr_mode::CLIP_DSOCR_MODE_BASE) {
mode_i = 2;
} else if (params.dsocr_mode == clip_dsocr_mode::CLIP_DSOCR_MODE_LARGE) {
mode_i = 3;
} else if (params.dsocr_mode == clip_dsocr_mode::CLIP_DSOCR_MODE_GUNDAM) {
mode_i = 4;
} else if (params.dsocr_mode == clip_dsocr_mode::CLIP_DSOCR_MODE_GUNDAM_MASTER) {
mode_i = 5;
} else {
if (params.dsocr_mode != clip_dsocr_mode::CLIP_DSOCR_MODE_AUTO) {
LOG_WRN("%s: unknown dsocr_mode, using auto mode\n", __func__);
}
int min_diff = orig_area;
for (int i = 0; i < 4; i++) { for (int i = 0; i < 4; i++) {
int r = native_resolutions[i]; int r = native_resolutions[i];
if (std::abs(orig_area - r*r) < min_diff) { if (std::abs(orig_area - r*r) < min_diff) {
@ -5081,13 +5095,13 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
min_diff = std::abs(orig_area - r*r); min_diff = std::abs(orig_area - r*r);
} }
} }
}
const int image_size = native_resolutions[mode_i];
if (mode_i < 2) { if (mode_i < 2) {
// TINY/SMALL MODE: Direct resize (no slicing) /* Native Resolution (Tiny/Small) */
// Just resize the image to image_size × image_size const int image_size = native_resolutions[mode_i];
// Just resize the image to image_size × image_size
clip_image_u8_ptr resized_img(clip_image_u8_init()); clip_image_u8_ptr resized_img(clip_image_u8_init());
img_tool::resize(*img, *resized_img, img_tool::resize(*img, *resized_img,
clip_image_size{image_size, image_size}, clip_image_size{image_size, image_size},
@ -5100,10 +5114,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
res_imgs->grid_x = 1; res_imgs->grid_x = 1;
res_imgs->grid_y = 1; res_imgs->grid_y = 1;
} }
else { else if (mode_i < 4) {
// BASE/LARGE MODE: Resize with aspect ratio + padding /* Native Resolution (Base/Large) */
// Resize maintaining aspect ratio, then pad to square const int image_size = native_resolutions[mode_i];
// Resize maintaining aspect ratio, then pad to square
float scale = std::min( float scale = std::min(
static_cast<float>(image_size) / orig_w, static_cast<float>(image_size) / orig_w,
static_cast<float>(image_size) / orig_h static_cast<float>(image_size) / orig_h
@ -5120,7 +5135,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
unsigned char pad_g = static_cast<unsigned char>(params.image_mean[1] * 255.0f); unsigned char pad_g = static_cast<unsigned char>(params.image_mean[1] * 255.0f);
unsigned char pad_b = static_cast<unsigned char>(params.image_mean[2] * 255.0f); unsigned char pad_b = static_cast<unsigned char>(params.image_mean[2] * 255.0f);
// Step 2: Pad to image_size × image_size (center padding) // Pad to image_size × image_size (center padding)
clip_image_u8_ptr padded_img(clip_image_u8_init()); clip_image_u8_ptr padded_img(clip_image_u8_init());
padded_img->nx = image_size; padded_img->nx = image_size;
padded_img->ny = image_size; padded_img->ny = image_size;
@ -5148,7 +5163,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
} }
} }
// Step 3: Normalize and output // Normalize and output
clip_image_f32_ptr res(clip_image_f32_init()); clip_image_f32_ptr res(clip_image_f32_init());
normalize_image_u8_to_f32(*padded_img, *res, params.image_mean, params.image_std); normalize_image_u8_to_f32(*padded_img, *res, params.image_mean, params.image_std);
res_imgs->entries.push_back(std::move(res)); res_imgs->entries.push_back(std::move(res));
@ -5156,9 +5171,9 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
res_imgs->grid_x = 1; res_imgs->grid_x = 1;
res_imgs->grid_y = 1; res_imgs->grid_y = 1;
} }
}
else { else {
/* Dynamic Resolution (Gundam/Gundam-M) */ GGML_ABORT("DeepSeek-OCR: Gundam/Gundam-Master haven't been tested yet.\n");
/* Dynamic Resolution (Gundam/Gundam-Master) */
// configurable, or read from params // configurable, or read from params
const int min_num = 2; const int min_num = 2;
@ -5219,6 +5234,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
res_imgs->grid_x = grid_cols; res_imgs->grid_x = grid_cols;
res_imgs->grid_y = grid_rows; res_imgs->grid_y = grid_rows;
} }
}
break; break;
@ -5807,7 +5823,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
for (auto & p : patterns) { for (auto & p : patterns) {
if (tname_s == p) { if (tname_s == p) {
save_tensor_to_file(t); save_tensor_to_file(t, data.data());
is_stored = true; is_stored = true;
break; break;
} }

View File

@ -29,11 +29,22 @@ enum clip_flash_attn_type {
CLIP_FLASH_ATTN_TYPE_ENABLED = 1, CLIP_FLASH_ATTN_TYPE_ENABLED = 1,
}; };
// DeepSeek-OCR resolution mode: controls how input images are preprocessed.
// The four native modes correspond to fixed square resolutions
// (512/640/1024/1280 px); the gundam modes use dynamic tiled resolution.
enum clip_dsocr_mode {
CLIP_DSOCR_MODE_AUTO, // pick the native resolution whose area is closest to the input image
CLIP_DSOCR_MODE_TINY, // 512 x 512, direct resize (no slicing)
CLIP_DSOCR_MODE_SMALL, // 640 x 640, direct resize (no slicing)
CLIP_DSOCR_MODE_BASE, // 1024 x 1024, aspect-preserving resize + center padding
CLIP_DSOCR_MODE_LARGE, // 1280 x 1280, aspect-preserving resize + center padding
CLIP_DSOCR_MODE_GUNDAM, // dynamic tiled resolution — currently aborts in preprocessing (untested)
CLIP_DSOCR_MODE_GUNDAM_MASTER, // dynamic tiled resolution — currently aborts in preprocessing (untested)
};
struct clip_context_params { struct clip_context_params {
bool use_gpu; bool use_gpu;
enum clip_flash_attn_type flash_attn_type; enum clip_flash_attn_type flash_attn_type;
int image_min_tokens; int image_min_tokens;
int image_max_tokens; int image_max_tokens;
enum clip_dsocr_mode dsocr_mode;
}; };
struct clip_init_result { struct clip_init_result {

View File

@ -138,6 +138,7 @@ struct mtmd_cli_context {
mparams.flash_attn_type = params.flash_attn_type; mparams.flash_attn_type = params.flash_attn_type;
mparams.image_min_tokens = params.image_min_tokens; mparams.image_min_tokens = params.image_min_tokens;
mparams.image_max_tokens = params.image_max_tokens; mparams.image_max_tokens = params.image_max_tokens;
mparams.dsocr_mode = params.dsocr_mode.c_str();
ctx_vision.reset(mtmd_init_from_file(clip_path, model, mparams)); ctx_vision.reset(mtmd_init_from_file(clip_path, model, mparams));
if (!ctx_vision.get()) { if (!ctx_vision.get()) {
LOG_ERR("Failed to load vision model from %s\n", clip_path); LOG_ERR("Failed to load vision model from %s\n", clip_path);

View File

@ -110,6 +110,7 @@ mtmd_context_params mtmd_context_params_default() {
/* flash_attn_type */ LLAMA_FLASH_ATTN_TYPE_AUTO, /* flash_attn_type */ LLAMA_FLASH_ATTN_TYPE_AUTO,
/* image_min_tokens */ -1, /* image_min_tokens */ -1,
/* image_max_tokens */ -1, /* image_max_tokens */ -1,
/* dsocr_mode */ "auto",
}; };
return params; return params;
} }
@ -172,11 +173,32 @@ struct mtmd_context {
throw std::runtime_error("media_marker must not be empty"); throw std::runtime_error("media_marker must not be empty");
} }
enum clip_dsocr_mode dsocr_mode;
if (std::string(ctx_params.dsocr_mode) == "auto") {
dsocr_mode = clip_dsocr_mode::CLIP_DSOCR_MODE_AUTO;
} else if (std::string(ctx_params.dsocr_mode) == "tiny") {
dsocr_mode = clip_dsocr_mode::CLIP_DSOCR_MODE_TINY;
} else if (std::string(ctx_params.dsocr_mode) == "small") {
dsocr_mode = clip_dsocr_mode::CLIP_DSOCR_MODE_SMALL;
} else if (std::string(ctx_params.dsocr_mode) == "base") {
dsocr_mode = clip_dsocr_mode::CLIP_DSOCR_MODE_BASE;
} else if (std::string(ctx_params.dsocr_mode) == "large") {
dsocr_mode = clip_dsocr_mode::CLIP_DSOCR_MODE_LARGE;
} else if (std::string(ctx_params.dsocr_mode) == "gundam") {
dsocr_mode = clip_dsocr_mode::CLIP_DSOCR_MODE_GUNDAM;
} else if (std::string(ctx_params.dsocr_mode) == "gundam-master") {
dsocr_mode = clip_dsocr_mode::CLIP_DSOCR_MODE_GUNDAM_MASTER;
} else {
throw std::invalid_argument("invalid value");
}
clip_context_params ctx_clip_params { clip_context_params ctx_clip_params {
/* use_gpu */ ctx_params.use_gpu, /* use_gpu */ ctx_params.use_gpu,
/* flash_attn_type */ CLIP_FLASH_ATTN_TYPE_AUTO, /* flash_attn_type */ CLIP_FLASH_ATTN_TYPE_AUTO,
/* image_min_tokens */ ctx_params.image_min_tokens, /* image_min_tokens */ ctx_params.image_min_tokens,
/* image_max_tokens */ ctx_params.image_max_tokens, /* image_max_tokens */ ctx_params.image_max_tokens,
/* dsocr_mode */ dsocr_mode,
}; };
auto res = clip_init(mmproj_fname, ctx_clip_params); auto res = clip_init(mmproj_fname, ctx_clip_params);

View File

@ -86,6 +86,9 @@ struct mtmd_context_params {
// limit number of image tokens, only for vision models with dynamic resolution // limit number of image tokens, only for vision models with dynamic resolution
int image_min_tokens; // minimum number of tokens for image input (default: read from metadata) int image_min_tokens; // minimum number of tokens for image input (default: read from metadata)
int image_max_tokens; // maximum number of tokens for image input (default: read from metadata) int image_max_tokens; // maximum number of tokens for image input (default: read from metadata)
// DeepSeek-OCR resolution mode
const char * dsocr_mode; // one of: auto, tiny, small, base, large, gundam, gundam-master
}; };
MTMD_API const char * mtmd_default_marker(void); MTMD_API const char * mtmd_default_marker(void);