diff --git a/common/arg.cpp b/common/arg.cpp
index de80f293f4..52094e3f10 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1829,21 +1829,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.image_max_tokens = value;
         }
     ).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_MAX_TOKENS"));
-    add_opt(common_arg(
-        {"--dsocr-mode"}, "MODE",
-        "DeepSeek-OCR resolution mode, one of:\n"
-        "- auto (default): automatically select resolution\n"
-        "- tiny, small, base, large: native resolution\n"
-        "- gundam, gundam-master: dynamic resolution",
-        [](common_params & params, const std::string & value) {
-            if (value == "auto" || value == "tiny" || value == "small" || value == "base" ||
-                value == "large" || value == "gundam" || value == "gundam-master") {
-                params.dsocr_mode = value;
-            } else {
-                throw std::invalid_argument("invalid value");
-            }
-        }
-    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_DSOCR_MODE"));
     if (llama_supports_rpc()) {
         add_opt(common_arg(
             {"--rpc"}, "SERVERS",
diff --git a/common/common.h b/common/common.h
index ce1137c5bb..cdca5e26a2 100644
--- a/common/common.h
+++ b/common/common.h
@@ -432,7 +432,6 @@ struct common_params {
     std::vector<std::string> image; // path to image file(s)
     int image_min_tokens = -1;
    int image_max_tokens = -1;
-    std::string dsocr_mode = "auto"; // DeepSeek-OCR resolution mode: auto, tiny, small, base, large, gundam, gundam-master
 
     // finetune
     struct lr_opt lr;
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 69c20ee0bc..ded8721199 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -206,9 +206,6 @@ struct clip_hparams {
     int32_t custom_image_min_tokens = -1;
     int32_t custom_image_max_tokens = -1;
 
-    // DeepSeek-OCR resolution mode
-    enum clip_dsocr_mode dsocr_mode = clip_dsocr_mode::CLIP_DSOCR_MODE_AUTO;
-
     void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) {
         const int cur_merge = n_merge == 0 ? 1 : n_merge;
         const int patch_area = patch_size * patch_size * cur_merge * cur_merge;
@@ -513,7 +510,6 @@ struct clip_ctx {
         if (ctx_params.image_max_tokens > 0) {
             model.hparams.custom_image_max_tokens = ctx_params.image_max_tokens;
         }
-        model.hparams.dsocr_mode = ctx_params.dsocr_mode;
 
         backend_ptrs.push_back(backend_cpu);
         backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu));
@@ -5291,30 +5287,13 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
     }
 
     int mode_i = 0;
-
-    if (params.dsocr_mode == clip_dsocr_mode::CLIP_DSOCR_MODE_TINY) {
-        mode_i = 0;
-    } else if (params.dsocr_mode == clip_dsocr_mode::CLIP_DSOCR_MODE_SMALL) {
-        mode_i = 1;
-    } else if (params.dsocr_mode == clip_dsocr_mode::CLIP_DSOCR_MODE_BASE) {
-        mode_i = 2;
-    } else if (params.dsocr_mode == clip_dsocr_mode::CLIP_DSOCR_MODE_LARGE) {
-        mode_i = 3;
-    } else if (params.dsocr_mode == clip_dsocr_mode::CLIP_DSOCR_MODE_GUNDAM) {
-        mode_i = 4;
-    } else if (params.dsocr_mode == clip_dsocr_mode::CLIP_DSOCR_MODE_GUNDAM_MASTER) {
-        mode_i = 5;
-    } else {
-        if (params.dsocr_mode != clip_dsocr_mode::CLIP_DSOCR_MODE_AUTO) {
-            LOG_WRN("%s: unknown dsocr_mode, using auto mode\n", __func__);
-        }
-        int min_diff = orig_area;
-        for (int i = 0; i < 4; i++) {
-            int r = native_resolutions[i];
-            if (std::abs(orig_area - r*r) < min_diff) {
-                mode_i = i;
-                min_diff = std::abs(orig_area - r*r);
-            }
+    int min_diff = orig_area;
+
+    for (int i = 0; i < 4; i++) {
+        int r = native_resolutions[i];
+        if (std::abs(orig_area - r*r) < min_diff) {
+            mode_i = i;
+            min_diff = std::abs(orig_area - r*r);
         }
     }
 
@@ -5393,7 +5372,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         res_imgs->grid_y = 1;
 
     } else {
-        GGML_ABORT("DeepSeek-OCR: Gundam/Gundam-Master haven't been tested yet.\n");
+        GGML_ABORT("DeepSeek-OCR: Gundam/Gundam-Master modes are not supported yet");
 
         /* Dynamic Resolution (Gundam/Gundam-Master) */
         // configurable, or read from params
diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h
index 33689b76de..e4f6566e15 100644
--- a/tools/mtmd/clip.h
+++ b/tools/mtmd/clip.h
@@ -29,23 +29,12 @@ enum clip_flash_attn_type {
     CLIP_FLASH_ATTN_TYPE_ENABLED = 1,
 };
 
-enum clip_dsocr_mode {
-    CLIP_DSOCR_MODE_AUTO,
-    CLIP_DSOCR_MODE_TINY,
-    CLIP_DSOCR_MODE_SMALL,
-    CLIP_DSOCR_MODE_BASE,
-    CLIP_DSOCR_MODE_LARGE,
-    CLIP_DSOCR_MODE_GUNDAM,
-    CLIP_DSOCR_MODE_GUNDAM_MASTER,
-};
-
 struct clip_context_params {
     bool use_gpu;
     enum clip_flash_attn_type flash_attn_type;
     int image_min_tokens;
     int image_max_tokens;
     bool warmup;
-    enum clip_dsocr_mode dsocr_mode;
 };
 
 struct clip_init_result {
diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp
index b9470d5c5c..ad38edc053 100644
--- a/tools/mtmd/mtmd-cli.cpp
+++ b/tools/mtmd/mtmd-cli.cpp
@@ -139,7 +139,6 @@ struct mtmd_cli_context {
         mparams.warmup = params.warmup;
         mparams.image_min_tokens = params.image_min_tokens;
         mparams.image_max_tokens = params.image_max_tokens;
-        mparams.dsocr_mode = params.dsocr_mode.c_str();
         ctx_vision.reset(mtmd_init_from_file(clip_path, model, mparams));
         if (!ctx_vision.get()) {
             LOG_ERR("Failed to load vision model from %s\n", clip_path);
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index 57c2970967..2c20af099b 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -111,7 +111,6 @@ mtmd_context_params mtmd_context_params_default() {
         /* warmup */ true,
         /* image_min_tokens */ -1,
         /* image_max_tokens */ -1,
-        /* dsocr_mode */ "auto",
     };
     return params;
 }
@@ -174,33 +173,12 @@ struct mtmd_context {
         throw std::runtime_error("media_marker must not be empty");
     }
 
-    enum clip_dsocr_mode dsocr_mode;
-
-    if (std::string(ctx_params.dsocr_mode) == "auto") {
-        dsocr_mode = clip_dsocr_mode::CLIP_DSOCR_MODE_AUTO;
-    } else if (std::string(ctx_params.dsocr_mode) == "tiny") {
-        dsocr_mode = clip_dsocr_mode::CLIP_DSOCR_MODE_TINY;
-    } else if (std::string(ctx_params.dsocr_mode) == "small") {
-        dsocr_mode = clip_dsocr_mode::CLIP_DSOCR_MODE_SMALL;
-    } else if (std::string(ctx_params.dsocr_mode) == "base") {
-        dsocr_mode = clip_dsocr_mode::CLIP_DSOCR_MODE_BASE;
-    } else if (std::string(ctx_params.dsocr_mode) == "large") {
-        dsocr_mode = clip_dsocr_mode::CLIP_DSOCR_MODE_LARGE;
-    } else if (std::string(ctx_params.dsocr_mode) == "gundam") {
-        dsocr_mode = clip_dsocr_mode::CLIP_DSOCR_MODE_GUNDAM;
-    } else if (std::string(ctx_params.dsocr_mode) == "gundam-master") {
-        dsocr_mode = clip_dsocr_mode::CLIP_DSOCR_MODE_GUNDAM_MASTER;
-    } else {
-        throw std::invalid_argument("invalid value");
-    }
-
     clip_context_params ctx_clip_params {
         /* use_gpu */ ctx_params.use_gpu,
         /* flash_attn_type */ CLIP_FLASH_ATTN_TYPE_AUTO,
         /* image_min_tokens */ ctx_params.image_min_tokens,
         /* image_max_tokens */ ctx_params.image_max_tokens,
         /* warmup */ ctx_params.warmup,
-        /* dsocr_mode */ dsocr_mode,
     };
 
     auto res = clip_init(mmproj_fname, ctx_clip_params);
diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h
index 007c0e9a45..0c2d001db6 100644
--- a/tools/mtmd/mtmd.h
+++ b/tools/mtmd/mtmd.h
@@ -87,9 +87,6 @@ struct mtmd_context_params {
     // limit number of image tokens, only for vision models with dynamic resolution
     int image_min_tokens; // minimum number of tokens for image input (default: read from metadata)
     int image_max_tokens; // maximum number of tokens for image input (default: read from metadata)
-
-    // DeepSeek-OCR resolution mode
-    const char * dsocr_mode; // one of: auto, tiny, small, base, large, gundam, gundam-master
 };
 
 MTMD_API const char * mtmd_default_marker(void);
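With the `--dsocr-mode` flag removed, DeepSeek-OCR always uses the automatic selection kept in `clip_image_preprocess`: it picks the native mode whose square area is closest to the input image's area. Below is a minimal standalone sketch of that selection; the resolution values and the `select_mode` helper are illustrative assumptions, the real `native_resolutions` table lives in clip.cpp.

```cpp
#include <cstdio>
#include <cstdlib>

// Illustrative values only; the actual native_resolutions table is defined in clip.cpp.
static const int native_resolutions[4] = {512, 640, 1024, 1280};

// Hypothetical helper mirroring the always-auto selection kept by this patch:
// choose the mode whose r*r area is closest to the original image area.
static int select_mode(int orig_w, int orig_h) {
    const int orig_area = orig_w * orig_h;
    int mode_i   = 0;
    int min_diff = orig_area;
    for (int i = 0; i < 4; i++) {
        const int r = native_resolutions[i];
        if (std::abs(orig_area - r*r) < min_diff) {
            mode_i   = i;
            min_diff = std::abs(orig_area - r*r);
        }
    }
    return mode_i;
}

int main() {
    printf("800x600   -> mode %d\n", select_mode(800, 600));    // nearest to 640^2
    printf("1600x1200 -> mode %d\n", select_mode(1600, 1200));  // nearest to 1280^2
    return 0;
}
```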