From 89afda8da90024aaf908448a2bb8dafee739934c Mon Sep 17 00:00:00 2001
From: Saba Fallah <10401143+sfallah@users.noreply.github.com>
Date: Tue, 18 Nov 2025 10:26:32 +0100
Subject: [PATCH] visual_model warmup (technically) works

---
 tools/mtmd/clip.cpp | 5 +++++
 tools/mtmd/clip.h   | 2 ++
 tools/mtmd/mtmd.cpp | 3 ++-
 3 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 99b5ab45d9..797f921f50 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -5412,6 +5412,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         case PROJECTOR_TYPE_VOXTRAL:
         case PROJECTOR_TYPE_JANUS_PRO:
         case PROJECTOR_TYPE_COGVLM:
+        case PROJECTOR_TYPE_DEEPSEEKOCR:
             {
                 // do nothing
             } break;
@@ -5554,6 +5555,10 @@ bool clip_is_gemma3(const struct clip_ctx * ctx) {
     return ctx->proj_type() == PROJECTOR_TYPE_GEMMA3;
 }
 
+bool clip_is_deepseekocr(const struct clip_ctx * ctx) {
+    return ctx->proj_type() == PROJECTOR_TYPE_DEEPSEEKOCR;
+}
+
 bool clip_has_vision_encoder(const struct clip_ctx * ctx) {
     return ctx->model.modality == CLIP_MODALITY_VISION;
 }
diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h
index 3e4c985f11..458ee98fc7 100644
--- a/tools/mtmd/clip.h
+++ b/tools/mtmd/clip.h
@@ -105,6 +105,8 @@ bool clip_is_glm(const struct clip_ctx * ctx);
 bool clip_is_qwen2vl(const struct clip_ctx * ctx);
 bool clip_is_llava(const struct clip_ctx * ctx);
 bool clip_is_gemma3(const struct clip_ctx * ctx);
+bool clip_is_deepseekocr(const struct clip_ctx * ctx);
+
 
 bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
 
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index e599137769..16349e8f40 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -810,7 +810,8 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
     if (clip_is_llava(ctx_clip)
             || clip_is_minicpmv(ctx_clip)
-            || clip_is_glm(ctx_clip)) {
+            || clip_is_glm(ctx_clip)
+            || clip_is_deepseekocr(ctx_clip)) {
         // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
         const auto & entries = image_tokens->batch_f32.entries;
         for (size_t i = 0; i < entries.size(); i++) {