visual_model warmup (technically) works

2025-11-18 10:26:32 +01:00 · 2025-11-18 10:26:32 +01:00 · 89afda8da9
parent 63a042f21e
commit 89afda8da9
3 changed files with 9 additions and 1 deletion
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -5412,6 +5412,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
        case PROJECTOR_TYPE_VOXTRAL:
        case PROJECTOR_TYPE_JANUS_PRO:
        case PROJECTOR_TYPE_COGVLM:
        case PROJECTOR_TYPE_DEEPSEEKOCR:
            {
                // do nothing
            } break;
@@ -5554,6 +5555,10 @@ bool clip_is_gemma3(const struct clip_ctx * ctx) {
    return ctx->proj_type() == PROJECTOR_TYPE_GEMMA3;
 }
 // Returns true iff ctx->proj_type() equals PROJECTOR_TYPE_DEEPSEEKOCR.
 // ctx is dereferenced unconditionally, so callers must pass a non-null context.
 bool clip_is_deepseekocr(const struct clip_ctx * ctx) {
    return ctx->proj_type() == PROJECTOR_TYPE_DEEPSEEKOCR;
 }
 // Returns true iff ctx->model.modality equals CLIP_MODALITY_VISION.
 // ctx is dereferenced unconditionally, so callers must pass a non-null context.
 bool clip_has_vision_encoder(const struct clip_ctx * ctx) {
    return ctx->model.modality == CLIP_MODALITY_VISION;
 }
--- a/tools/mtmd/clip.h
+++ b/tools/mtmd/clip.h
@@ -105,6 +105,8 @@ bool clip_is_glm(const struct clip_ctx * ctx);
 bool clip_is_qwen2vl(const struct clip_ctx * ctx);
 bool clip_is_llava(const struct clip_ctx * ctx);
 bool clip_is_gemma3(const struct clip_ctx * ctx);
 bool clip_is_deepseekocr(const struct clip_ctx * ctx);
 bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -810,7 +810,8 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
    if (clip_is_llava(ctx_clip)
        || clip_is_minicpmv(ctx_clip)
-        || clip_is_glm(ctx_clip)) {
+        || clip_is_glm(ctx_clip)
        || clip_is_deepseekocr(ctx_clip)) {
        // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
        const auto & entries = image_tokens->batch_f32.entries;
        for (size_t i = 0; i < entries.size(); i++) {