visual_model warmup (technically) works

This commit is contained in:
Saba Fallah 2025-11-18 10:26:32 +01:00
parent 63a042f21e
commit 89afda8da9
3 changed files with 9 additions and 1 deletion

View File

@ -5412,6 +5412,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
case PROJECTOR_TYPE_VOXTRAL: case PROJECTOR_TYPE_VOXTRAL:
case PROJECTOR_TYPE_JANUS_PRO: case PROJECTOR_TYPE_JANUS_PRO:
case PROJECTOR_TYPE_COGVLM: case PROJECTOR_TYPE_COGVLM:
case PROJECTOR_TYPE_DEEPSEEKOCR:
{ {
// do nothing // do nothing
} break; } break;
@ -5554,6 +5555,10 @@ bool clip_is_gemma3(const struct clip_ctx * ctx) {
return ctx->proj_type() == PROJECTOR_TYPE_GEMMA3; return ctx->proj_type() == PROJECTOR_TYPE_GEMMA3;
} }
// Reports whether the projector attached to `ctx` is the DeepSeek-OCR one.
// Returns true only when ctx->proj_type() equals PROJECTOR_TYPE_DEEPSEEKOCR.
// `ctx` is dereferenced without a null check, so it must be a valid pointer.
bool clip_is_deepseekocr(const struct clip_ctx * ctx) {
    const bool is_deepseekocr = (ctx->proj_type() == PROJECTOR_TYPE_DEEPSEEKOCR);
    return is_deepseekocr;
}
bool clip_has_vision_encoder(const struct clip_ctx * ctx) { bool clip_has_vision_encoder(const struct clip_ctx * ctx) {
return ctx->model.modality == CLIP_MODALITY_VISION; return ctx->model.modality == CLIP_MODALITY_VISION;
} }

View File

@ -105,6 +105,8 @@ bool clip_is_glm(const struct clip_ctx * ctx);
bool clip_is_qwen2vl(const struct clip_ctx * ctx); bool clip_is_qwen2vl(const struct clip_ctx * ctx);
bool clip_is_llava(const struct clip_ctx * ctx); bool clip_is_llava(const struct clip_ctx * ctx);
bool clip_is_gemma3(const struct clip_ctx * ctx); bool clip_is_gemma3(const struct clip_ctx * ctx);
bool clip_is_deepseekocr(const struct clip_ctx * ctx);
bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec); bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);

View File

@ -810,7 +810,8 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
if (clip_is_llava(ctx_clip) if (clip_is_llava(ctx_clip)
|| clip_is_minicpmv(ctx_clip) || clip_is_minicpmv(ctx_clip)
|| clip_is_glm(ctx_clip)) { || clip_is_glm(ctx_clip)
|| clip_is_deepseekocr(ctx_clip)) {
// TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode() // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
const auto & entries = image_tokens->batch_f32.entries; const auto & entries = image_tokens->batch_f32.entries;
for (size_t i = 0; i < entries.size(); i++) { for (size_t i = 0; i < entries.size(); i++) {