visual_model warmup (technically) works
This commit is contained in:
parent
63a042f21e
commit
89afda8da9
|
|
@ -5412,6 +5412,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||||
case PROJECTOR_TYPE_VOXTRAL:
|
case PROJECTOR_TYPE_VOXTRAL:
|
||||||
case PROJECTOR_TYPE_JANUS_PRO:
|
case PROJECTOR_TYPE_JANUS_PRO:
|
||||||
case PROJECTOR_TYPE_COGVLM:
|
case PROJECTOR_TYPE_COGVLM:
|
||||||
|
case PROJECTOR_TYPE_DEEPSEEKOCR:
|
||||||
{
|
{
|
||||||
// do nothing
|
// do nothing
|
||||||
} break;
|
} break;
|
||||||
|
|
@ -5554,6 +5555,10 @@ bool clip_is_gemma3(const struct clip_ctx * ctx) {
|
||||||
return ctx->proj_type() == PROJECTOR_TYPE_GEMMA3;
|
return ctx->proj_type() == PROJECTOR_TYPE_GEMMA3;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool clip_is_deepseekocr(const struct clip_ctx * ctx) {
|
||||||
|
return ctx->proj_type() == PROJECTOR_TYPE_DEEPSEEKOCR;
|
||||||
|
}
|
||||||
|
|
||||||
bool clip_has_vision_encoder(const struct clip_ctx * ctx) {
|
bool clip_has_vision_encoder(const struct clip_ctx * ctx) {
|
||||||
return ctx->model.modality == CLIP_MODALITY_VISION;
|
return ctx->model.modality == CLIP_MODALITY_VISION;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -105,6 +105,8 @@ bool clip_is_glm(const struct clip_ctx * ctx);
|
||||||
bool clip_is_qwen2vl(const struct clip_ctx * ctx);
|
bool clip_is_qwen2vl(const struct clip_ctx * ctx);
|
||||||
bool clip_is_llava(const struct clip_ctx * ctx);
|
bool clip_is_llava(const struct clip_ctx * ctx);
|
||||||
bool clip_is_gemma3(const struct clip_ctx * ctx);
|
bool clip_is_gemma3(const struct clip_ctx * ctx);
|
||||||
|
bool clip_is_deepseekocr(const struct clip_ctx * ctx);
|
||||||
|
|
||||||
|
|
||||||
bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
|
bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -810,7 +810,8 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
|
||||||
|
|
||||||
if (clip_is_llava(ctx_clip)
|
if (clip_is_llava(ctx_clip)
|
||||||
|| clip_is_minicpmv(ctx_clip)
|
|| clip_is_minicpmv(ctx_clip)
|
||||||
|| clip_is_glm(ctx_clip)) {
|
|| clip_is_glm(ctx_clip)
|
||||||
|
|| clip_is_deepseekocr(ctx_clip)) {
|
||||||
// TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
|
// TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
|
||||||
const auto & entries = image_tokens->batch_f32.entries;
|
const auto & entries = image_tokens->batch_f32.entries;
|
||||||
for (size_t i = 0; i < entries.size(); i++) {
|
for (size_t i = 0; i < entries.size(); i++) {
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue