From ecf74a841755fdf468d270a8727b6ed103d13344 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Mon, 1 Dec 2025 21:32:25 +0100 Subject: [PATCH] mtmd: add mtmd_context_params::warmup option (#17652) * mtmd: add mtmd_context_params::warmup option * reuse the common_params::warmup --- tools/mtmd/clip.cpp | 8 ++++++-- tools/mtmd/clip.h | 1 + tools/mtmd/mtmd-cli.cpp | 1 + tools/mtmd/mtmd.cpp | 2 ++ tools/mtmd/mtmd.h | 1 + tools/server/server-context.cpp | 1 + 6 files changed, 12 insertions(+), 2 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index d8222d8814..ea89259f92 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -3526,14 +3526,18 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params ctx_vision = new clip_ctx(ctx_params); loader.load_hparams(ctx_vision->model, CLIP_MODALITY_VISION); loader.load_tensors(*ctx_vision); - loader.warmup(*ctx_vision); + if (ctx_params.warmup) { + loader.warmup(*ctx_vision); + } } if (loader.has_audio) { ctx_audio = new clip_ctx(ctx_params); loader.load_hparams(ctx_audio->model, CLIP_MODALITY_AUDIO); loader.load_tensors(*ctx_audio); - loader.warmup(*ctx_audio); + if (ctx_params.warmup) { + loader.warmup(*ctx_audio); + } } } catch (const std::exception & e) { diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h index c1442afe6b..e8aeb2066c 100644 --- a/tools/mtmd/clip.h +++ b/tools/mtmd/clip.h @@ -34,6 +34,7 @@ struct clip_context_params { enum clip_flash_attn_type flash_attn_type; int image_min_tokens; int image_max_tokens; + bool warmup; }; struct clip_init_result { diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp index 6679de309b..b5bbc6536b 100644 --- a/tools/mtmd/mtmd-cli.cpp +++ b/tools/mtmd/mtmd-cli.cpp @@ -136,6 +136,7 @@ struct mtmd_cli_context { mparams.print_timings = true; mparams.n_threads = params.cpuparams.n_threads; mparams.flash_attn_type = params.flash_attn_type; + mparams.warmup = params.warmup; mparams.image_min_tokens = params.image_min_tokens; mparams.image_max_tokens = params.image_max_tokens; ctx_vision.reset(mtmd_init_from_file(clip_path, model, mparams)); diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 6690bf3004..d06fa42e61 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -108,6 +108,7 @@ mtmd_context_params mtmd_context_params_default() { /* image_marker */ MTMD_DEFAULT_IMAGE_MARKER, /* media_marker */ mtmd_default_marker(), /* flash_attn_type */ LLAMA_FLASH_ATTN_TYPE_AUTO, + /* warmup */ true, /* image_min_tokens */ -1, /* image_max_tokens */ -1, }; @@ -177,6 +178,7 @@ struct mtmd_context { /* flash_attn_type */ CLIP_FLASH_ATTN_TYPE_AUTO, /* image_min_tokens */ ctx_params.image_min_tokens, /* image_max_tokens */ ctx_params.image_max_tokens, + /* warmup */ ctx_params.warmup, }; auto res = clip_init(mmproj_fname, ctx_clip_params); diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h index 015119be89..b3df24c299 100644 --- a/tools/mtmd/mtmd.h +++ b/tools/mtmd/mtmd.h @@ -82,6 +82,7 @@ struct mtmd_context_params { const char * image_marker; // deprecated, use media_marker instead const char * media_marker; enum llama_flash_attn_type flash_attn_type; + bool warmup; // whether to run a warmup encode pass after initialization // limit number of image tokens, only for vision models with dynamic resolution int image_min_tokens; // minimum number of tokens for image input (default: read from metadata) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 2bf3924df9..e992db70f1 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -621,6 +621,7 @@ struct server_context_impl { mparams.print_timings = false; mparams.n_threads = params_base.cpuparams.n_threads; mparams.flash_attn_type = params_base.flash_attn_type; + mparams.warmup = params_base.warmup; mparams.image_min_tokens = params_base.image_min_tokens; mparams.image_max_tokens = params_base.image_max_tokens; mctx = mtmd_init_from_file(mmproj_path.c_str(), model, mparams);