mtmd: add mtmd_context_params::warmup option (#17652)
* mtmd: add mtmd_context_params::warmup option
* reuse the common_params::warmup
This commit is contained in:
parent
00c361fe53
commit
ecf74a8417
|
|
@ -3526,15 +3526,19 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
|
||||||
ctx_vision = new clip_ctx(ctx_params);
|
ctx_vision = new clip_ctx(ctx_params);
|
||||||
loader.load_hparams(ctx_vision->model, CLIP_MODALITY_VISION);
|
loader.load_hparams(ctx_vision->model, CLIP_MODALITY_VISION);
|
||||||
loader.load_tensors(*ctx_vision);
|
loader.load_tensors(*ctx_vision);
|
||||||
|
if (ctx_params.warmup) {
|
||||||
loader.warmup(*ctx_vision);
|
loader.warmup(*ctx_vision);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (loader.has_audio) {
|
if (loader.has_audio) {
|
||||||
ctx_audio = new clip_ctx(ctx_params);
|
ctx_audio = new clip_ctx(ctx_params);
|
||||||
loader.load_hparams(ctx_audio->model, CLIP_MODALITY_AUDIO);
|
loader.load_hparams(ctx_audio->model, CLIP_MODALITY_AUDIO);
|
||||||
loader.load_tensors(*ctx_audio);
|
loader.load_tensors(*ctx_audio);
|
||||||
|
if (ctx_params.warmup) {
|
||||||
loader.warmup(*ctx_audio);
|
loader.warmup(*ctx_audio);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
} catch (const std::exception & e) {
|
} catch (const std::exception & e) {
|
||||||
LOG_ERR("%s: failed to load model '%s': %s\n", __func__, fname, e.what());
|
LOG_ERR("%s: failed to load model '%s': %s\n", __func__, fname, e.what());
|
||||||
|
|
|
||||||
|
|
@ -34,6 +34,7 @@ struct clip_context_params {
|
||||||
enum clip_flash_attn_type flash_attn_type;
|
enum clip_flash_attn_type flash_attn_type;
|
||||||
int image_min_tokens;
|
int image_min_tokens;
|
||||||
int image_max_tokens;
|
int image_max_tokens;
|
||||||
|
bool warmup;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct clip_init_result {
|
struct clip_init_result {
|
||||||
|
|
|
||||||
|
|
@ -136,6 +136,7 @@ struct mtmd_cli_context {
|
||||||
mparams.print_timings = true;
|
mparams.print_timings = true;
|
||||||
mparams.n_threads = params.cpuparams.n_threads;
|
mparams.n_threads = params.cpuparams.n_threads;
|
||||||
mparams.flash_attn_type = params.flash_attn_type;
|
mparams.flash_attn_type = params.flash_attn_type;
|
||||||
|
mparams.warmup = params.warmup;
|
||||||
mparams.image_min_tokens = params.image_min_tokens;
|
mparams.image_min_tokens = params.image_min_tokens;
|
||||||
mparams.image_max_tokens = params.image_max_tokens;
|
mparams.image_max_tokens = params.image_max_tokens;
|
||||||
ctx_vision.reset(mtmd_init_from_file(clip_path, model, mparams));
|
ctx_vision.reset(mtmd_init_from_file(clip_path, model, mparams));
|
||||||
|
|
|
||||||
|
|
@ -108,6 +108,7 @@ mtmd_context_params mtmd_context_params_default() {
|
||||||
/* image_marker */ MTMD_DEFAULT_IMAGE_MARKER,
|
/* image_marker */ MTMD_DEFAULT_IMAGE_MARKER,
|
||||||
/* media_marker */ mtmd_default_marker(),
|
/* media_marker */ mtmd_default_marker(),
|
||||||
/* flash_attn_type */ LLAMA_FLASH_ATTN_TYPE_AUTO,
|
/* flash_attn_type */ LLAMA_FLASH_ATTN_TYPE_AUTO,
|
||||||
|
/* warmup */ true,
|
||||||
/* image_min_tokens */ -1,
|
/* image_min_tokens */ -1,
|
||||||
/* image_max_tokens */ -1,
|
/* image_max_tokens */ -1,
|
||||||
};
|
};
|
||||||
|
|
@ -177,6 +178,7 @@ struct mtmd_context {
|
||||||
/* flash_attn_type */ CLIP_FLASH_ATTN_TYPE_AUTO,
|
/* flash_attn_type */ CLIP_FLASH_ATTN_TYPE_AUTO,
|
||||||
/* image_min_tokens */ ctx_params.image_min_tokens,
|
/* image_min_tokens */ ctx_params.image_min_tokens,
|
||||||
/* image_max_tokens */ ctx_params.image_max_tokens,
|
/* image_max_tokens */ ctx_params.image_max_tokens,
|
||||||
|
/* warmup */ ctx_params.warmup,
|
||||||
};
|
};
|
||||||
|
|
||||||
auto res = clip_init(mmproj_fname, ctx_clip_params);
|
auto res = clip_init(mmproj_fname, ctx_clip_params);
|
||||||
|
|
|
||||||
|
|
@ -82,6 +82,7 @@ struct mtmd_context_params {
|
||||||
const char * image_marker; // deprecated, use media_marker instead
|
const char * image_marker; // deprecated, use media_marker instead
|
||||||
const char * media_marker;
|
const char * media_marker;
|
||||||
enum llama_flash_attn_type flash_attn_type;
|
enum llama_flash_attn_type flash_attn_type;
|
||||||
|
bool warmup; // whether to run a warmup encode pass after initialization
|
||||||
|
|
||||||
// limit number of image tokens, only for vision models with dynamic resolution
|
// limit number of image tokens, only for vision models with dynamic resolution
|
||||||
int image_min_tokens; // minimum number of tokens for image input (default: read from metadata)
|
int image_min_tokens; // minimum number of tokens for image input (default: read from metadata)
|
||||||
|
|
|
||||||
|
|
@ -621,6 +621,7 @@ struct server_context_impl {
|
||||||
mparams.print_timings = false;
|
mparams.print_timings = false;
|
||||||
mparams.n_threads = params_base.cpuparams.n_threads;
|
mparams.n_threads = params_base.cpuparams.n_threads;
|
||||||
mparams.flash_attn_type = params_base.flash_attn_type;
|
mparams.flash_attn_type = params_base.flash_attn_type;
|
||||||
|
mparams.warmup = params_base.warmup;
|
||||||
mparams.image_min_tokens = params_base.image_min_tokens;
|
mparams.image_min_tokens = params_base.image_min_tokens;
|
||||||
mparams.image_max_tokens = params_base.image_max_tokens;
|
mparams.image_max_tokens = params_base.image_max_tokens;
|
||||||
mctx = mtmd_init_from_file(mmproj_path.c_str(), model, mparams);
|
mctx = mtmd_init_from_file(mmproj_path.c_str(), model, mparams);
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue