From 9e39a1e6a991331bfa02390784eaa1ea226f1d4b Mon Sep 17 00:00:00 2001
From: Xuan-Son Nguyen
Date: Sat, 20 Dec 2025 09:25:27 +0100
Subject: [PATCH] server: support load model on startup, support preset-only options (#18206)

* server: support autoload model, support preset-only options

* add docs

* load-on-startup

* fix

* Update common/arg.cpp

Co-authored-by: Pascal

---------

Co-authored-by: Pascal
---
Review notes (this area is ignored by `git am`; it is not part of the commit message):

- tools/server/server-models.cpp: a model is queued for startup loading only
  when its `load-on-startup` value is NOT recognised by
  common_arg_utils::is_falsey(). Previously the mere presence of the key
  triggered the load, so `load-on-startup = false` still loaded the model,
  contradicting the "(boolean)" wording in tools/server/README.md.
  The `index` line of that file still carries the hash from before this edit.
- This patch was recovered from a copy whose line breaks, indentation and
  `<...>` contents had been stripped. Template arguments on added lines
  (std::vector<common_arg>, std::vector<std::string>) follow from how the
  values are used. Template arguments and the two includes on CONTEXT lines
  (<vector>, <cstring>, std::initializer_list<enum llama_example>,
  std::map<common_arg, std::string>, std::unordered_set<std::string>,
  std::optional/std::vector<server_model_meta>) are assumed - TODO confirm
  against the tree before applying.
- Column alignment inside context lines could not be recovered; if a hunk is
  rejected, retry with `git apply --ignore-whitespace`.

 common/arg.cpp                 | 26 ++++++++++++++++++++++++++
 common/arg.h                   | 11 ++++++++++-
 common/preset.cpp              | 15 ++++++++++++---
 tests/test-arg-parser.cpp      |  1 +
 tools/server/README.md         |  3 +++
 tools/server/server-models.cpp | 20 ++++++++++++++++++++
 tools/server/server-models.h   | 14 ++++++++------
 7 files changed, 80 insertions(+), 10 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index ae3b8b46ca..476bc0084a 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -96,6 +96,11 @@ common_arg & common_arg::set_sparam() {
     return *this;
 }
 
+common_arg & common_arg::set_preset_only() {
+    is_preset_only = true;
+    return *this;
+}
+
 bool common_arg::in_example(enum llama_example ex) {
     return examples.find(ex) != examples.end();
 }
@@ -3494,3 +3499,24 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
 
     return ctx_arg;
 }
+
+void common_params_add_preset_options(std::vector<common_arg> & args) {
+    // arguments below won't be treated as CLI args, only preset options
+    args.push_back(common_arg(
+        {"load-on-startup"}, "NAME",
+        "in server router mode, autoload this model on startup",
+        [](common_params &, const std::string &) { /* unused */ }
+    ).set_env(COMMON_ARG_PRESET_LOAD_ON_STARTUP).set_preset_only());
+
+    // args.push_back(common_arg(
+    //     {"pin"},
+    //     "in server router mode, do not unload this model if models_max is exceeded",
+    //     [](common_params &) { /* unused */ }
+    // ).set_preset_only());
+
+    // args.push_back(common_arg(
+    //     {"unload-idle-seconds"}, "SECONDS",
+    //     "in server router mode, unload models idle for more than this many seconds",
+    //     [](common_params &, int) { /* unused */ }
+    // ).set_preset_only());
+}
diff --git a/common/arg.h b/common/arg.h
index 1321595c1a..f5111c658f 100644
--- a/common/arg.h
+++ b/common/arg.h
@@ -8,6 +8,9 @@
 #include <vector>
 #include <cstring>
 
+// pseudo-env variable to identify preset-only arguments
+#define COMMON_ARG_PRESET_LOAD_ON_STARTUP "__PRESET_LOAD_ON_STARTUP"
+
 //
 // CLI argument parsing
 //
@@ -22,6 +25,7 @@ struct common_arg {
     const char * env = nullptr;
     std::string help;
     bool is_sparam = false; // is current arg a sampling param?
+    bool is_preset_only = false; // is current arg preset-only (not treated as CLI arg)
     void (*handler_void) (common_params & params) = nullptr;
     void (*handler_string) (common_params & params, const std::string &) = nullptr;
     void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr;
@@ -70,6 +74,7 @@ struct common_arg {
     common_arg & set_excludes(std::initializer_list<enum llama_example> excludes);
     common_arg & set_env(const char * env);
     common_arg & set_sparam();
+    common_arg & set_preset_only();
     bool in_example(enum llama_example ex);
     bool is_exclude(enum llama_example ex);
     bool get_value_from_env(std::string & output) const;
@@ -114,9 +119,13 @@ struct common_params_context {
 bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
 
 // parse input arguments from CLI into a map
-// TODO: support repeated args in the future
 bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map);
 
+// populate preset-only arguments
+// these arguments are not treated as command line arguments
+// see: https://github.com/ggml-org/llama.cpp/issues/18163
+void common_params_add_preset_options(std::vector<common_arg> & args);
+
 // initialize argument parser context - used by test-arg-parser and preset
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
 
diff --git a/common/preset.cpp b/common/preset.cpp
index 1aa9864d0a..e2fc18c5da 100644
--- a/common/preset.cpp
+++ b/common/preset.cpp
@@ -24,7 +24,14 @@ std::vector<std::string> common_preset::to_args(const std::string & bin_path) co
     }
 
     for (const auto & [opt, value] : options) {
-        args.push_back(opt.args.back()); // use the last arg as the main arg
+        if (opt.is_preset_only) {
+            continue; // skip preset-only options (they are not CLI args)
+        }
+
+        // use the last arg as the main arg (i.e. --long-form)
+        args.push_back(opt.args.back());
+
+        // handle value(s)
         if (opt.value_hint == nullptr && opt.value_hint_2 == nullptr) {
             // flag option, no value
             if (common_arg_utils::is_falsey(value)) {
@@ -224,8 +231,10 @@ static std::string parse_bool_arg(const common_arg & arg, const std::string & ke
 }
 
 common_preset_context::common_preset_context(llama_example ex)
-    : ctx_params(common_params_parser_init(default_params, ex)),
-      key_to_opt(get_map_key_opt(ctx_params)) {}
+    : ctx_params(common_params_parser_init(default_params, ex)) {
+    common_params_add_preset_options(ctx_params.options);
+    key_to_opt = get_map_key_opt(ctx_params);
+}
 
 common_presets common_preset_context::load_from_ini(const std::string & path, common_preset & global) const {
     common_presets out;
diff --git a/tests/test-arg-parser.cpp b/tests/test-arg-parser.cpp
index 74573c34e9..1bbb745e78 100644
--- a/tests/test-arg-parser.cpp
+++ b/tests/test-arg-parser.cpp
@@ -16,6 +16,7 @@ int main(void) {
     for (int ex = 0; ex < LLAMA_EXAMPLE_COUNT; ex++) {
         try {
             auto ctx_arg = common_params_parser_init(params, (enum llama_example)ex);
+            common_params_add_preset_options(ctx_arg.options);
             std::unordered_set<std::string> seen_args;
             std::unordered_set<std::string> seen_env_vars;
             for (const auto & opt : ctx_arg.options) {
diff --git a/tools/server/README.md b/tools/server/README.md
index 7454188f2b..a67155c502 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -1480,6 +1480,9 @@ The precedence rule for preset options is as follows:
 2. **Model-specific options** defined in the preset file (e.g. `[ggml-org/MY-MODEL...]`)
 3. **Global options** defined in the preset file (`[*]`)
 
+We also offer additional options that are exclusive to presets (these aren't treated as command-line arguments):
+- `load-on-startup` (boolean): Controls whether the model loads automatically when the server starts
+
 ### Routing requests
 
 Requests are routed according to the requested model name.
diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index db7ab667f9..08a0da5c87 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -226,6 +226,26 @@ void server_models::load_models() {
             SRV_INF(" %c %s\n", has_custom ? '*' : ' ', name.c_str());
         }
     }
+
+    // load models whose preset enables load-on-startup (an explicit falsey value opts out)
+    std::vector<std::string> models_to_load;
+    for (const auto & [name, inst] : mapping) {
+        std::string val;
+        if (inst.meta.preset.get_option(COMMON_ARG_PRESET_LOAD_ON_STARTUP, val) && !common_arg_utils::is_falsey(val)) {
+            models_to_load.push_back(name);
+        }
+    }
+    if ((int)models_to_load.size() > base_params.models_max) {
+        throw std::runtime_error(string_format(
+            "number of models to load on startup (%zu) exceeds models_max (%d)",
+            models_to_load.size(),
+            base_params.models_max
+        ));
+    }
+    for (const auto & name : models_to_load) {
+        SRV_INF("(startup) loading model %s\n", name.c_str());
+        load(name);
+    }
 }
 
 void server_models::update_meta(const std::string & name, const server_model_meta & meta) {
diff --git a/tools/server/server-models.h b/tools/server/server-models.h
index 56fb398e31..3e1868c27c 100644
--- a/tools/server/server-models.h
+++ b/tools/server/server-models.h
@@ -103,27 +103,29 @@ public:
 
     void load_models();
 
-    // check if a model instance exists
+    // check if a model instance exists (thread-safe)
     bool has_model(const std::string & name);
 
-    // return a copy of model metadata
+    // return a copy of model metadata (thread-safe)
     std::optional<server_model_meta> get_meta(const std::string & name);
 
-    // return a copy of all model metadata
+    // return a copy of all model metadata (thread-safe)
     std::vector<server_model_meta> get_all_meta();
 
+    // load and unload model instances
+    // these functions are thread-safe
     void load(const std::string & name);
     void unload(const std::string & name);
     void unload_all();
 
-    // update the status of a model instance
+    // update the status of a model instance (thread-safe)
     void update_status(const std::string & name, server_model_status status);
 
-    // wait until the model instance is fully loaded
+    // wait until the model instance is fully loaded (thread-safe)
     // return when the model is loaded or failed to load
     void wait_until_loaded(const std::string & name);
 
-    // load the model if not loaded, otherwise do nothing
+    // load the model if not loaded, otherwise do nothing (thread-safe)
     // return false if model is already loaded; return true otherwise (meta may need to be refreshed)
     bool ensure_model_loaded(const std::string & name);
 