server: support load model on startup, support preset-only options (#18206)
* server: support autoload model, support preset-only options
* add docs
* load-on-startup
* fix
* Update common/arg.cpp

Co-authored-by: Pascal <admin@serveurperso.com>

---------

Co-authored-by: Pascal <admin@serveurperso.com>
This commit is contained in:
parent
74e05131e9
commit
9e39a1e6a9
|
|
@ -96,6 +96,11 @@ common_arg & common_arg::set_sparam() {
|
||||||
return *this;
|
return *this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Flag this argument as preset-only (is_preset_only = true) and hand back
// the same object so that further setters can be chained onto the call.
common_arg & common_arg::set_preset_only() {
    this->is_preset_only = true;
    return *this;
}
|
||||||
|
|
||||||
bool common_arg::in_example(enum llama_example ex) {
|
bool common_arg::in_example(enum llama_example ex) {
|
||||||
return examples.find(ex) != examples.end();
|
return examples.find(ex) != examples.end();
|
||||||
}
|
}
|
||||||
|
|
@ -3494,3 +3499,24 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
|
|
||||||
return ctx_arg;
|
return ctx_arg;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void common_params_add_preset_options(std::vector<common_arg> & args) {
|
||||||
|
// arguments below won't be treated as CLI args, only preset options
|
||||||
|
args.push_back(common_arg(
|
||||||
|
{"load-on-startup"}, "NAME",
|
||||||
|
"in server router mode, autoload this model on startup",
|
||||||
|
[](common_params &, const std::string &) { /* unused */ }
|
||||||
|
).set_env(COMMON_ARG_PRESET_LOAD_ON_STARTUP).set_preset_only());
|
||||||
|
|
||||||
|
// args.push_back(common_arg(
|
||||||
|
// {"pin"},
|
||||||
|
// "in server router mode, do not unload this model if models_max is exceeded",
|
||||||
|
// [](common_params &) { /* unused */ }
|
||||||
|
// ).set_preset_only());
|
||||||
|
|
||||||
|
// args.push_back(common_arg(
|
||||||
|
// {"unload-idle-seconds"}, "SECONDS",
|
||||||
|
// "in server router mode, unload models idle for more than this many seconds",
|
||||||
|
// [](common_params &, int) { /* unused */ }
|
||||||
|
// ).set_preset_only());
|
||||||
|
}
|
||||||
|
|
|
||||||
11
common/arg.h
11
common/arg.h
|
|
@ -8,6 +8,9 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
|
|
||||||
|
// pseudo-env variable to identify preset-only arguments
|
||||||
|
#define COMMON_ARG_PRESET_LOAD_ON_STARTUP "__PRESET_LOAD_ON_STARTUP"
|
||||||
|
|
||||||
//
|
//
|
||||||
// CLI argument parsing
|
// CLI argument parsing
|
||||||
//
|
//
|
||||||
|
|
@ -22,6 +25,7 @@ struct common_arg {
|
||||||
const char * env = nullptr;
|
const char * env = nullptr;
|
||||||
std::string help;
|
std::string help;
|
||||||
bool is_sparam = false; // is current arg a sampling param?
|
bool is_sparam = false; // is current arg a sampling param?
|
||||||
|
bool is_preset_only = false; // is current arg preset-only (not treated as CLI arg)
|
||||||
void (*handler_void) (common_params & params) = nullptr;
|
void (*handler_void) (common_params & params) = nullptr;
|
||||||
void (*handler_string) (common_params & params, const std::string &) = nullptr;
|
void (*handler_string) (common_params & params, const std::string &) = nullptr;
|
||||||
void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr;
|
void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr;
|
||||||
|
|
@ -70,6 +74,7 @@ struct common_arg {
|
||||||
common_arg & set_excludes(std::initializer_list<enum llama_example> excludes);
|
common_arg & set_excludes(std::initializer_list<enum llama_example> excludes);
|
||||||
common_arg & set_env(const char * env);
|
common_arg & set_env(const char * env);
|
||||||
common_arg & set_sparam();
|
common_arg & set_sparam();
|
||||||
|
common_arg & set_preset_only();
|
||||||
bool in_example(enum llama_example ex);
|
bool in_example(enum llama_example ex);
|
||||||
bool is_exclude(enum llama_example ex);
|
bool is_exclude(enum llama_example ex);
|
||||||
bool get_value_from_env(std::string & output) const;
|
bool get_value_from_env(std::string & output) const;
|
||||||
|
|
@ -114,9 +119,13 @@ struct common_params_context {
|
||||||
bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
|
bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
|
||||||
|
|
||||||
// parse input arguments from CLI into a map
|
// parse input arguments from CLI into a map
|
||||||
// TODO: support repeated args in the future
|
|
||||||
bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map);
|
bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map);
|
||||||
|
|
||||||
|
// populate preset-only arguments
|
||||||
|
// these arguments are not treated as command line arguments
|
||||||
|
// see: https://github.com/ggml-org/llama.cpp/issues/18163
|
||||||
|
void common_params_add_preset_options(std::vector<common_arg> & args);
|
||||||
|
|
||||||
// initialize argument parser context - used by test-arg-parser and preset
|
// initialize argument parser context - used by test-arg-parser and preset
|
||||||
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
|
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -24,7 +24,14 @@ std::vector<std::string> common_preset::to_args(const std::string & bin_path) co
|
||||||
}
|
}
|
||||||
|
|
||||||
for (const auto & [opt, value] : options) {
|
for (const auto & [opt, value] : options) {
|
||||||
args.push_back(opt.args.back()); // use the last arg as the main arg
|
if (opt.is_preset_only) {
|
||||||
|
continue; // skip preset-only options (they are not CLI args)
|
||||||
|
}
|
||||||
|
|
||||||
|
// use the last arg as the main arg (i.e. --long-form)
|
||||||
|
args.push_back(opt.args.back());
|
||||||
|
|
||||||
|
// handle value(s)
|
||||||
if (opt.value_hint == nullptr && opt.value_hint_2 == nullptr) {
|
if (opt.value_hint == nullptr && opt.value_hint_2 == nullptr) {
|
||||||
// flag option, no value
|
// flag option, no value
|
||||||
if (common_arg_utils::is_falsey(value)) {
|
if (common_arg_utils::is_falsey(value)) {
|
||||||
|
|
@ -224,8 +231,10 @@ static std::string parse_bool_arg(const common_arg & arg, const std::string & ke
|
||||||
}
|
}
|
||||||
|
|
||||||
common_preset_context::common_preset_context(llama_example ex)
|
common_preset_context::common_preset_context(llama_example ex)
|
||||||
: ctx_params(common_params_parser_init(default_params, ex)),
|
: ctx_params(common_params_parser_init(default_params, ex)) {
|
||||||
key_to_opt(get_map_key_opt(ctx_params)) {}
|
common_params_add_preset_options(ctx_params.options);
|
||||||
|
key_to_opt = get_map_key_opt(ctx_params);
|
||||||
|
}
|
||||||
|
|
||||||
common_presets common_preset_context::load_from_ini(const std::string & path, common_preset & global) const {
|
common_presets common_preset_context::load_from_ini(const std::string & path, common_preset & global) const {
|
||||||
common_presets out;
|
common_presets out;
|
||||||
|
|
|
||||||
|
|
@ -16,6 +16,7 @@ int main(void) {
|
||||||
for (int ex = 0; ex < LLAMA_EXAMPLE_COUNT; ex++) {
|
for (int ex = 0; ex < LLAMA_EXAMPLE_COUNT; ex++) {
|
||||||
try {
|
try {
|
||||||
auto ctx_arg = common_params_parser_init(params, (enum llama_example)ex);
|
auto ctx_arg = common_params_parser_init(params, (enum llama_example)ex);
|
||||||
|
common_params_add_preset_options(ctx_arg.options);
|
||||||
std::unordered_set<std::string> seen_args;
|
std::unordered_set<std::string> seen_args;
|
||||||
std::unordered_set<std::string> seen_env_vars;
|
std::unordered_set<std::string> seen_env_vars;
|
||||||
for (const auto & opt : ctx_arg.options) {
|
for (const auto & opt : ctx_arg.options) {
|
||||||
|
|
|
||||||
|
|
@ -1480,6 +1480,9 @@ The precedence rule for preset options is as follows:
|
||||||
2. **Model-specific options** defined in the preset file (e.g. `[ggml-org/MY-MODEL...]`)
|
2. **Model-specific options** defined in the preset file (e.g. `[ggml-org/MY-MODEL...]`)
|
||||||
3. **Global options** defined in the preset file (`[*]`)
|
3. **Global options** defined in the preset file (`[*]`)
|
||||||
|
|
||||||
|
We also offer additional options that are exclusive to presets (these aren't treated as command-line arguments):
|
||||||
|
- `load-on-startup` (boolean): Controls whether the model loads automatically when the server starts
|
||||||
|
|
||||||
### Routing requests
|
### Routing requests
|
||||||
|
|
||||||
Requests are routed according to the requested model name.
|
Requests are routed according to the requested model name.
|
||||||
|
|
|
||||||
|
|
@ -226,6 +226,26 @@ void server_models::load_models() {
|
||||||
SRV_INF(" %c %s\n", has_custom ? '*' : ' ', name.c_str());
|
SRV_INF(" %c %s\n", has_custom ? '*' : ' ', name.c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// load any autoload models
|
||||||
|
std::vector<std::string> models_to_load;
|
||||||
|
for (const auto & [name, inst] : mapping) {
|
||||||
|
std::string val;
|
||||||
|
if (inst.meta.preset.get_option(COMMON_ARG_PRESET_LOAD_ON_STARTUP, val)) {
|
||||||
|
models_to_load.push_back(name);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if ((int)models_to_load.size() > base_params.models_max) {
|
||||||
|
throw std::runtime_error(string_format(
|
||||||
|
"number of models to load on startup (%zu) exceeds models_max (%d)",
|
||||||
|
models_to_load.size(),
|
||||||
|
base_params.models_max
|
||||||
|
));
|
||||||
|
}
|
||||||
|
for (const auto & name : models_to_load) {
|
||||||
|
SRV_INF("(startup) loading model %s\n", name.c_str());
|
||||||
|
load(name);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void server_models::update_meta(const std::string & name, const server_model_meta & meta) {
|
void server_models::update_meta(const std::string & name, const server_model_meta & meta) {
|
||||||
|
|
|
||||||
|
|
@ -103,27 +103,29 @@ public:
|
||||||
|
|
||||||
void load_models();
|
void load_models();
|
||||||
|
|
||||||
// check if a model instance exists
|
// check if a model instance exists (thread-safe)
|
||||||
bool has_model(const std::string & name);
|
bool has_model(const std::string & name);
|
||||||
|
|
||||||
// return a copy of model metadata
|
// return a copy of model metadata (thread-safe)
|
||||||
std::optional<server_model_meta> get_meta(const std::string & name);
|
std::optional<server_model_meta> get_meta(const std::string & name);
|
||||||
|
|
||||||
// return a copy of all model metadata
|
// return a copy of all model metadata (thread-safe)
|
||||||
std::vector<server_model_meta> get_all_meta();
|
std::vector<server_model_meta> get_all_meta();
|
||||||
|
|
||||||
|
// load and unload model instances
|
||||||
|
// these functions are thread-safe
|
||||||
void load(const std::string & name);
|
void load(const std::string & name);
|
||||||
void unload(const std::string & name);
|
void unload(const std::string & name);
|
||||||
void unload_all();
|
void unload_all();
|
||||||
|
|
||||||
// update the status of a model instance
|
// update the status of a model instance (thread-safe)
|
||||||
void update_status(const std::string & name, server_model_status status);
|
void update_status(const std::string & name, server_model_status status);
|
||||||
|
|
||||||
// wait until the model instance is fully loaded
|
// wait until the model instance is fully loaded (thread-safe)
|
||||||
// return when the model is loaded or failed to load
|
// return when the model is loaded or failed to load
|
||||||
void wait_until_loaded(const std::string & name);
|
void wait_until_loaded(const std::string & name);
|
||||||
|
|
||||||
// load the model if not loaded, otherwise do nothing
|
// load the model if not loaded, otherwise do nothing (thread-safe)
|
||||||
// return false if model is already loaded; return true otherwise (meta may need to be refreshed)
|
// return false if model is already loaded; return true otherwise (meta may need to be refreshed)
|
||||||
bool ensure_model_loaded(const std::string & name);
|
bool ensure_model_loaded(const std::string & name);
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue