router server: add --models-memory-margin (per-device free-memory margin used when loading models)

Notes on this copy of the patch (text before the first "diff --git" line is ignored by git apply):

- Re-flowed from a copy in which line breaks and intra-line spacing had been collapsed and the
  contents of angle brackets had been dropped. Hunk line counts and offsets were re-checked
  against the original @@ headers. Indentation and column alignment are best-effort, so a hunk
  may need `git apply --ignore-whitespace` to match.
- Template arguments of model_memory_map (ggml_backend_dev_t -> uint64_t) are inferred from how
  the map is indexed and assigned inside this patch.
- ASSUMPTIONS, TODO confirm against the tree: the header names in the first server-models.cpp
  hunk, the element type of the `std::vector<...>` lines (server_tools, args, base_env,
  get_all_meta), and `<std::mutex>` on the lock types could not be recovered from the copy.
- Review fixes, applied to added lines only and line-count neutral (hunk headers unchanged):
    * arg.cpp: help text said "0 = unlimited"; common.h and the code treat 0 as "disabled".
    * server_models ctor: a negative margin no longer wraps to a huge uint64_t; the
      free/total out-params are initialized and no longer shadow ::free.
    * unload_lru: memory_exceeded is a size_t so it matches the %zu in SRV_INF.
    * server-models.h: memory_per_device is declared through the model_memory_map alias.
    * get_model_memory_per_device: `if(` -> `if (`.
- The post-image hashes on the `index` lines of arg.cpp, server-models.cpp and server-models.h
  predate these fixes.
- NOTE(review): load() calls get_model_memory_per_device() while holding `mutex`; presumably
  this blocks other router operations for the duration of the probe -- confirm acceptable.
- NOTE(review): in llama_context_device_memory() a host buffer type that belongs to a non-CPU
  device looks like it is counted both for the CPU device and for its own device -- confirm
  this is intended.

diff --git a/common/arg.cpp b/common/arg.cpp
index 649216b7f0..b053a25a1e 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -3044,6 +3044,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.models_max = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX"));
+    add_opt(common_arg(
+        {"--models-memory-margin"}, "N",
+        string_format("for router server, MB of memory to leave free, per device (default: %d, 0 = disabled)", params.models_memory_margin),
+        [](common_params & params, int value) {
+            params.models_memory_margin = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MEMORY_MARGIN"));
     add_opt(common_arg(
         {"--models-autoload"},
         {"--no-models-autoload"},
diff --git a/common/common.h b/common/common.h
index 31a337daa6..879032977b 100644
--- a/common/common.h
+++ b/common/common.h
@@ -618,10 +618,11 @@ struct common_params {
     std::vector<std::string> server_tools;
 
     // router server configs
-    std::string models_dir      = "";   // directory containing models for the router server
-    std::string models_preset   = "";   // directory containing model presets for the router server
-    int         models_max      = 4;    // maximum number of models to load simultaneously
-    bool        models_autoload = true; // automatically load models when requested via the router server
+    std::string models_dir           = "";   // directory containing models for the router server
+    std::string models_preset        = "";   // directory containing model presets for the router server
+    int         models_max           = 4;    // maximum number of models to load simultaneously
+    int         models_memory_margin = 1024; // MB of free memory to preserve per device (0 = disabled)
+    bool        models_autoload      = true; // automatically load models when requested via the router server
 
     bool log_json = false;
 
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index a808e3e454..a9b001bcbd 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -3610,6 +3610,19 @@ void llama_memory_breakdown_print(const struct llama_context * ctx) {
     }
 }
 
+uint64_t llama_context_device_memory(const llama_context * ctx, ggml_backend_dev_t device) {
+    const bool is_host = ggml_backend_dev_type(device) == GGML_BACKEND_DEVICE_TYPE_CPU;
+    uint64_t total = 0;
+    for (const auto & [buft, mb] : ctx->memory_breakdown()) {
+        const bool matches = is_host ? ggml_backend_buft_is_host(buft) :
+                                       ggml_backend_buft_get_device(buft) == device;
+        if (matches) {
+            total += mb.total();
+        }
+    }
+    return total;
+}
+
 //
 // training
 //
diff --git a/src/llama-ext.h b/src/llama-ext.h
index 2ffb77934e..ee6ff27be3 100644
--- a/src/llama-ext.h
+++ b/src/llama-ext.h
@@ -54,3 +54,9 @@ LLAMA_API void llama_quant_compute_types(
         ggml_tensor ** tensors,
         ggml_type * result_types,
         size_t n_tensors);
+
+// Returns the projected memory use (model + context + compute) in bytes
+// for the given device within this context. Returns 0 if the device is not used.
+LLAMA_API uint64_t llama_context_device_memory(
+        const struct llama_context * ctx,
+        ggml_backend_dev_t device);
diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index c83709272f..fca6beb122 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -7,6 +7,8 @@
 #include <cpp-httplib/httplib.h> // TODO: remove this once we use HTTP client from download.h
 #include <sheredom/subprocess.h>
 
+#include "../../src/llama-ext.h"
+
 #include <functional>
 #include <algorithm>
 #include <thread>
@@ -178,6 +180,25 @@ server_models::server_models(
         LOG_WRN("failed to get server executable path: %s\n", e.what());
         LOG_WRN("using original argv[0] as fallback: %s\n", argv[0]);
     }
+
+    const uint64_t memory_margin = base_params.models_memory_margin > 0 ? (uint64_t) base_params.models_memory_margin * 1024 * 1024 : 0;
+
+    if (memory_margin > 0) {
+        const size_t n_devs = ggml_backend_dev_count();
+        for (size_t i = 0; i < n_devs; i++) {
+            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+            size_t mem_free = 0, mem_total = 0;
+            ggml_backend_dev_memory(dev, &mem_free, &mem_total);
+            if (mem_total > 0) {
+                const uint64_t available = (mem_free > memory_margin) ? mem_free - memory_margin : 0;
+                memory_per_device[dev] = available;
+                SRV_DBG("device %s: available memory after margin=%lu MB\n",
+                    ggml_backend_dev_name(dev),
+                    (unsigned long)(available / (1024 * 1024)));
+            }
+        }
+    }
+
     load_models();
 }
 
@@ -293,16 +314,17 @@ void server_models::load_models() {
     // convert presets to server_model_meta and add to mapping
     for (const auto & preset : final_presets) {
         server_model_meta meta{
-            /* preset       */ preset.second,
-            /* name         */ preset.first,
-            /* aliases      */ {},
-            /* tags         */ {},
-            /* port         */ 0,
-            /* status       */ SERVER_MODEL_STATUS_UNLOADED,
-            /* last_used    */ 0,
-            /* args         */ std::vector<std::string>(),
-            /* exit_code    */ 0,
-            /* stop_timeout */ DEFAULT_STOP_TIMEOUT,
+            /* preset            */ preset.second,
+            /* name              */ preset.first,
+            /* aliases           */ {},
+            /* tags              */ {},
+            /* port              */ 0,
+            /* status            */ SERVER_MODEL_STATUS_UNLOADED,
+            /* last_used         */ 0,
+            /* memory_per_device */ {},
+            /* args              */ std::vector<std::string>(),
+            /* exit_code         */ 0,
+            /* stop_timeout      */ DEFAULT_STOP_TIMEOUT,
         };
         add_model(std::move(meta));
     }
@@ -493,44 +515,159 @@ std::vector<server_model_meta> server_models::get_all_meta() {
     return result;
 }
 
-void server_models::unload_lru() {
-    if (base_params.models_max <= 0) {
-        return; // no limit
-    }
-    // remove one of the servers if we passed the models_max (least recently used - LRU)
-    std::string lru_model_name = "";
-    int64_t lru_last_used = ggml_time_ms();
-    size_t count_active = 0;
-    {
-        std::unique_lock<std::mutex> lk(mutex);
-        for (const auto & m : mapping) {
-            if (m.second.meta.is_running()) {
-                count_active++;
-                if (m.second.meta.last_used < lru_last_used) {
-                    lru_model_name = m.first;
-                    lru_last_used = m.second.meta.last_used;
-                }
+uint64_t server_models::get_memory_exceeded(const model_memory_map& new_model_memory_per_device) const {
+    model_memory_map total_memory_per_device;
+    for (const auto & m : mapping) {
+        if (m.second.meta.is_running()) {
+            for (const auto& [key, value] : m.second.meta.memory_per_device) {
+                total_memory_per_device[key] += value;
             }
         }
     }
-    if (!lru_model_name.empty() && count_active >= (size_t)base_params.models_max) {
-        SRV_INF("models_max limit reached, removing LRU name=%s\n", lru_model_name.c_str());
-        unload(lru_model_name);
-        // wait for unload to complete
-        {
-            std::unique_lock<std::mutex> lk(mutex);
-            cv.wait(lk, [this, &lru_model_name]() {
-                return mapping[lru_model_name].meta.status == SERVER_MODEL_STATUS_UNLOADED;
-            });
+
+    auto get = [](const model_memory_map & m, ggml_backend_dev_t k) {
+        auto it = m.find(k);
+        return it != m.end() ? it->second : 0;
+    };
+
+    uint64_t memory_exceeded = 0;
+
+    for (const auto& [key, limit] : memory_per_device) {
+        const uint64_t total_memory = get(total_memory_per_device, key);
+        const uint64_t new_memory = get(new_model_memory_per_device, key);
+        SRV_DBG("device %s: total=%lu MB, new=%lu MB, limit=%lu MB\n",
+            ggml_backend_dev_name(key),
+            (unsigned long)(total_memory / (1024 * 1024)),
+            (unsigned long)(new_memory / (1024 * 1024)),
+            (unsigned long)(limit / (1024 * 1024)));
+
+        if (total_memory + new_memory > limit) {
+            memory_exceeded++;
         }
     }
+
+    return memory_exceeded;
+}
+
+void server_models::unload_lru(const model_memory_map& new_model_memory_per_device) {
+    const bool check_memory = base_params.models_memory_margin > 0 && !memory_per_device.empty();
+
+    if (base_params.models_max <= 0 && !check_memory) {
+        return; // no limit
+    }
+
+    while (true) {
+        std::string lru_model_name = "";
+        int64_t lru_last_used = ggml_time_ms();
+        size_t count_active = 0;
+        size_t memory_exceeded = 0;
+        {
+            std::unique_lock<std::mutex> lk(mutex);
+            for (const auto & m : mapping) {
+                if (m.second.meta.is_running()) {
+                    count_active++;
+                    if (m.second.meta.last_used < lru_last_used) {
+                        lru_model_name = m.first;
+                        lru_last_used = m.second.meta.last_used;
+                    }
+                }
+            }
+            memory_exceeded = (size_t) get_memory_exceeded(new_model_memory_per_device);
+        }
+        bool count_exceeded = base_params.models_max > 0 &&
+                              (count_active + 1) > (size_t)base_params.models_max;
+
+        if (!lru_model_name.empty() && (count_exceeded || memory_exceeded > 0)) {
+            SRV_INF("limits reached (count=%zu, memory margin exceeded on %zu device(s)), removing LRU name=%s\n",
+                    count_active, memory_exceeded, lru_model_name.c_str());
+            unload(lru_model_name);
+            // wait for unload to complete
+            {
+                std::unique_lock<std::mutex> lk(mutex);
+                cv.wait(lk, [this, &lru_model_name]() {
+                    return mapping[lru_model_name].meta.status == SERVER_MODEL_STATUS_UNLOADED;
+                });
+            }
+        } else {
+            break;
+        }
+    }
+}
+
+static model_memory_map get_model_memory_per_device(const common_preset& preset) {
+    common_params params;
+    preset.apply_to_params(params);
+
+    if (params.model.path.empty()) {
+        return {};
+    }
+
+    struct log_ud_t {
+        struct {
+            ggml_log_callback callback;
+            void * user_data;
+        } original;
+        ggml_log_level min_level;
+    } log_ud;
+    llama_log_get(&log_ud.original.callback, &log_ud.original.user_data);
+    log_ud.min_level = GGML_LOG_LEVEL_WARN;
+
+    llama_log_set([](ggml_log_level level, const char * text, void * ud) {
+        log_ud_t * d = (log_ud_t *) ud;
+        const ggml_log_level eff = level >= d->min_level ? level : GGML_LOG_LEVEL_DEBUG;
+        d->original.callback(eff, text, d->original.user_data);
+    }, &log_ud);
+
+    llama_model_params mparams = common_model_params_to_llama(params);
+    mparams.no_alloc = true;
+    mparams.use_mmap = false;
+    mparams.use_mlock = false;
+
+    llama_model_ptr model{llama_model_load_from_file(params.model.path.c_str(), mparams)};
+
+    if (!model) {
+        llama_log_set(log_ud.original.callback, log_ud.original.user_data);
+        return {};
+    }
+
+    llama_context_params cparams = common_context_params_to_llama(params);
+    llama_context_ptr ctx{llama_init_from_model(model.get(), cparams)};
+    llama_log_set(log_ud.original.callback, log_ud.original.user_data);
+
+    if (!ctx) {
+        return {};
+    }
+
+    model_memory_map result;
+    const size_t n_devs = ggml_backend_dev_count();
+    for (size_t i = 0; i < n_devs; i++) {
+        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+        uint64_t bytes = llama_context_device_memory(ctx.get(), dev);
+        if (bytes > 0) {
+            result[dev] = bytes;
+        }
+    }
+
+    return result;
 }
 
 void server_models::load(const std::string & name) {
     if (!has_model(name)) {
         throw std::runtime_error("model name=" + name + " is not found");
     }
-    unload_lru();
+
+    model_memory_map new_model_memory_per_device;
+    if (base_params.models_memory_margin > 0) {
+        std::lock_guard<std::mutex> lk(mutex);
+        auto & meta = mapping[name].meta;
+        if (meta.memory_per_device.empty()) {
+            meta.memory_per_device = get_model_memory_per_device(meta.preset);
+        }
+
+        new_model_memory_per_device = meta.memory_per_device;
+    }
+
+    unload_lru(new_model_memory_per_device);
 
     std::lock_guard<std::mutex> lk(mutex);
 
@@ -544,14 +681,16 @@ void server_models::load(const std::string & name) {
     // exceeding models_max. Without this, the window between unload_lru()
     // releasing its lock and this lock_guard acquiring allows multiple
     // threads to each observe capacity and all proceed to load.
-    if (base_params.models_max > 0) {
+    if (base_params.models_max > 0 || base_params.models_memory_margin > 0) {
         size_t count_active = 0;
         for (const auto & m : mapping) {
             if (m.second.meta.is_running()) {
                 count_active++;
             }
         }
-        if (count_active >= (size_t)base_params.models_max) {
+        bool count_exceeded = base_params.models_max > 0 && count_active >= (size_t)base_params.models_max;
+        bool memory_exceeded = get_memory_exceeded(new_model_memory_per_device) > 0;
+        if (count_exceeded || memory_exceeded) {
             throw std::runtime_error("model limit reached, try again later");
         }
     }
diff --git a/tools/server/server-models.h b/tools/server/server-models.h
index 1db34b6c4d..38d6929a88 100644
--- a/tools/server/server-models.h
+++ b/tools/server/server-models.h
@@ -54,6 +54,8 @@ static std::string server_model_status_to_string(server_model_status status) {
     }
 }
 
+using model_memory_map = std::map<ggml_backend_dev_t, uint64_t>;
+
 struct server_model_meta {
     common_preset preset;
     std::string name;
@@ -62,6 +64,7 @@ struct server_model_meta {
     int port = 0;
     server_model_status status = SERVER_MODEL_STATUS_UNLOADED;
     int64_t last_used = 0; // for LRU unloading
+    model_memory_map memory_per_device; // projected bytes per device
     std::vector<std::string> args; // args passed to the model instance, will be populated by render_args()
     int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
     int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown
@@ -107,14 +110,20 @@ private:
     std::vector<std::string> base_env;
     common_preset base_preset; // base preset from llama-server CLI args
 
+    // available memory per device
+    model_memory_map memory_per_device;
+
     void update_meta(const std::string & name, const server_model_meta & meta);
 
     // unload least recently used models if the limit is reached
-    void unload_lru();
+    void unload_lru(const model_memory_map& new_model_memory_per_device);
 
     // not thread-safe, caller must hold mutex
     void add_model(server_model_meta && meta);
 
+    // not thread-safe, caller must hold mutex
+    uint64_t get_memory_exceeded(const model_memory_map& new_model_memory_per_device) const;
+
 public:
     server_models(const common_params & params, int argc, char ** argv);
 