Merge 1d4a5f9380 into 0c58ba3365
This commit is contained in:
commit
8fd211ca47
|
|
@ -3044,6 +3044,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||
params.models_max = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX"));
|
||||
add_opt(common_arg(
|
||||
{"--models-memory-margin"}, "N",
|
||||
string_format("for router server, MB of memory to leave free, per device (default: %d, 0 = unlimited)", params.models_memory_margin),
|
||||
[](common_params & params, int value) {
|
||||
params.models_memory_margin = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MEMORY_MARGIN"));
|
||||
add_opt(common_arg(
|
||||
{"--models-autoload"},
|
||||
{"--no-models-autoload"},
|
||||
|
|
|
|||
|
|
@ -618,10 +618,11 @@ struct common_params {
|
|||
std::vector<std::string> server_tools;
|
||||
|
||||
// router server configs
|
||||
std::string models_dir = ""; // directory containing models for the router server
|
||||
std::string models_preset = ""; // directory containing model presets for the router server
|
||||
int models_max = 4; // maximum number of models to load simultaneously
|
||||
bool models_autoload = true; // automatically load models when requested via the router server
|
||||
std::string models_dir = ""; // directory containing models for the router server
|
||||
std::string models_preset = ""; // directory containing model presets for the router server
|
||||
int models_max = 4; // maximum number of models to load simultaneously
|
||||
int models_memory_margin = 1024; // MB of free memory to preserve per device (0 = disabled)
|
||||
bool models_autoload = true; // automatically load models when requested via the router server
|
||||
|
||||
bool log_json = false;
|
||||
|
||||
|
|
|
|||
|
|
@ -3610,6 +3610,19 @@ void llama_memory_breakdown_print(const struct llama_context * ctx) {
|
|||
}
|
||||
}
|
||||
|
||||
uint64_t llama_context_device_memory(const llama_context * ctx, ggml_backend_dev_t device) {
|
||||
const bool is_host = ggml_backend_dev_type(device) == GGML_BACKEND_DEVICE_TYPE_CPU;
|
||||
uint64_t total = 0;
|
||||
for (const auto & [buft, mb] : ctx->memory_breakdown()) {
|
||||
const bool matches = is_host ? ggml_backend_buft_is_host(buft) :
|
||||
ggml_backend_buft_get_device(buft) == device;
|
||||
if (matches) {
|
||||
total += mb.total();
|
||||
}
|
||||
}
|
||||
return total;
|
||||
}
|
||||
|
||||
//
|
||||
// training
|
||||
//
|
||||
|
|
|
|||
|
|
@ -54,3 +54,9 @@ LLAMA_API void llama_quant_compute_types(
|
|||
ggml_tensor ** tensors,
|
||||
ggml_type * result_types,
|
||||
size_t n_tensors);
|
||||
|
||||
// Returns the projected memory use (model + context + compute) in bytes
|
||||
// for the given device within this context. Returns 0 if the device is not used.
|
||||
LLAMA_API uint64_t llama_context_device_memory(
|
||||
const struct llama_context * ctx,
|
||||
ggml_backend_dev_t device);
|
||||
|
|
|
|||
|
|
@ -7,6 +7,8 @@
|
|||
#include <cpp-httplib/httplib.h> // TODO: remove this once we use HTTP client from download.h
|
||||
#include <sheredom/subprocess.h>
|
||||
|
||||
#include "../../src/llama-ext.h"
|
||||
|
||||
#include <functional>
|
||||
#include <algorithm>
|
||||
#include <thread>
|
||||
|
|
@ -178,6 +180,25 @@ server_models::server_models(
|
|||
LOG_WRN("failed to get server executable path: %s\n", e.what());
|
||||
LOG_WRN("using original argv[0] as fallback: %s\n", argv[0]);
|
||||
}
|
||||
|
||||
const uint64_t memory_margin = (uint64_t)base_params.models_memory_margin * 1024 * 1024;
|
||||
|
||||
if (memory_margin > 0) {
|
||||
const size_t n_devs = ggml_backend_dev_count();
|
||||
for (size_t i = 0; i < n_devs; i++) {
|
||||
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
|
||||
size_t free, total;
|
||||
ggml_backend_dev_memory(dev, &free, &total);
|
||||
if (total > 0) {
|
||||
const uint64_t available = (free > memory_margin) ? free - memory_margin : 0;
|
||||
memory_per_device[dev] = available;
|
||||
SRV_DBG("device %s: available memory after margin=%lu MB\n",
|
||||
ggml_backend_dev_name(dev),
|
||||
(unsigned long)(available / (1024 * 1024)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
load_models();
|
||||
}
|
||||
|
||||
|
|
@ -293,16 +314,17 @@ void server_models::load_models() {
|
|||
// convert presets to server_model_meta and add to mapping
|
||||
for (const auto & preset : final_presets) {
|
||||
server_model_meta meta{
|
||||
/* preset */ preset.second,
|
||||
/* name */ preset.first,
|
||||
/* aliases */ {},
|
||||
/* tags */ {},
|
||||
/* port */ 0,
|
||||
/* status */ SERVER_MODEL_STATUS_UNLOADED,
|
||||
/* last_used */ 0,
|
||||
/* args */ std::vector<std::string>(),
|
||||
/* exit_code */ 0,
|
||||
/* stop_timeout */ DEFAULT_STOP_TIMEOUT,
|
||||
/* preset */ preset.second,
|
||||
/* name */ preset.first,
|
||||
/* aliases */ {},
|
||||
/* tags */ {},
|
||||
/* port */ 0,
|
||||
/* status */ SERVER_MODEL_STATUS_UNLOADED,
|
||||
/* last_used */ 0,
|
||||
/* memory_per_device */ {},
|
||||
/* args */ std::vector<std::string>(),
|
||||
/* exit_code */ 0,
|
||||
/* stop_timeout */ DEFAULT_STOP_TIMEOUT,
|
||||
};
|
||||
add_model(std::move(meta));
|
||||
}
|
||||
|
|
@ -493,44 +515,159 @@ std::vector<server_model_meta> server_models::get_all_meta() {
|
|||
return result;
|
||||
}
|
||||
|
||||
void server_models::unload_lru() {
|
||||
if (base_params.models_max <= 0) {
|
||||
return; // no limit
|
||||
}
|
||||
// remove one of the servers if we passed the models_max (least recently used - LRU)
|
||||
std::string lru_model_name = "";
|
||||
int64_t lru_last_used = ggml_time_ms();
|
||||
size_t count_active = 0;
|
||||
{
|
||||
std::unique_lock<std::mutex> lk(mutex);
|
||||
for (const auto & m : mapping) {
|
||||
if (m.second.meta.is_running()) {
|
||||
count_active++;
|
||||
if (m.second.meta.last_used < lru_last_used) {
|
||||
lru_model_name = m.first;
|
||||
lru_last_used = m.second.meta.last_used;
|
||||
}
|
||||
uint64_t server_models::get_memory_exceeded(const model_memory_map& new_model_memory_per_device) const {
|
||||
model_memory_map total_memory_per_device;
|
||||
for (const auto & m : mapping) {
|
||||
if (m.second.meta.is_running()) {
|
||||
for (const auto& [key, value] : m.second.meta.memory_per_device) {
|
||||
total_memory_per_device[key] += value;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!lru_model_name.empty() && count_active >= (size_t)base_params.models_max) {
|
||||
SRV_INF("models_max limit reached, removing LRU name=%s\n", lru_model_name.c_str());
|
||||
unload(lru_model_name);
|
||||
// wait for unload to complete
|
||||
{
|
||||
std::unique_lock<std::mutex> lk(mutex);
|
||||
cv.wait(lk, [this, &lru_model_name]() {
|
||||
return mapping[lru_model_name].meta.status == SERVER_MODEL_STATUS_UNLOADED;
|
||||
});
|
||||
|
||||
auto get = [](const model_memory_map & m, ggml_backend_dev_t k) {
|
||||
auto it = m.find(k);
|
||||
return it != m.end() ? it->second : 0;
|
||||
};
|
||||
|
||||
uint64_t memory_exceeded = 0;
|
||||
|
||||
for (const auto& [key, limit] : memory_per_device) {
|
||||
const uint64_t total_memory = get(total_memory_per_device, key);
|
||||
const uint64_t new_memory = get(new_model_memory_per_device, key);
|
||||
SRV_DBG("device %s: total=%lu MB, new=%lu MB, limit=%lu MB\n",
|
||||
ggml_backend_dev_name(key),
|
||||
(unsigned long)(total_memory / (1024 * 1024)),
|
||||
(unsigned long)(new_memory / (1024 * 1024)),
|
||||
(unsigned long)(limit / (1024 * 1024)));
|
||||
|
||||
if (total_memory + new_memory > limit) {
|
||||
memory_exceeded++;
|
||||
}
|
||||
}
|
||||
|
||||
return memory_exceeded;
|
||||
}
|
||||
|
||||
void server_models::unload_lru(const model_memory_map& new_model_memory_per_device) {
|
||||
const bool check_memory = base_params.models_memory_margin > 0 && !memory_per_device.empty();
|
||||
|
||||
if (base_params.models_max <= 0 && !check_memory) {
|
||||
return; // no limit
|
||||
}
|
||||
|
||||
while (true) {
|
||||
std::string lru_model_name = "";
|
||||
int64_t lru_last_used = ggml_time_ms();
|
||||
size_t count_active = 0;
|
||||
uint64_t memory_exceeded = 0;
|
||||
{
|
||||
std::unique_lock<std::mutex> lk(mutex);
|
||||
for (const auto & m : mapping) {
|
||||
if (m.second.meta.is_running()) {
|
||||
count_active++;
|
||||
if (m.second.meta.last_used < lru_last_used) {
|
||||
lru_model_name = m.first;
|
||||
lru_last_used = m.second.meta.last_used;
|
||||
}
|
||||
}
|
||||
}
|
||||
memory_exceeded = get_memory_exceeded(new_model_memory_per_device);
|
||||
}
|
||||
bool count_exceeded = base_params.models_max > 0 &&
|
||||
(count_active + 1) > (size_t)base_params.models_max;
|
||||
|
||||
if (!lru_model_name.empty() && (count_exceeded || memory_exceeded > 0)) {
|
||||
SRV_INF("limits reached (count=%zu, memory margin exceeded on %zu device(s)), removing LRU name=%s\n",
|
||||
count_active, memory_exceeded, lru_model_name.c_str());
|
||||
unload(lru_model_name);
|
||||
// wait for unload to complete
|
||||
{
|
||||
std::unique_lock<std::mutex> lk(mutex);
|
||||
cv.wait(lk, [this, &lru_model_name]() {
|
||||
return mapping[lru_model_name].meta.status == SERVER_MODEL_STATUS_UNLOADED;
|
||||
});
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Estimate the per-device memory footprint of the model described by `preset`.
//
// The preset is applied to a default common_params; the model is then opened with
// no_alloc = true (mmap and mlock disabled) and a context is created from it, after
// which llama_context_device_memory() is queried for every registered backend device.
//
// While the model and context are being created, log messages below
// GGML_LOG_LEVEL_WARN are forwarded to the previously installed callback at
// GGML_LOG_LEVEL_DEBUG; the previous callback is reinstated before the result is built.
//
// Returns a map device -> bytes containing only devices with a non-zero value.
// Returns an empty map when the preset has no model path, or when the model or the
// context cannot be created.
static model_memory_map get_model_memory_per_device(const common_preset& preset) {
    common_params params;
    preset.apply_to_params(params);

    if (params.model.path.empty()) {
        return {};
    }

    // Installs the filtering log callback on construction and reinstates the previous
    // one on destruction, so the callback never outlives this stack object on any
    // exit path (early return or exception).
    struct log_filter_guard {
        ggml_log_callback callback  = nullptr;
        void *            user_data = nullptr;
        ggml_log_level    min_level = GGML_LOG_LEVEL_WARN;

        log_filter_guard() {
            llama_log_get(&callback, &user_data);
            llama_log_set([](ggml_log_level level, const char * text, void * ud) {
                const log_filter_guard * self = static_cast<const log_filter_guard *>(ud);
                const ggml_log_level eff = level >= self->min_level ? level : GGML_LOG_LEVEL_DEBUG;
                self->callback(eff, text, self->user_data);
            }, this);
        }

        ~log_filter_guard() {
            llama_log_set(callback, user_data);
        }

        // the installed callback stores `this`, so the guard must not be copied
        log_filter_guard(const log_filter_guard &) = delete;
        log_filter_guard & operator=(const log_filter_guard &) = delete;
    };

    llama_model_ptr   model;
    llama_context_ptr ctx;

    {
        // the filter is active only for the duration of this scope
        log_filter_guard log_guard;

        llama_model_params mparams = common_model_params_to_llama(params);
        mparams.no_alloc  = true;
        mparams.use_mmap  = false;
        mparams.use_mlock = false;

        model.reset(llama_model_load_from_file(params.model.path.c_str(), mparams));

        if (model) {
            llama_context_params cparams = common_context_params_to_llama(params);
            ctx.reset(llama_init_from_model(model.get(), cparams));
        }
    }

    if (!ctx) {
        // covers both a failed model load and a failed context creation
        return {};
    }

    model_memory_map result;
    const size_t n_devs = ggml_backend_dev_count();
    for (size_t i = 0; i < n_devs; i++) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        const uint64_t bytes = llama_context_device_memory(ctx.get(), dev);
        if (bytes > 0) {
            result[dev] = bytes;
        }
    }

    return result;
}
|
||||
|
||||
void server_models::load(const std::string & name) {
|
||||
if (!has_model(name)) {
|
||||
throw std::runtime_error("model name=" + name + " is not found");
|
||||
}
|
||||
unload_lru();
|
||||
|
||||
model_memory_map new_model_memory_per_device;
|
||||
if (base_params.models_memory_margin > 0) {
|
||||
std::lock_guard<std::mutex> lk(mutex);
|
||||
auto & meta = mapping[name].meta;
|
||||
if (meta.memory_per_device.empty()) {
|
||||
meta.memory_per_device = get_model_memory_per_device(meta.preset);
|
||||
}
|
||||
|
||||
new_model_memory_per_device = meta.memory_per_device;
|
||||
}
|
||||
|
||||
unload_lru(new_model_memory_per_device);
|
||||
|
||||
std::lock_guard<std::mutex> lk(mutex);
|
||||
|
||||
|
|
@ -544,14 +681,16 @@ void server_models::load(const std::string & name) {
|
|||
// exceeding models_max. Without this, the window between unload_lru()
|
||||
// releasing its lock and this lock_guard acquiring allows multiple
|
||||
// threads to each observe capacity and all proceed to load.
|
||||
if (base_params.models_max > 0) {
|
||||
if (base_params.models_max > 0 || base_params.models_memory_margin > 0) {
|
||||
size_t count_active = 0;
|
||||
for (const auto & m : mapping) {
|
||||
if (m.second.meta.is_running()) {
|
||||
count_active++;
|
||||
}
|
||||
}
|
||||
if (count_active >= (size_t)base_params.models_max) {
|
||||
bool count_exceeded = base_params.models_max > 0 && count_active >= (size_t)base_params.models_max;
|
||||
bool memory_exceeded = get_memory_exceeded(new_model_memory_per_device) > 0;
|
||||
if (count_exceeded || memory_exceeded) {
|
||||
throw std::runtime_error("model limit reached, try again later");
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -54,6 +54,8 @@ static std::string server_model_status_to_string(server_model_status status) {
|
|||
}
|
||||
}
|
||||
|
||||
using model_memory_map = std::map<ggml_backend_dev_t, uint64_t>;
|
||||
|
||||
struct server_model_meta {
|
||||
common_preset preset;
|
||||
std::string name;
|
||||
|
|
@ -62,6 +64,7 @@ struct server_model_meta {
|
|||
int port = 0;
|
||||
server_model_status status = SERVER_MODEL_STATUS_UNLOADED;
|
||||
int64_t last_used = 0; // for LRU unloading
|
||||
model_memory_map memory_per_device; // projected bytes per device
|
||||
std::vector<std::string> args; // args passed to the model instance, will be populated by render_args()
|
||||
int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
|
||||
int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown
|
||||
|
|
@ -107,14 +110,20 @@ private:
|
|||
std::vector<std::string> base_env;
|
||||
common_preset base_preset; // base preset from llama-server CLI args
|
||||
|
||||
// available memory per device
|
||||
std::map<ggml_backend_dev_t, uint64_t> memory_per_device;
|
||||
|
||||
void update_meta(const std::string & name, const server_model_meta & meta);
|
||||
|
||||
// unload least recently used models if the limit is reached
|
||||
void unload_lru();
|
||||
void unload_lru(const model_memory_map& new_model_memory_per_device);
|
||||
|
||||
// not thread-safe, caller must hold mutex
|
||||
void add_model(server_model_meta && meta);
|
||||
|
||||
// not thread-safe, caller must hold mutex
|
||||
uint64_t get_memory_exceeded(const model_memory_map& new_model_memory_per_device) const;
|
||||
|
||||
public:
|
||||
server_models(const common_params & params, int argc, char ** argv);
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue