estimate with to-be-loaded model size included
This commit is contained in:
parent
8482ffc387
commit
c2df1ac64a
|
|
@ -614,6 +614,12 @@ extern "C" {
|
||||||
// Returns the total size of all the tensors in the model in bytes
|
// Returns the total size of all the tensors in the model in bytes
|
||||||
LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
|
LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
|
||||||
|
|
||||||
|
// Returns the total size of all the tensors in the model in bytes from a model path
|
||||||
|
// without fully loading the model. Uses llama_model_loader with no_alloc=true.
|
||||||
|
// Returns 0 if the model cannot be loaded or the path is invalid.
|
||||||
|
// This function can be used to estimate memory requirements before loading a model.
|
||||||
|
LLAMA_API uint64_t llama_model_size_from_path(const char * path);
|
||||||
|
|
||||||
// Get the default chat template. Returns nullptr if not available
|
// Get the default chat template. Returns nullptr if not available
|
||||||
// If name is NULL, returns the default chat template
|
// If name is NULL, returns the default chat template
|
||||||
LLAMA_API const char * llama_model_chat_template(const struct llama_model * model, const char * name);
|
LLAMA_API const char * llama_model_chat_template(const struct llama_model * model, const char * name);
|
||||||
|
|
|
||||||
|
|
@ -9253,6 +9253,35 @@ uint64_t llama_model_size(const llama_model * model) {
|
||||||
return model->size();
|
return model->size();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
uint64_t llama_model_size_from_path(const char * path) {
|
||||||
|
if (!path) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
std::vector<std::string> splits;
|
||||||
|
|
||||||
|
llama_model_loader loader(
|
||||||
|
/* metadata */ nullptr,
|
||||||
|
/* set_tensor_data */ nullptr,
|
||||||
|
/* set_tensor_data_ud */ nullptr,
|
||||||
|
/* fname */ path,
|
||||||
|
/* splits */ splits,
|
||||||
|
/* file */ nullptr,
|
||||||
|
/* use_mmap */ false,
|
||||||
|
/* use_direct_io */ false,
|
||||||
|
/* check_tensors */ false,
|
||||||
|
/* no_alloc */ true,
|
||||||
|
/* param_overrides_p */ nullptr,
|
||||||
|
/* param_tensor_buft_overrides_p */ nullptr
|
||||||
|
);
|
||||||
|
|
||||||
|
return loader.n_bytes;
|
||||||
|
} catch (...) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const char * llama_model_chat_template(const llama_model * model, const char * name) {
|
const char * llama_model_chat_template(const llama_model * model, const char * name) {
|
||||||
const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE)
|
const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE)
|
||||||
: LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
|
: LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
|
||||||
|
|
|
||||||
|
|
@ -494,11 +494,10 @@ std::vector<server_model_meta> server_models::get_all_meta() {
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
void server_models::unload_lru() {
|
void server_models::unload_lru(uint64_t new_model_memory_mb) {
|
||||||
if (base_params.models_max <= 0 && base_params.models_memory_max <= 0) {
|
if (base_params.models_max <= 0 && base_params.models_memory_max <= 0) {
|
||||||
return; // no limit
|
return; // no limit
|
||||||
}
|
}
|
||||||
// Keep unloading LRU models until limits are satisfied
|
|
||||||
while (true) {
|
while (true) {
|
||||||
std::string lru_model_name = "";
|
std::string lru_model_name = "";
|
||||||
int64_t lru_last_used = ggml_time_ms();
|
int64_t lru_last_used = ggml_time_ms();
|
||||||
|
|
@ -517,12 +516,14 @@ void server_models::unload_lru() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Check if limits exceeded
|
bool count_exceeded = base_params.models_max > 0 &&
|
||||||
bool count_exceeded = base_params.models_max > 0 && count_active >= (size_t)base_params.models_max;
|
(count_active + 1) >= (size_t)base_params.models_max;
|
||||||
bool memory_exceeded = base_params.models_memory_max > 0 && total_memory_mb >= (uint64_t)base_params.models_memory_max;
|
uint64_t projected_memory = total_memory_mb + new_model_memory_mb;
|
||||||
|
bool memory_exceeded = base_params.models_memory_max > 0 &&
|
||||||
|
projected_memory >= (uint64_t)base_params.models_memory_max;
|
||||||
if (!lru_model_name.empty() && (count_exceeded || memory_exceeded)) {
|
if (!lru_model_name.empty() && (count_exceeded || memory_exceeded)) {
|
||||||
SRV_INF("limits reached (count=%zu, memory=%lu MB), removing LRU name=%s\n",
|
SRV_INF("limits reached (count=%zu, memory=%lu MB + %lu MB new), removing LRU name=%s\n",
|
||||||
count_active, (unsigned long)total_memory_mb, lru_model_name.c_str());
|
count_active, (unsigned long)total_memory_mb, (unsigned long)new_model_memory_mb, lru_model_name.c_str());
|
||||||
unload(lru_model_name);
|
unload(lru_model_name);
|
||||||
// wait for unload to complete
|
// wait for unload to complete
|
||||||
{
|
{
|
||||||
|
|
@ -531,9 +532,8 @@ void server_models::unload_lru() {
|
||||||
return mapping[lru_model_name].meta.status == SERVER_MODEL_STATUS_UNLOADED;
|
return mapping[lru_model_name].meta.status == SERVER_MODEL_STATUS_UNLOADED;
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
// Loop continues to check if more unloading is needed
|
|
||||||
} else {
|
} else {
|
||||||
break; // limits satisfied
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -542,7 +542,26 @@ void server_models::load(const std::string & name) {
|
||||||
if (!has_model(name)) {
|
if (!has_model(name)) {
|
||||||
throw std::runtime_error("model name=" + name + " is not found");
|
throw std::runtime_error("model name=" + name + " is not found");
|
||||||
}
|
}
|
||||||
unload_lru();
|
|
||||||
|
uint64_t new_model_memory_mb = 0;
|
||||||
|
if (base_params.models_memory_max > 0) {
|
||||||
|
std::string model_path;
|
||||||
|
{
|
||||||
|
std::lock_guard<std::mutex> lk(mutex);
|
||||||
|
auto & meta = mapping[name].meta;
|
||||||
|
if (meta.preset.get_option("LLAMA_ARG_MODEL", model_path) && !model_path.empty()) {
|
||||||
|
uint64_t size_bytes = llama_model_size_from_path(model_path.c_str());
|
||||||
|
new_model_memory_mb = size_bytes / (1024 * 1024);
|
||||||
|
meta.memory_mb = new_model_memory_mb;
|
||||||
|
if (new_model_memory_mb > 0) {
|
||||||
|
SRV_INF("model %s estimated size: %lu MB\n", name.c_str(),
|
||||||
|
(unsigned long)new_model_memory_mb);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
unload_lru(new_model_memory_mb);
|
||||||
|
|
||||||
std::lock_guard<std::mutex> lk(mutex);
|
std::lock_guard<std::mutex> lk(mutex);
|
||||||
|
|
||||||
|
|
@ -629,7 +648,6 @@ void server_models::load(const std::string & name) {
|
||||||
LOG("[%5d] %s", port, buffer);
|
LOG("[%5d] %s", port, buffer);
|
||||||
std::string str(buffer);
|
std::string str(buffer);
|
||||||
if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_READY)) {
|
if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_READY)) {
|
||||||
// Query memory usage from the child's /props endpoint
|
|
||||||
if (!ready_received) {
|
if (!ready_received) {
|
||||||
ready_received = true;
|
ready_received = true;
|
||||||
try {
|
try {
|
||||||
|
|
@ -640,8 +658,7 @@ void server_models::load(const std::string & name) {
|
||||||
json props = json::parse(res->body);
|
json props = json::parse(res->body);
|
||||||
if (props.contains("memory_mb")) {
|
if (props.contains("memory_mb")) {
|
||||||
uint64_t memory_mb = props["memory_mb"].get<uint64_t>();
|
uint64_t memory_mb = props["memory_mb"].get<uint64_t>();
|
||||||
SRV_INF("model %s loaded, memory usage: %lu MB\n", name.c_str(), (unsigned long)memory_mb);
|
SRV_INF("model %s loaded, actual memory: %lu MB\n", name.c_str(), (unsigned long)memory_mb);
|
||||||
// Update memory_mb in meta
|
|
||||||
std::lock_guard<std::mutex> lk(this->mutex);
|
std::lock_guard<std::mutex> lk(this->mutex);
|
||||||
if (mapping.find(name) != mapping.end()) {
|
if (mapping.find(name) != mapping.end()) {
|
||||||
mapping[name].meta.memory_mb = memory_mb;
|
mapping[name].meta.memory_mb = memory_mb;
|
||||||
|
|
|
||||||
|
|
@ -62,7 +62,7 @@ struct server_model_meta {
|
||||||
int port = 0;
|
int port = 0;
|
||||||
server_model_status status = SERVER_MODEL_STATUS_UNLOADED;
|
server_model_status status = SERVER_MODEL_STATUS_UNLOADED;
|
||||||
int64_t last_used = 0; // for LRU unloading
|
int64_t last_used = 0; // for LRU unloading
|
||||||
uint64_t memory_mb = 0; // estimated memory usage in MB
|
uint64_t memory_mb = 0; // size in MB (estimate before load, actual after load)
|
||||||
std::vector<std::string> args; // args passed to the model instance, will be populated by render_args()
|
std::vector<std::string> args; // args passed to the model instance, will be populated by render_args()
|
||||||
int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
|
int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
|
||||||
int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown
|
int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown
|
||||||
|
|
@ -111,7 +111,7 @@ private:
|
||||||
void update_meta(const std::string & name, const server_model_meta & meta);
|
void update_meta(const std::string & name, const server_model_meta & meta);
|
||||||
|
|
||||||
// unload least recently used models if the limit is reached
|
// unload least recently used models if the limit is reached
|
||||||
void unload_lru();
|
void unload_lru(uint64_t new_model_memory_mb = 0);
|
||||||
|
|
||||||
// not thread-safe, caller must hold mutex
|
// not thread-safe, caller must hold mutex
|
||||||
void add_model(server_model_meta && meta);
|
void add_model(server_model_meta && meta);
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue