From f5e8bfddc3d19a424b5d038afda8c7f9425945de Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Tue, 30 Dec 2025 15:57:21 +0100
Subject: [PATCH 1/2] lora: make sure model keep track of associated adapters

---
 src/llama-adapter.cpp | 11 ++++++-----
 src/llama-context.cpp |  4 +++-
 src/llama-model.cpp   |  6 +++++-
 src/llama-model.h     |  5 +++--
 4 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp
index bdc24c2d6b..5ff22b18ab 100644
--- a/src/llama-adapter.cpp
+++ b/src/llama-adapter.cpp
@@ -413,8 +413,8 @@ static void llama_adapter_lora_init_impl(const char * path_lora, llama_adapter_l
         }
     }
 
-    // update number of nodes used
-    model.n_lora_nodes += adapter.get_n_nodes();
+    // register adapter with model
+    model.loras.insert(&adapter);
 
     LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
 }
@@ -474,9 +474,10 @@ int32_t llama_adapter_meta_val_str_by_index(const llama_adapter_lora * adapter,
 }
 
 void llama_adapter_lora_free(llama_adapter_lora * adapter) {
-    // update number of nodes used
-    GGML_ASSERT(adapter->model.n_lora_nodes >= adapter->get_n_nodes());
-    adapter->model.n_lora_nodes -= adapter->get_n_nodes();
+    // remove adapter from associated model
+    auto & model = adapter->model;
+    GGML_ASSERT(model.loras.find(adapter) != model.loras.end());
+    model.loras.erase(adapter);
 
     delete adapter;
 }
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 34dfcd4724..bcea01a997 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -1443,7 +1443,9 @@ uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const {
         return std::max(n_tokens * 40, 32u * model.n_tensors());
     }
     uint32_t res = std::max(1024u, 8u*model.n_tensors());
-    res += model.n_lora_nodes;
+    for (const auto & lora : model.loras) {
+        res += lora->get_n_nodes();
+    }
     return res;
 }
 
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 5e664c8c57..1b220af83e 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -467,7 +467,11 @@ llama_model::llama_model(const llama_model_params & params) : params(params), pi
     pimpl->has_tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern;
 }
 
-llama_model::~llama_model() = default;
+llama_model::~llama_model() {
+    for (auto * lora : loras) {
+        llama_adapter_lora_free(lora);
+    }
+}
 
 void llama_model::load_stats(llama_model_loader & ml) {
     pimpl->n_elements = ml.n_elements;
diff --git a/src/llama-model.h b/src/llama-model.h
index f4f44a92b6..838d9cd6e5 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -12,6 +12,7 @@
 #include
 #include
 #include
+#include <set>
 
 struct llama_cparams;
 struct llama_ubatch;
@@ -475,8 +476,8 @@ struct llama_model {
     // for quantize-stats only
     std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
 
-    // for keeping track of extra nodes used by lora adapters
-    uint32_t n_lora_nodes = 0;
+    // for keeping track of associated LoRA adapters
+    std::set<llama_adapter_lora *> loras;
 
     int64_t t_load_us = 0;
     int64_t t_start_us = 0;

From cb5e0f8734c8286643f9fe818b51c3487f7e0fec Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 31 Dec 2025 12:07:07 +0100
Subject: [PATCH 2/2] deprecate llama_adapter_lora_free

---
 include/llama-cpp.h   |  4 +++-
 include/llama.h       |  3 ++-
 src/llama-adapter.cpp | 17 +++++------------
 src/llama-adapter.h   |  4 +---
 src/llama-model.cpp   |  2 +-
 5 files changed, 12 insertions(+), 18 deletions(-)

diff --git a/include/llama-cpp.h b/include/llama-cpp.h
index 8f6368177d..807e77f628 100644
--- a/include/llama-cpp.h
+++ b/include/llama-cpp.h
@@ -21,7 +21,9 @@ struct llama_sampler_deleter {
 };
 
 struct llama_adapter_lora_deleter {
-    void operator()(llama_adapter_lora * adapter) { llama_adapter_lora_free(adapter); }
+    void operator()(llama_adapter_lora *) {
+        // llama_adapter_lora_free is deprecated
+    }
 };
 
 typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr;
diff --git a/include/llama.h b/include/llama.h
index 8b3c8a7b10..fcefb5ab4e 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -633,7 +633,8 @@ extern "C" {
 
     // Manually free a LoRA adapter
     // NOTE: loaded adapters will be free when the associated model is deleted
-    LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);
+    LLAMA_API DEPRECATED(void llama_adapter_lora_free(struct llama_adapter_lora * adapter),
+            "adapters are now freed together with the associated model");
 
     // Get the invocation tokens if the current lora is an alora
     LLAMA_API uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter);
diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp
index 5ff22b18ab..d6a5800e63 100644
--- a/src/llama-adapter.cpp
+++ b/src/llama-adapter.cpp
@@ -146,11 +146,9 @@ llama_adapter_lora_weight * llama_adapter_lora::get_weight(ggml_tensor * w) {
     return nullptr;
 }
 
-static void llama_adapter_lora_init_impl(const char * path_lora, llama_adapter_lora & adapter) {
+static void llama_adapter_lora_init_impl(llama_model & model, const char * path_lora, llama_adapter_lora & adapter) {
     LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
 
-    llama_model & model = adapter.model;
-
     ggml_context * ctx_init;
     gguf_init_params meta_gguf_params = {
         /* .no_alloc = */ true,
@@ -420,10 +418,10 @@ static void llama_adapter_lora_init_impl(const char * path_lora, llama_adapter_l
 }
 
 llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
-    llama_adapter_lora * adapter = new llama_adapter_lora(*model);
+    llama_adapter_lora * adapter = new llama_adapter_lora();
 
     try {
-        llama_adapter_lora_init_impl(path_lora, *adapter);
+        llama_adapter_lora_init_impl(*model, path_lora, *adapter);
         return adapter;
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
@@ -473,13 +471,8 @@ int32_t llama_adapter_meta_val_str_by_index(const llama_adapter_lora * adapter,
     return snprintf(buf, buf_size, "%s", it->second.c_str());
 }
 
-void llama_adapter_lora_free(llama_adapter_lora * adapter) {
-    // remove adapter from associated model
-    auto & model = adapter->model;
-    GGML_ASSERT(model.loras.find(adapter) != model.loras.end());
-    model.loras.erase(adapter);
-
-    delete adapter;
+void llama_adapter_lora_free(llama_adapter_lora *) {
+    // deprecated: adapters are freed by llama_model's destructor
 }
 
 uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter) {
diff --git a/src/llama-adapter.h b/src/llama-adapter.h
index 42d64a6e0b..d275d25425 100644
--- a/src/llama-adapter.h
+++ b/src/llama-adapter.h
@@ -59,8 +59,6 @@ struct llama_adapter_lora_weight {
 };
 
 struct llama_adapter_lora {
-    llama_model & model;
-
     // map tensor name to lora_a_b
     std::unordered_map<std::string, llama_adapter_lora_weight> ab_map;
 
@@ -75,7 +73,7 @@
     // activated lora (aLoRA)
     std::vector<llama_token> alora_invocation_tokens;
 
-    llama_adapter_lora(llama_model & model) : model(model) {}
+    llama_adapter_lora() = default;
     ~llama_adapter_lora() = default;
 
     llama_adapter_lora_weight * get_weight(ggml_tensor * w);
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 1b220af83e..0f2af5256a 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -469,7 +469,7 @@ llama_model::llama_model(const llama_model_params & params) : params(params), pi
 
 llama_model::~llama_model() {
     for (auto * lora : loras) {
-        llama_adapter_lora_free(lora);
+        delete lora;
     }
 }
 
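
Reviewer note: below is a minimal usage sketch of the adapter lifetime after this series, assuming the existing public C API in include/llama.h (llama_model_load_from_file, llama_init_from_model, llama_set_adapter_lora); the GGUF paths are placeholders and error handling is omitted. The point is that the caller no longer frees the adapter itself: llama_model_free() releases every adapter still registered with the model, and llama_adapter_lora_free() becomes a deprecated no-op.

#include "llama.h"

int main(void) {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file("model.gguf", mparams);       // placeholder path

    // loading registers the adapter with the model (patch 1)
    llama_adapter_lora * adapter = llama_adapter_lora_init(model, "adapter.gguf");  // placeholder path

    llama_context_params cparams = llama_context_default_params();
    llama_context * ctx = llama_init_from_model(model, cparams);
    llama_set_adapter_lora(ctx, adapter, 1.0f); // apply the adapter with scale 1.0

    // ... run inference ...

    llama_free(ctx);
    llama_model_free(model); // also deletes the registered adapter; no llama_adapter_lora_free() call (patch 2)
    llama_backend_free();
    return 0;
}

The same holds for code using llama_adapter_lora_ptr from llama-cpp.h: its deleter is now a no-op, so the smart pointer may outlive or predecease the model without double-freeing.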