Reduce memory: free OV weight nodes after graph conversion

Author: Yu, Zijun, 2025-07-11 15:44:19 +08:00; committed by Mustafa Cavus
parent a80da69448
commit f3c0519096
3 changed files with 8 additions and 14 deletions
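
For context, a minimal stand-alone sketch of the idea behind the commit (stand-in types and a hypothetical tensor name, not the real OpenVINO or ggml API): the weight nodes are shared_ptr-owned, so once the converted model holds its own references, the decoder's map is only a redundant second owner and can be cleared without invalidating the model.

#include <cassert>
#include <map>
#include <memory>
#include <string>

struct WeightNode {};  // stand-in for an ov::Node holding weight data

int main() {
    // Decoder-side map, analogous to GgmlOvDecoder::m_model_weights.
    std::map<std::string, std::shared_ptr<WeightNode>> weights;
    weights["blk.0.attn_q.weight"] = std::make_shared<WeightNode>();

    // Conversion gives the model its own reference to the same node.
    std::shared_ptr<WeightNode> held_by_model = weights["blk.0.attn_q.weight"];
    assert(held_by_model.use_count() == 2);

    weights.clear();                         // what clear_model_weights() does
    assert(held_by_model.use_count() == 1);  // the model alone keeps it alive
}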

[File 1 of 3: GgmlOvDecoder constructor implementation]

@@ -42,28 +42,23 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap
       m_op_name(m_node ? std::string(m_node->name) : "NONE_OP"),
       m_is_static(is_static),
       m_is_first_token(is_first_token) {
-    // TODO avoid static
-    static std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
     if (m_node) {
         set_input_output(m_node);
     } else {
-        static bool printed = false;
-        if (!printed && getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) {
+        if (is_first_token && getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) {
             print_tensor_address_map(cgraph);
-            printed = true;
         }
         if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) {
-            std::string filename = "cgraph.txt";
+            auto timestamp = (long long) ggml_time_us();
+            std::string filename = "cgraph_" + std::to_string(timestamp) + ".txt";
             dump_cgraph(cgraph, filename);
         }
         set_llm_params();
-        static bool weight_created = false;
-        if (!weight_created) {
-            add_weight_const_parallel(model_weights);
-            weight_created = true;
+        if (is_first_token) {
+            add_weight_const_parallel(m_model_weights);
         }
         for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
@@ -71,7 +66,6 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap
             m_nodes.push_back(cur_node);
             set_input_output(cur_node);
         }
-        m_model_weights = model_weights;
         add_extra_inputs();
     }
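
The hunk above replaces function-local statics with per-instance state. A compilable sketch (stand-in types, not the real decoder) of why that matters for memory: a function-local static is never destroyed before process exit, while a member is released with its owner, or earlier on demand.

#include <map>
#include <memory>
#include <string>

struct Node {};
using WeightMap = std::map<std::string, std::shared_ptr<Node>>;

// Old shape of the code: the map outlives every decoder and is never freed.
WeightMap& process_lifetime_weights() {
    static WeightMap weights;
    return weights;
}

// New shape of the code: the map is a member with decoder-scoped lifetime,
// plus an explicit hook to release it even earlier.
struct Decoder {
    WeightMap m_model_weights;
    void clear_model_weights() { m_model_weights.clear(); }
};

int main() {
    {
        Decoder d;
        d.m_model_weights["w"] = std::make_shared<Node>();
    }   // d's weights are released here
    process_lifetime_weights()["w"] = std::make_shared<Node>();
    // ...still alive at exit; nothing can reclaim it earlier.
}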

[File 2 of 3: GgmlOvDecoder class header]

@@ -108,6 +108,8 @@ public:
     ov::PartialShape get_graph_input_shape(const ggml_tensor* src) const;

+    void clear_model_weights() { m_model_weights.clear(); }
+
 private:
     void set_input_output(ggml_tensor* node);
     void add_extra_inputs();

[File 3 of 3: OpenVINO backend compute entry point]

@@ -9,10 +9,8 @@
 #include <memory>
 #include <openvino/core/any.hpp>
 #include <openvino/core/graph_util.hpp>
-#include <openvino/core/partial_shape.hpp>
 #include <openvino/core/type/float16.hpp>
 #include <openvino/frontend/manager.hpp>
-#include <openvino/op/parameter.hpp>
 #include <openvino/openvino.hpp>
 #include <openvino/runtime/compiled_model.hpp>
 #include <openvino/runtime/infer_request.hpp>
@@ -89,7 +87,6 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
     if (cache_dir && !is_static) {
         core.set_property(ov::cache_dir(cache_dir));
     }
-    // core.set_property(ov::enable_profiling(true));

     static std::unordered_map<struct ggml_cgraph*, std::shared_ptr<ov::InferRequest>> infer_request_cache;
     static std::unordered_map<struct ggml_cgraph*, std::vector<std::string>> ov_input_names_cache;
@@ -157,6 +154,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
         auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder);
         model = ov::frontend::ggml::FrontEnd::convert(input_model);
+        ggml_decoder->clear_model_weights();
         conversion_end_time = ggml_time_us();

         auto compiled_model = core.compile_model(model, device, config);
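
Putting the three files together, a runnable mock (stand-in types only, not the real ggml frontend) of the sequence this final hunk establishes, and of when the weight memory is actually returned: the decoder's references go away at clear_model_weights(), and once the uncompiled model is dropped after compilation, nothing pins the weight constants any more.

#include <cassert>
#include <map>
#include <memory>
#include <string>
#include <vector>

struct WeightNode { std::vector<float> data; };

struct Decoder {
    std::map<std::string, std::shared_ptr<WeightNode>> m_model_weights;
    void clear_model_weights() { m_model_weights.clear(); }
};

struct Model { std::vector<std::shared_ptr<WeightNode>> constants; };

int main() {
    auto decoder = std::make_shared<Decoder>();
    decoder->m_model_weights["w"] = std::make_shared<WeightNode>();
    std::weak_ptr<WeightNode> watch = decoder->m_model_weights["w"];

    auto model = std::make_shared<Model>();                // convert(...)
    model->constants.push_back(decoder->m_model_weights["w"]);

    decoder->clear_model_weights();   // the new call in this commit
    assert(!watch.expired());         // model still references the weights

    model.reset();                    // after compile_model(...) is done
    assert(watch.expired());          // weight memory actually freed
}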