From f3c05190962cdf60e4e4d7311ef53dfe1c97b7fa Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 11 Jul 2025 15:44:19 +0800 Subject: [PATCH] Reduce memory: free ov weights node after graph conversion --- ggml/src/ggml-openvino/ggml-decoder.cpp | 16 +++++----------- ggml/src/ggml-openvino/ggml-decoder.h | 2 ++ ggml/src/ggml-openvino/utils.cpp | 4 +--- 3 files changed, 8 insertions(+), 14 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index ae4beca23e..20d8c1b7fe 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -42,28 +42,23 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap m_op_name(m_node ? std::string(m_node->name) : "NONE_OP"), m_is_static(is_static), m_is_first_token(is_first_token) { - // TODO avoid static - static std::map> model_weights; if (m_node) { set_input_output(m_node); } else { - static bool printed = false; - if (!printed && getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) { + if (is_first_token && getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) { print_tensor_address_map(cgraph); - printed = true; } if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { - std::string filename = "cgraph.txt"; + auto timestamp = (long long) ggml_time_us(); + std::string filename = "cgraph_" + std::to_string(timestamp) + ".txt"; dump_cgraph(cgraph, filename); } set_llm_params(); - static bool weight_created = false; - if (!weight_created) { - add_weight_const_parallel(model_weights); - weight_created = true; + if (is_first_token) { + add_weight_const_parallel(m_model_weights); } for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { @@ -71,7 +66,6 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap m_nodes.push_back(cur_node); set_input_output(cur_node); } - m_model_weights = model_weights; add_extra_inputs(); } diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 8b507438c5..428edef3ae 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -108,6 +108,8 @@ public: ov::PartialShape get_graph_input_shape(const ggml_tensor* src) const; + void clear_model_weights() { m_model_weights.clear(); } + private: void set_input_output(ggml_tensor* node); void add_extra_inputs(); diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 2c4f0afe58..e5a4401fec 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -9,10 +9,8 @@ #include #include #include -#include #include #include -#include #include #include #include @@ -89,7 +87,6 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c if (cache_dir && !is_static) { core.set_property(ov::cache_dir(cache_dir)); } - // core.set_property(ov::enable_profiling(true)); static std::unordered_map> infer_request_cache; static std::unordered_map> ov_input_names_cache; @@ -157,6 +154,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c auto input_model = std::make_shared(ggml_decoder); model = ov::frontend::ggml::FrontEnd::convert(input_model); + ggml_decoder->clear_model_weights(); conversion_end_time = ggml_time_us(); auto compiled_model = core.compile_model(model, device, config);