From f3c05190962cdf60e4e4d7311ef53dfe1c97b7fa Mon Sep 17 00:00:00 2001
From: "Yu, Zijun" <zijun.yu@intel.com>
Date: Fri, 11 Jul 2025 15:44:19 +0800
Subject: [PATCH] Reduce memory: free OV weight nodes after graph conversion

---
 ggml/src/ggml-openvino/ggml-decoder.cpp | 16 +++++-----------
 ggml/src/ggml-openvino/ggml-decoder.h   |  2 ++
 ggml/src/ggml-openvino/utils.cpp        |  4 +---
 3 files changed, 8 insertions(+), 14 deletions(-)
diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index ae4beca23e..20d8c1b7fe 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -42,28 +42,23 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap
     m_op_name(m_node ? std::string(m_node->name) : "NONE_OP"),
     m_is_static(is_static),
     m_is_first_token(is_first_token) {
-    // TODO avoid static
-    static std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
     if (m_node) {
         set_input_output(m_node);
     } else {
-        static bool printed = false;
-        if (!printed && getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) {
+        if (is_first_token && getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) {
             print_tensor_address_map(cgraph);
-            printed = true;
         }
 
         if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) {
-            std::string filename = "cgraph.txt";
+            auto timestamp = (long long) ggml_time_us();
+            std::string filename = "cgraph_" + std::to_string(timestamp) + ".txt";
             dump_cgraph(cgraph, filename);
         }
 
         set_llm_params();
 
-        static bool weight_created = false;
-        if (!weight_created) {
-            add_weight_const_parallel(model_weights);
-            weight_created = true;
+        if (is_first_token) {
+            add_weight_const_parallel(m_model_weights);
         }
 
         for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
@@ -71,7 +66,6 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap
             m_nodes.push_back(cur_node);
             set_input_output(cur_node);
         }
-        m_model_weights = model_weights;
 
         add_extra_inputs();
     }
diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h
index 8b507438c5..428edef3ae 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.h
+++ b/ggml/src/ggml-openvino/ggml-decoder.h
@@ -108,6 +108,8 @@ public:
 
     ov::PartialShape get_graph_input_shape(const ggml_tensor* src) const;
 
+    void clear_model_weights() { m_model_weights.clear(); }  // Empties m_model_weights so this decoder no longer holds the weight entries; called in utils.cpp right after FrontEnd::convert().
+
 private:
     void set_input_output(ggml_tensor* node);
     void add_extra_inputs();
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index 2c4f0afe58..e5a4401fec 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -9,10 +9,8 @@
 #include <memory>
 #include <openvino/core/any.hpp>
 #include <openvino/core/graph_util.hpp>
-#include <openvino/core/partial_shape.hpp>
 #include <openvino/core/type/float16.hpp>
 #include <openvino/frontend/manager.hpp>
-#include <openvino/op/parameter.hpp>
 #include <openvino/openvino.hpp>
 #include <openvino/runtime/compiled_model.hpp>
 #include <openvino/runtime/infer_request.hpp>
@@ -89,7 +87,6 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
     if (cache_dir && !is_static) {
         core.set_property(ov::cache_dir(cache_dir));
     }
-    // core.set_property(ov::enable_profiling(true));
 
     static std::unordered_map<struct ggml_cgraph*, std::shared_ptr<ov::InferRequest>> infer_request_cache;
     static std::unordered_map<struct ggml_cgraph*, std::vector<std::string>> ov_input_names_cache;
@@ -157,6 +154,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
 
             auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder);
             model = ov::frontend::ggml::FrontEnd::convert(input_model);
+            ggml_decoder->clear_model_weights();
             conversion_end_time = ggml_time_us();
 
             auto compiled_model = core.compile_model(model, device, config);