Reduce memory: free OV weight nodes after graph conversion

Author: Yu, Zijun, 2025-07-11 15:44:19 +08:00; committed by Mustafa Cavus
parent a80da69448
commit f3c0519096
3 changed files with 8 additions and 14 deletions
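
For context, a minimal stand-alone sketch of the idea behind the commit (stand-in types and a hypothetical tensor name, not the real OpenVINO or ggml API): the weight nodes are shared_ptr-owned, so once the converted model holds its own references, the decoder's map is only a redundant second owner and can be cleared without invalidating the model.

#include <cassert>
#include <map>
#include <memory>
#include <string>

struct WeightNode {};  // stand-in for an ov::Node holding weight data

int main() {
    // Decoder-side map, analogous to GgmlOvDecoder::m_model_weights.
    std::map<std::string, std::shared_ptr<WeightNode>> weights;
    weights["blk.0.attn_q.weight"] = std::make_shared<WeightNode>();

    // Conversion gives the model its own reference to the same node.
    std::shared_ptr<WeightNode> held_by_model = weights["blk.0.attn_q.weight"];
    assert(held_by_model.use_count() == 2);

    weights.clear();                         // what clear_model_weights() does
    assert(held_by_model.use_count() == 1);  // the model alone keeps it alive
}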

[File 1 of 3: GgmlOvDecoder constructor implementation]

@@ -42,28 +42,23 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap
       m_op_name(m_node ? std::string(m_node->name) : "NONE_OP"),
       m_is_static(is_static),
       m_is_first_token(is_first_token) {
-    // TODO avoid static
-    static std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
     if (m_node) {
         set_input_output(m_node);
     } else {
-        static bool printed = false;
-        if (!printed && getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) {
+        if (is_first_token && getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) {
             print_tensor_address_map(cgraph);
-            printed = true;
         }
         if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) {
-            std::string filename = "cgraph.txt";
+            auto timestamp = (long long) ggml_time_us();
+            std::string filename = "cgraph_" + std::to_string(timestamp) + ".txt";
             dump_cgraph(cgraph, filename);
         }
         set_llm_params();
-        static bool weight_created = false;
-        if (!weight_created) {
-            add_weight_const_parallel(model_weights);
-            weight_created = true;
+        if (is_first_token) {
+            add_weight_const_parallel(m_model_weights);
         }
         for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
@@ -71,7 +66,6 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap
             m_nodes.push_back(cur_node);
             set_input_output(cur_node);
         }
-        m_model_weights = model_weights;
         add_extra_inputs();
     }
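
The hunk above replaces function-local statics with per-instance state. A compilable sketch (stand-in types, not the real decoder) of why that matters for memory: a function-local static is never destroyed before process exit, while a member is released with its owner, or earlier on demand.

#include <map>
#include <memory>
#include <string>

struct Node {};
using WeightMap = std::map<std::string, std::shared_ptr<Node>>;

// Old shape of the code: the map outlives every decoder and is never freed.
WeightMap& process_lifetime_weights() {
    static WeightMap weights;
    return weights;
}

// New shape of the code: the map is a member with decoder-scoped lifetime,
// plus an explicit hook to release it even earlier.
struct Decoder {
    WeightMap m_model_weights;
    void clear_model_weights() { m_model_weights.clear(); }
};

int main() {
    {
        Decoder d;
        d.m_model_weights["w"] = std::make_shared<Node>();
    }   // d's weights are released here
    process_lifetime_weights()["w"] = std::make_shared<Node>();
    // ...still alive at exit; nothing can reclaim it earlier.
}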

[File 2 of 3: GgmlOvDecoder class header]

@@ -108,6 +108,8 @@ public:
     ov::PartialShape get_graph_input_shape(const ggml_tensor* src) const;

+    void clear_model_weights() { m_model_weights.clear(); }
+
 private:
     void set_input_output(ggml_tensor* node);
     void add_extra_inputs();

[File 3 of 3: OpenVINO backend compute entry point]

@@ -9,10 +9,8 @@
 #include <memory>
 #include <openvino/core/any.hpp>
 #include <openvino/core/graph_util.hpp>
-#include <openvino/core/partial_shape.hpp>
 #include <openvino/core/type/float16.hpp>
 #include <openvino/frontend/manager.hpp>
-#include <openvino/op/parameter.hpp>
 #include <openvino/openvino.hpp>
 #include <openvino/runtime/compiled_model.hpp>
 #include <openvino/runtime/infer_request.hpp>
@@ -89,7 +87,6 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
     if (cache_dir && !is_static) {
         core.set_property(ov::cache_dir(cache_dir));
     }
-    // core.set_property(ov::enable_profiling(true));

     static std::unordered_map<struct ggml_cgraph*, std::shared_ptr<ov::InferRequest>> infer_request_cache;
     static std::unordered_map<struct ggml_cgraph*, std::vector<std::string>> ov_input_names_cache;
@@ -157,6 +154,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
         auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder);
         model = ov::frontend::ggml::FrontEnd::convert(input_model);
+        ggml_decoder->clear_model_weights();
         conversion_end_time = ggml_time_us();

         auto compiled_model = core.compile_model(model, device, config);
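
Putting the three files together, a runnable mock (stand-in types only, not the real ggml frontend) of the sequence this final hunk establishes, and of when the weight memory is actually returned: the decoder's references go away at clear_model_weights(), and once the uncompiled model is dropped after compilation, nothing pins the weight constants any more.

#include <cassert>
#include <map>
#include <memory>
#include <string>
#include <vector>

struct WeightNode { std::vector<float> data; };

struct Decoder {
    std::map<std::string, std::shared_ptr<WeightNode>> m_model_weights;
    void clear_model_weights() { m_model_weights.clear(); }
};

struct Model { std::vector<std::shared_ptr<WeightNode>> constants; };

int main() {
    auto decoder = std::make_shared<Decoder>();
    decoder->m_model_weights["w"] = std::make_shared<WeightNode>();
    std::weak_ptr<WeightNode> watch = decoder->m_model_weights["w"];

    auto model = std::make_shared<Model>();                // convert(...)
    model->constants.push_back(decoder->m_model_weights["w"]);

    decoder->clear_model_weights();   // the new call in this commit
    assert(!watch.expired());         // model still references the weights

    model.reset();                    // after compile_model(...) is done
    assert(watch.expired());          // weight memory actually freed
}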