PERF: add weight constant in parallel

2025-05-14 17:48:56 +08:00 · 2025-05-14 17:48:56 +08:00 · a30dc6e726
parent c57f61494a
commit a30dc6e726
2 changed files with 47 additions and 0 deletions
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@ -3,9 +3,11 @@
 #include <ggml-impl.h>
 #include <ggml.h>

+#include <algorithm>
 #include <cassert>
 #include <cstdint>
 #include <cstdlib>
+#include <execution>
 #include <fstream>
 #include <iomanip>
 #include <map>
@ -42,6 +44,12 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap
            dump_cgraph(m_cgraph);
        }

+        static bool weight_created = false;
+        if (!getenv("GGML_OPENVINO_WEIGHT_AS_INPUT") && !weight_created) {
+            add_weight_const_parallel(model_weights);
+            weight_created = true;
+        }
+
        set_max_token_len();
        for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) {
            auto* cur_node = m_cgraph->nodes[node_n];
@ -235,6 +243,43 @@ void GgmlOvDecoder::add_extra_inputs() {
    }
 }

+// Creates an ov::Node for every source tensor in m_cgraph that is not a view and whose buffer
+// usage is GGML_BACKEND_BUFFER_USAGE_WEIGHTS. Graph nodes are visited with the parallel execution
+// policy. Each created node is stored in `model_weights` under the tensor's name; names that are
+// already present in the map are left untouched.
+void GgmlOvDecoder::add_weight_const_parallel(std::map<std::string, std::shared_ptr<ov::Node>>& model_weights) {
+    // Guards every access to model_weights made from the worker lambda below.
+    static std::mutex weights_mutex;
+    auto* const first_node = m_cgraph->nodes;
+    auto* const last_node = first_node + m_cgraph->n_nodes;
+
+    std::for_each(std::execution::par, first_node, last_node, [&](ggml_tensor* node) {
+        for (int slot = 0; slot < GGML_MAX_SRC; ++slot) {
+            ggml_tensor* input = node->src[slot];
+            // Empty source slots and view tensors are not handled here.
+            if (input == nullptr || input->view_src) {
+                continue;
+            }
+            // Only tensors that live in a weights buffer become weight nodes.
+            if (input->buffer->usage != GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
+                continue;
+            }
+
+            const std::string weight_name(input->name);
+
+            // Claim the name by inserting a null placeholder while holding the lock, so that only
+            // one worker goes on to build the node for a given name.
+            bool claimed = false;
+            {
+                std::lock_guard<std::mutex> guard(weights_mutex);
+                claimed = model_weights.try_emplace(weight_name, nullptr).second;
+            }
+            if (!claimed) {
+                continue;
+            }
+
+            // Build the node without holding the lock; only the final store is serialized.
+            auto created = create_weight_node(input);
+            created->set_friendly_name(weight_name);
+
+            std::lock_guard<std::mutex> guard(weights_mutex);
+            model_weights[weight_name] = created;
+        }
+    });
+}
+
 std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) {
    std::shared_ptr<ov::Node> weight_node;
    auto node_type = get_ov_type(tensor);
--- a/ggml/src/ggml-openvino/ggml-decoder.h
+++ b/ggml/src/ggml-openvino/ggml-decoder.h
@ -101,6 +101,8 @@ private:
    void set_max_token_len();
    int64_t m_max_token_len;

+    void add_weight_const_parallel(std::map<std::string, std::shared_ptr<ov::Node>>& model_weights);  // fills model_weights (keyed by tensor name) with nodes built from m_cgraph's weight-buffer sources, using std::execution::par
+
    struct ggml_cgraph* m_cgraph;
    std::map<std::string, ggml_tensor*> m_inputs;
    std::vector<std::string> m_input_names;