Fix NPU

2025-07-23 15:37:58 +08:00 · 2025-07-23 15:37:58 +08:00 · 6dc4b90635
parent 44f4cf34b1
commit 6dc4b90635
4 changed files with 56 additions and 52 deletions
--- a/ggml/src/ggml-openvino/.clang-format
+++ b/ggml/src/ggml-openvino/.clang-format
@ -7,7 +7,6 @@ PointerAlignment: Left
 Cpp11BracedListStyle: true
 AccessModifierOffset: -4
 BinPackArguments: false
-BinPackParameters: false
 BreakBeforeBraces: Attach

 Language:        Cpp
@ -31,6 +30,7 @@ AllowShortIfStatementsOnASingleLine: Never
 AllowShortLambdasOnASingleLine: Inline
 AllowShortLoopsOnASingleLine: false
 AlwaysBreakBeforeMultilineStrings: true
+BinPackParameters: true
 BitFieldColonSpacing: Both
 # BreakAdjacentStringLiterals: true
 BreakAfterAttributes: Never
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@ -31,47 +31,45 @@

 GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token,
                             int context_size, int num_heads, int num_heads_kv, int head_size) :
-    GgmlOvDecoder::GgmlOvDecoder(node, cgraph, is_static, is_first_token) {
-    m_context_size = context_size;
-    m_num_heads = num_heads;
-    m_num_heads_kv = num_heads_kv;
-    m_head_size = head_size;
-}
-
-GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static,
-                             bool is_first_token) :
    m_cgraph(cgraph),
    m_node(node),
-    m_op_name(m_node ? std::string(m_node->name) : "NONE_OP"),
+    m_op_name(std::string(node->name)),
+    m_context_size(context_size),
+    m_num_heads(num_heads),
+    m_num_heads_kv(num_heads_kv),
+    m_head_size(head_size),
    m_is_static(is_static),
    m_is_first_token(is_first_token) {
-    if (m_node) {
-        set_input_output(m_node);
-    } else {
-        if (is_first_token && getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) {
-            print_tensor_address_map(cgraph);
-        }
+    set_input_output(node);
+}

-        if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) {
-            auto timestamp = (long long) ggml_time_us();
-            std::string filename = "cgraph_" + std::to_string(timestamp) + ".txt";
-            dump_cgraph(cgraph, filename);
-        }
-
-        set_llm_params();
-
-        if (is_first_token) {
-            add_weight_const_parallel(m_model_weights);
-        }
-
-        for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
-            auto* cur_node = cgraph->nodes[node_n];
-            m_nodes.push_back(cur_node);
-            set_input_output(cur_node);
-        }
-
-        add_extra_inputs();
+GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph,
+                             std::map<std::string, std::shared_ptr<ov::Node>>& model_weights, bool is_static,
+                             bool is_first_token) :
+    m_cgraph(cgraph),
+    m_op_name(m_node ? std::string(m_node->name) : ""),
+    m_model_weights(model_weights),
+    m_is_static(is_static),
+    m_is_first_token(is_first_token) {
+    if (is_first_token && getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) {
+        print_tensor_address_map(cgraph);
    }
+
+    if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) {
+        auto timestamp = (long long) ggml_time_us();
+        std::string filename = "cgraph_" + std::to_string(timestamp) + ".txt";
+        dump_cgraph(cgraph, filename);
+    }
+
+    set_llm_params();
+
+    for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
+        auto* cur_node = cgraph->nodes[node_n];
+        m_nodes.push_back(cur_node);
+        set_input_output(cur_node);
+    }
+
+    add_extra_inputs();
 }

 GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph) {
@ -334,10 +332,11 @@ std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const
    return kv_param_res_names;
 }

-void GgmlOvDecoder::add_weight_const_parallel(std::map<std::string, std::shared_ptr<ov::Node>>& model_weights) {
+std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(struct ggml_cgraph* cgraph) {
+    std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
    static std::mutex weights_mutex;
-    auto* nodes = m_cgraph->nodes;
-    auto n_nodes = m_cgraph->n_nodes;
+    auto* nodes = cgraph->nodes;
+    auto n_nodes = cgraph->n_nodes;
    std::for_each(std::execution::par, nodes, nodes + n_nodes, [&](ggml_tensor* node) {
        for (int i = 0; i < GGML_MAX_SRC; i++) {
            auto* src = node->src[i];
@ -369,6 +368,7 @@ void GgmlOvDecoder::add_weight_const_parallel(std::map<std::string, std::shared_
            }
        }
    });
+    return model_weights;
 }

 std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) {
--- a/ggml/src/ggml-openvino/ggml-decoder.h
+++ b/ggml/src/ggml-openvino/ggml-decoder.h
@ -11,12 +11,17 @@

 class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
 public:
-    GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token);
+    // Graph decoder
+    GgmlOvDecoder(struct ggml_cgraph* cgraph, std::map<std::string, std::shared_ptr<ov::Node>>& model_weights,
+                  bool is_static, bool is_first_token);
+
+    // Node decoder, called in GgmlOvDecoder::visit_subgraph
    GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token,
                  int context_size, int num_heads, int num_heads_kv, int head_size);

-    // Naive decoder
+    // Naive graph decoder
    GgmlOvDecoder(struct ggml_cgraph* cgraph);
+
    virtual ov::Any get_attribute(const std::string& name) const override {
        return nullptr;
        GGML_UNUSED(name);
@ -110,6 +115,8 @@ public:

    ov::PartialShape get_graph_input_shape(const ggml_tensor* src) const;

+    static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor);
+    static std::map<std::string, std::shared_ptr<ov::Node>> create_weight_nodes(struct ggml_cgraph* cgraph);
    void clear_model_weights() { m_model_weights.clear(); }

 private:
@ -123,9 +130,6 @@ private:
    // set context_size, num_heads, etc
    void set_llm_params();

-    static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor);
-    void add_weight_const_parallel(std::map<std::string, std::shared_ptr<ov::Node>>& model_weights);
-
    struct ggml_cgraph* m_cgraph = nullptr;
    ggml_tensor* m_node = nullptr;
    std::vector<ggml_tensor*> m_nodes;
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@ -26,10 +26,6 @@
 #include "openvino/frontend.hpp"
 #include "openvino/input_model.hpp"

-std::shared_ptr<GgmlOvDecoder> get_ggml_decoder(struct ggml_cgraph* cgraph, bool is_static, bool is_first_token) {
-    return std::make_shared<GgmlOvDecoder>(nullptr, cgraph, is_static, is_first_token);
-}
-
 ov::Tensor convert_ggml_input_to_ov(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string& name) {
    const auto* ggml_tensor = ggml_decoder->get_input_ggml_tensor(name);
    auto* input_data = ggml_tensor->data;
@ -111,7 +107,8 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c

    auto it = infer_request_cache.find(cgraph);
    if (it != infer_request_cache.end()) {
-        ggml_decoder = get_ggml_decoder(cgraph, is_static, false);
+        std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
+        ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, model_weights, is_static, false);
        decoder_end_time = ggml_time_us();

        // For NPU, the first time we call the kvcache model, pop the compiled kvcache model from the cache
@ -126,17 +123,20 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
        compile_end_time = conversion_end_time;
    } else {
        std::shared_ptr<ov::Model> model;
+        auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);

        if (is_static) {
-            ggml_decoder = get_ggml_decoder(cgraph, is_static, true);
-            auto ggml_decoder_kvcache = get_ggml_decoder(cgraph, is_static, false);
+            ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, model_weights, is_static, true);
+            auto ggml_decoder_kvcache = std::make_shared<GgmlOvDecoder>(cgraph, model_weights, is_static, false);
            decoder_end_time = ggml_time_us();

            auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder);
            auto input_model_kvcache = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder_kvcache);

            model = ov::frontend::ggml::FrontEnd::convert(input_model);
+            ggml_decoder->clear_model_weights();
            auto model_kvcache = ov::frontend::ggml::FrontEnd::convert(input_model_kvcache);
+            ggml_decoder_kvcache->clear_model_weights();
            conversion_end_time = ggml_time_us();

            auto compiled_model = core.compile_model(model, device, config);
@ -157,7 +157,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
                ov::serialize(model_kvcache, timestamped_filename);
            }
        } else {
-            ggml_decoder = get_ggml_decoder(cgraph, is_static, true);
+            ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, model_weights, is_static, true);
            decoder_end_time = ggml_time_us();

            auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder);