From 6dc4b90635674e3a19402acb6828b90efdcc5a4a Mon Sep 17 00:00:00 2001
From: "Yu, Zijun"
Date: Wed, 23 Jul 2025 15:37:58 +0800
Subject: [PATCH] Fix NPU

---
 ggml/src/ggml-openvino/.clang-format    |  2 +-
 ggml/src/ggml-openvino/ggml-decoder.cpp | 76 ++++++++++++-------------
 ggml/src/ggml-openvino/ggml-decoder.h   | 14 +++--
 ggml/src/ggml-openvino/utils.cpp        | 16 +++---
 4 files changed, 56 insertions(+), 52 deletions(-)

diff --git a/ggml/src/ggml-openvino/.clang-format b/ggml/src/ggml-openvino/.clang-format
index d631bc6c01..18280772b6 100644
--- a/ggml/src/ggml-openvino/.clang-format
+++ b/ggml/src/ggml-openvino/.clang-format
@@ -7,7 +7,6 @@ PointerAlignment: Left
 Cpp11BracedListStyle: true
 AccessModifierOffset: -4
 BinPackArguments: false
-BinPackParameters: false
 BreakBeforeBraces: Attach
 
 Language: Cpp
@@ -31,6 +30,7 @@ AllowShortIfStatementsOnASingleLine: Never
 AllowShortLambdasOnASingleLine: Inline
 AllowShortLoopsOnASingleLine: false
 AlwaysBreakBeforeMultilineStrings: true
+BinPackParameters: true
 BitFieldColonSpacing: Both
 # BreakAdjacentStringLiterals: true
 BreakAfterAttributes: Never
diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index 8ce9354c69..b233ff8ebd 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -31,47 +31,45 @@
 
 GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token,
                              int context_size, int num_heads, int num_heads_kv, int head_size) :
-    GgmlOvDecoder::GgmlOvDecoder(node, cgraph, is_static, is_first_token) {
-    m_context_size = context_size;
-    m_num_heads = num_heads;
-    m_num_heads_kv = num_heads_kv;
-    m_head_size = head_size;
-}
-
-GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static,
-                             bool is_first_token) :
     m_cgraph(cgraph),
     m_node(node),
-    m_op_name(m_node ? std::string(m_node->name) : "NONE_OP"),
+    m_op_name(std::string(node->name)),
+    m_context_size(context_size),
+    m_num_heads(num_heads),
+    m_num_heads_kv(num_heads_kv),
+    m_head_size(head_size),
     m_is_static(is_static),
     m_is_first_token(is_first_token) {
-    if (m_node) {
-        set_input_output(m_node);
-    } else {
-        if (is_first_token && getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) {
-            print_tensor_address_map(cgraph);
-        }
+    set_input_output(node);
+}
 
-        if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) {
-            auto timestamp = (long long) ggml_time_us();
-            std::string filename = "cgraph_" + std::to_string(timestamp) + ".txt";
-            dump_cgraph(cgraph, filename);
-        }
-
-        set_llm_params();
-
-        if (is_first_token) {
-            add_weight_const_parallel(m_model_weights);
-        }
-
-        for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
-            auto* cur_node = cgraph->nodes[node_n];
-            m_nodes.push_back(cur_node);
-            set_input_output(cur_node);
-        }
-
-        add_extra_inputs();
+GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph,
+                             std::map<std::string, std::shared_ptr<ov::Node>>& model_weights, bool is_static,
+                             bool is_first_token) :
+    m_cgraph(cgraph),
+    m_op_name(m_node ? std::string(m_node->name) : ""),
+    m_model_weights(model_weights),
+    m_is_static(is_static),
+    m_is_first_token(is_first_token) {
+    if (is_first_token && getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) {
+        print_tensor_address_map(cgraph);
     }
+
+    if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) {
+        auto timestamp = (long long) ggml_time_us();
+        std::string filename = "cgraph_" + std::to_string(timestamp) + ".txt";
+        dump_cgraph(cgraph, filename);
+    }
+
+    set_llm_params();
+
+    for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
+        auto* cur_node = cgraph->nodes[node_n];
+        m_nodes.push_back(cur_node);
+        set_input_output(cur_node);
+    }
+
+    add_extra_inputs();
 }
 
 GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph) {
@@ -334,10 +332,11 @@ std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const
     return kv_param_res_names;
 }
 
-void GgmlOvDecoder::add_weight_const_parallel(std::map<std::string, std::shared_ptr<ov::Node>>& model_weights) {
+std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(struct ggml_cgraph* cgraph) {
+    std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
     static std::mutex weights_mutex;
-    auto* nodes = m_cgraph->nodes;
-    auto n_nodes = m_cgraph->n_nodes;
+    auto* nodes = cgraph->nodes;
+    auto n_nodes = cgraph->n_nodes;
     std::for_each(std::execution::par, nodes, nodes + n_nodes, [&](ggml_tensor* node) {
         for (int i = 0; i < GGML_MAX_SRC; i++) {
             auto* src = node->src[i];
@@ -369,6 +368,7 @@ void GgmlOvDecoder::add_weight_const_parallel(std::map<std::string, std::shared
             }
         }
     });
+    return model_weights;
 }
 
 std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) {
diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h
index f4fe9c402d..78422afaf7 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.h
+++ b/ggml/src/ggml-openvino/ggml-decoder.h
@@ -11,12 +11,17 @@
 class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
 public:
-    GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token);
+    // Graph decoder
+    GgmlOvDecoder(struct ggml_cgraph* cgraph, std::map<std::string, std::shared_ptr<ov::Node>>& model_weights,
+                  bool is_static, bool is_first_token);
+
+    // Node decoder, called in GgmlOvDecoder::visit_subgraph
     GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token,
                   int context_size, int num_heads, int num_heads_kv, int head_size);
 
-    // Naive decoder
+    // Naive graph decoder
     GgmlOvDecoder(struct ggml_cgraph* cgraph);
+
     virtual ov::Any get_attribute(const std::string& name) const override {
         return nullptr;
         GGML_UNUSED(name);
@@ -110,6 +115,8 @@ public:
 
     ov::PartialShape get_graph_input_shape(const ggml_tensor* src) const;
 
+    static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor);
+    static std::map<std::string, std::shared_ptr<ov::Node>> create_weight_nodes(struct ggml_cgraph* cgraph);
     void clear_model_weights() { m_model_weights.clear(); }
 
 private:
@@ -123,9 +130,6 @@ private:
     // set context_size, num_heads, etc
     void set_llm_params();
 
-    static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor);
-    void add_weight_const_parallel(std::map<std::string, std::shared_ptr<ov::Node>>& model_weights);
-
     struct ggml_cgraph* m_cgraph = nullptr;
     ggml_tensor* m_node = nullptr;
     std::vector<ggml_tensor*> m_nodes;
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index fcfd3639a7..be06c54e8b 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -26,10 +26,6 @@
 #include "openvino/frontend.hpp"
 #include "openvino/input_model.hpp"
 
-std::shared_ptr<GgmlOvDecoder> get_ggml_decoder(struct ggml_cgraph* cgraph, bool is_static, bool is_first_token) {
-    return std::make_shared<GgmlOvDecoder>(nullptr, cgraph, is_static, is_first_token);
-}
-
 ov::Tensor convert_ggml_input_to_ov(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string& name) {
     const auto* ggml_tensor = ggml_decoder->get_input_ggml_tensor(name);
     auto* input_data = ggml_tensor->data;
@@ -111,7 +107,8 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
     auto it = infer_request_cache.find(cgraph);
 
     if (it != infer_request_cache.end()) {
-        ggml_decoder = get_ggml_decoder(cgraph, is_static, false);
+        std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
+        ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, model_weights, is_static, false);
         decoder_end_time = ggml_time_us();
 
         // For NPU for the first time we call kvcache modle, pop the compiled kvcache model from cache
@@ -126,17 +123,20 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
         compile_end_time = conversion_end_time;
     } else {
         std::shared_ptr<ov::Model> model;
+        auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);
 
         if (is_static) {
-            ggml_decoder = get_ggml_decoder(cgraph, is_static, true);
-            auto ggml_decoder_kvcache = get_ggml_decoder(cgraph, is_static, false);
+            ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, model_weights, is_static, true);
+            auto ggml_decoder_kvcache = std::make_shared<GgmlOvDecoder>(cgraph, model_weights, is_static, false);
             decoder_end_time = ggml_time_us();
 
             auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder);
             auto input_model_kvcache = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder_kvcache);
 
             model = ov::frontend::ggml::FrontEnd::convert(input_model);
+            ggml_decoder->clear_model_weights();
             auto model_kvcache = ov::frontend::ggml::FrontEnd::convert(input_model_kvcache);
+            ggml_decoder_kvcache->clear_model_weights();
             conversion_end_time = ggml_time_us();
 
             auto compiled_model = core.compile_model(model, device, config);
@@ -157,7 +157,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
                 ov::serialize(model_kvcache, timestamped_filename);
             }
         } else {
-            ggml_decoder = get_ggml_decoder(cgraph, is_static, true);
+            ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, model_weights, is_static, true);
             decoder_end_time = ggml_time_us();
 
             auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder);
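
[Reviewer note, not part of the patch] The sketch below restates the decoder-construction flow this change introduces for the NPU (static) path: weight constants are built once by the static GgmlOvDecoder::create_weight_nodes(), shared by the prefill and kvcache graph decoders, and released with clear_model_weights() once each model has been converted. It is a minimal sketch under assumptions, not the backend's code: the wrapper function name is invented, and the ov::frontend::ggml::InputModel spelling is inferred from the includes above, since the template arguments were garbled in the original text of this patch.

    // Sketch only: mirrors the new utils.cpp logic; names marked "assumed" are not from the patch.
    #include <memory>
    #include "ggml-decoder.h"             // backend header providing GgmlOvDecoder (path assumed)
    #include "openvino/frontend.hpp"
    #include "openvino/input_model.hpp"

    static void convert_cgraph_for_npu(struct ggml_cgraph* cgraph) {  // wrapper name assumed
        // Build the weight Constants once, instead of once per decoder as before.
        auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);

        // The prefill and kvcache graph decoders share the same weight map.
        auto prefill = std::make_shared<GgmlOvDecoder>(cgraph, model_weights, /*is_static*/ true, /*is_first_token*/ true);
        auto kvcache = std::make_shared<GgmlOvDecoder>(cgraph, model_weights, /*is_static*/ true, /*is_first_token*/ false);

        // Convert each decoder, then drop its copy of the weights to limit peak memory.
        auto prefill_model = ov::frontend::ggml::FrontEnd::convert(
            std::make_shared<ov::frontend::ggml::InputModel>(prefill));  // InputModel type assumed
        prefill->clear_model_weights();

        auto kvcache_model = ov::frontend::ggml::FrontEnd::convert(
            std::make_shared<ov::frontend::ggml::InputModel>(kvcache));
        kvcache->clear_model_weights();

        (void) prefill_model;  // compilation and caching are omitted in this sketch
        (void) kvcache_model;
    }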