From 4c582ac7a313a27639ab1b06d590b4b80b565864 Mon Sep 17 00:00:00 2001
From: "Yu, Zijun"
Date: Thu, 26 Jun 2025 13:54:06 +0800
Subject: [PATCH] Stateful transformation for CPU and GPU

---
 ggml/src/ggml-openvino/ggml-decoder.cpp     | 104 +++++++++++-------
 ggml/src/ggml-openvino/ggml-decoder.h       |  40 ++++---
 ggml/src/ggml-openvino/openvino/decoder.hpp |   6 +
 ggml/src/ggml-openvino/openvino/op/cpy.cpp  |  13 +--
 .../openvino/translate_session.cpp          |  69 +++++++++---
 .../openvino/translate_session.hpp          |   2 +-
 ggml/src/ggml-openvino/utils.cpp            | 101 ++++++++++-------
 7 files changed, 215 insertions(+), 120 deletions(-)

diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index 04f68a4950..e30f026e36 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -26,12 +26,13 @@
 #include "ggml-backend-impl.h"
 #include "ggml-backend.h"
 
-GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token)
-    : m_cgraph(cgraph),
-      m_node(node),
-      m_op_name(m_node ? std::string(m_node->name) : "NONE_OP"),
-      m_is_static(is_static),
-      m_is_first_token(is_first_token) {
+GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static,
+                             bool is_first_token) :
+    m_cgraph(cgraph),
+    m_node(node),
+    m_op_name(m_node ? std::string(m_node->name) : "NONE_OP"),
+    m_is_static(is_static),
+    m_is_first_token(is_first_token) {
     static std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
 
     if (m_node) {
@@ -44,10 +45,11 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgrap
     }
 
     if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) {
-        dump_cgraph(m_cgraph);
+        std::string filename = "cgraph.txt";
+        dump_cgraph(m_cgraph, filename);
     }
 
-    set_max_token_len();
+    set_llm_params();
 
     static bool weight_created = false;
     if (!weight_created) {
@@ -105,33 +107,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) {
             if (m_model_inputs.find(src_name) != m_model_inputs.end()) {
                 continue;
             }
-            ov::PartialShape input_shape;
-            if (std::string(src->name) == "inp_tokens" || std::string(src->name) == "inp_pos") {
-                if (m_is_static) {
-                    if (m_is_first_token) {
-                        input_shape = ov::PartialShape{1, 1, m_max_token_len};
-                    } else {
-                        input_shape = ov::PartialShape{1, 1, 1};
-                    }
-                } else {
-                    input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_max_token_len)};
-                }
-            } else if (std::string(src->name) == "KQ_mask") {
-                if (m_is_static) {
-                    if (m_is_first_token) {
-                        input_shape = ov::PartialShape{1, m_max_token_len, m_max_token_len};
-                    } else {
-                        input_shape = ov::PartialShape{1, 1, m_max_token_len};
-                    }
-                } else {
-                    auto max_mask_size = GGML_PAD(m_max_token_len, GGML_KQ_MASK_PAD);
-                    input_shape =
-                        ov::PartialShape{1, ov::Dimension(1, max_mask_size), ov::Dimension(1, max_mask_size)};
-                }
-            } else {
-                input_shape = ov::Shape{get_shape(src)};
-            }
-            auto param_node = std::make_shared<ov::op::v0::Parameter>(get_ov_type(src), input_shape);
+            auto param_node = std::make_shared<ov::op::v0::Parameter>(get_ov_type(src), get_graph_input_shape(src));
             param_node->set_friendly_name(src_name);
             m_model_inputs[src_name] = param_node;
         }
@@ -150,6 +126,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) {
             auto it = std::find(m_model_output_names.begin(), m_model_output_names.end(), name);
             if (it == m_model_output_names.end()) {
                 m_model_output_names.push_back(name);
+                m_kv_names.push_back(name);
             }
         }
     }
@@ -213,17 +190,54 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) {
     }
 }
 
-void GgmlOvDecoder::set_max_token_len() {
+void GgmlOvDecoder::set_llm_params() {
     for (int i = 0; i < m_cgraph->n_nodes; i++) {
         auto* node = m_cgraph->nodes[i];
-        if (std::string(node->name) == "cache_k_l0 (view)") {
+        if (node->op == GGML_OP_VIEW && std::string(node->name) == "cache_k_l0 (view)") {
             auto* cache_k = node->src[0];
             m_max_token_len = cache_k->ne[1];
-            break;
+        } else if (node->op == GGML_OP_ROPE && std::string(node->name) == "Qcur-0") {
+            m_head_size = node->ne[0];
+            m_num_heads = node->ne[1];
+        } else if (node->op == GGML_OP_ROPE && std::string(node->name) == "Kcur-0") {
+            m_num_heads_kv = node->ne[1];
         }
     }
 }
 
+ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) const {
+    ov::PartialShape input_shape;
+    if (std::string(src->name) == "inp_tokens" || std::string(src->name) == "inp_pos") {
+        if (m_is_static) {
+            if (m_is_first_token) {
+                input_shape = ov::PartialShape{ 1, 1, m_max_token_len };
+            } else {
+                input_shape = ov::PartialShape{ 1, 1, 1 };
+            }
+        } else {
+            input_shape = ov::PartialShape{ 1, 1, ov::Dimension(1, m_max_token_len) };
+        }
+    } else if (std::string(src->name) == "KQ_mask") {
+        if (m_is_static) {
+            if (m_is_first_token) {
+                input_shape = ov::PartialShape{ 1, m_max_token_len, m_max_token_len };
+            } else {
+                input_shape = ov::PartialShape{ 1, 1, m_max_token_len };
+            }
+        } else {
+            auto max_mask_size = GGML_PAD(m_max_token_len, GGML_KQ_MASK_PAD);
+            input_shape = ov::PartialShape{ 1, ov::Dimension(1, max_mask_size), ov::Dimension(1, max_mask_size) };
+        }
+    } else if (std::string(src->name).find("cache_k") == 0) {
+        input_shape = ov::PartialShape{ m_max_token_len, m_num_heads_kv, m_head_size };
+    } else if (std::string(src->name).find("cache_v") == 0) {
+        input_shape = ov::PartialShape{ m_num_heads_kv, m_head_size, m_max_token_len };
+    } else {
+        input_shape = ov::PartialShape{ get_shape(src) };
+    }
+    return input_shape;
+}
+
 void GgmlOvDecoder::add_extra_inputs() {
     int64_t past_token_len = -1;
     // attention_size not used for NPU
@@ -267,6 +281,16 @@ void GgmlOvDecoder::add_extra_inputs() {
     }
 }
 
+std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const {
+    std::map<std::string, std::string> kv_param_res_names;
+    for (const auto& name : m_kv_names) {
+        if (name.find("cache_k") == 0 || name.find("cache_v") == 0) {
+            kv_param_res_names[name] = name;
+        }
+    }
+    return kv_param_res_names;
+}
+
 void GgmlOvDecoder::add_weight_const_parallel(std::map<std::string, std::shared_ptr<ov::Node>>& model_weights) {
     static std::mutex weights_mutex;
     auto* nodes = m_cgraph->nodes;
@@ -344,8 +368,8 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor)
     return weight_node;
 }
 
-void GgmlOvDecoder::dump_cgraph(const struct ggml_cgraph* cgraph) {
-    std::ofstream file("cgraph.txt");
+void GgmlOvDecoder::dump_cgraph(const struct ggml_cgraph* cgraph, const std::string& filename) {
+    std::ofstream file(filename);
     if (!file.is_open()) {
         std::cerr << "Failed to open file" << std::endl;
         return;
diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h
index b6b13d1f11..6d3f24b093 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.h
+++ b/ggml/src/ggml-openvino/ggml-decoder.h
@@ -3,6 +3,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "ggml.h"
@@ -89,28 +90,34 @@ public:
         return m_model_output_names;
     }
 
-    virtual bool is_static() const override {
-        return m_is_static;
-    }
-    virtual bool is_first_token() const override {
-        return m_is_first_token;
-    }
-    virtual int get_max_token_len() const override {
-        return m_max_token_len;
-    }
+    virtual int get_max_token_len() const override { return m_max_token_len; }
+
+    virtual int get_num_heads() const override { return m_num_heads; }
+
+    virtual int get_num_heads_kv() const override { return m_num_heads_kv; }
+
+    virtual int get_head_size() const override { return m_head_size; }
+
+    virtual std::map<std::string, std::string> get_kv_param_res_names() const override;
+
+    virtual bool is_static() const override { return m_is_static; }
+
+    virtual bool is_first_token() const override { return m_is_first_token; }
+
+    ov::PartialShape get_graph_input_shape(const ggml_tensor* src) const;
 
 private:
     void set_input_output(ggml_tensor* node);
     void add_extra_inputs();
-    static void dump_cgraph(const struct ggml_cgraph* cgraph);
+    static void dump_cgraph(const struct ggml_cgraph* cgraph, const std::string& filename);
     static std::vector get_shape(const ggml_tensor* tensor);
     static std::vector get_stride(const ggml_tensor* tensor);
     static ov::element::Type get_ov_type(const ggml_tensor* tensor);
+
+    // set max_token_len, num_heads, etc.
+    void set_llm_params();
+
     static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor);
-
-    void set_max_token_len();
-    int m_max_token_len;
-
     void add_weight_const_parallel(std::map<std::string, std::shared_ptr<ov::Node>>& model_weights);
 
     struct ggml_cgraph* m_cgraph;
@@ -129,6 +136,11 @@ private:
     std::map<std::string, std::shared_ptr<ov::Tensor>> m_model_extra_input_values;
     std::map<std::string, std::shared_ptr<ov::Node>> m_model_weights;
     std::vector<std::string> m_model_output_names;
+    int m_max_token_len;
+    int m_num_heads;
+    int m_num_heads_kv;
+    int m_head_size;
+    std::vector<std::string> m_kv_names;
     bool m_is_static;
     bool m_is_first_token;
 };
diff --git a/ggml/src/ggml-openvino/openvino/decoder.hpp b/ggml/src/ggml-openvino/openvino/decoder.hpp
index 6212568399..3105d0f16f 100644
--- a/ggml/src/ggml-openvino/openvino/decoder.hpp
+++ b/ggml/src/ggml-openvino/openvino/decoder.hpp
@@ -4,6 +4,7 @@
 #include 
 #include 
 #include 
+#include 
 
 namespace ov {
 namespace frontend {
@@ -57,6 +58,11 @@ public:
     virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_weights() const = 0;
     virtual const std::vector<std::string>& get_model_output_names() const = 0;
 
+    virtual int get_num_heads() const = 0;
+    virtual int get_num_heads_kv() const = 0;
+    virtual int get_head_size() const = 0;
+    virtual std::map<std::string, std::string> get_kv_param_res_names() const = 0;
+
     virtual bool is_static() const = 0;
     virtual bool is_first_token() const = 0;
     virtual int get_max_token_len() const = 0;
diff --git a/ggml/src/ggml-openvino/openvino/op/cpy.cpp b/ggml/src/ggml-openvino/openvino/op/cpy.cpp
index d27f4babb4..b183b97f23 100644
--- a/ggml/src/ggml-openvino/openvino/op/cpy.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/cpy.cpp
@@ -12,6 +12,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -57,13 +58,6 @@ OutputVector translate_cpy(const NodeContext& context) {
 
     if (op_case == 1) {
         // Write K to cache_k
-        int64_t head_size = src0_shape[2];
-        int64_t num_heads = src0_shape[1];
-
-        auto reshaped_src1_shape =
-            ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{-1, num_heads, head_size});
-        auto reshaped_src1 = std::make_shared<ov::op::v1::Reshape>(src1, reshaped_src1_shape, false);
-
         auto token_len = get_dimensions(src0.get_node_shared_ptr(), {0});
         auto token_len_scalar = std::make_shared<ov::op::v0::Squeeze>(token_len, zero);
 
@@ -80,7 +74,8 @@ OutputVector translate_cpy(const NodeContext& context) {
         }
         indices = std::make_shared<ov::op::v0::Unsqueeze>(indices, one);
 
-        res = std::make_shared<ov::op::v3::ScatterNDUpdate>(reshaped_src1, indices, src0);
+        auto updated = std::make_shared<ov::op::v3::ScatterNDUpdate>(src1, indices, src0);
+        res = std::make_shared<ov::op::v1::Reshape>(updated, std::make_shared<ov::op::v3::ShapeOf>(src1), false);
     } else {
         // Write V to cache_v
         auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
@@ -140,7 +135,7 @@ OutputVector translate_cpy(const NodeContext& context) {
                                                  false);
         auto updated = std::make_shared<ov::op::v3::ScatterNDUpdate>(reshaped_src1, indices_final, flattend_src0);
-        res = std::make_shared(updated, zero);
+        res = std::make_shared<ov::op::v1::Reshape>(updated, std::make_shared<ov::op::v3::ShapeOf>(src1), false);
     }
 
     return rename_outputs_with_suffix({res}, context.get_name());
diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp
index 8eda23c1c5..3bf0403a64 100644
--- a/ggml/src/ggml-openvino/openvino/translate_session.cpp
+++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp
@@ -1,7 +1,12 @@
 #include "translate_session.hpp"
 
 #include 
+#include 
+#include 
+#include 
+#include 
 #include 
+#include 
 
 #include "input_model.hpp"
 
@@ -11,6 +16,41 @@ namespace ggml {
 
 using namespace ov::op;
 
+namespace {
+ov::pass::MakeStateful::ParamResPairs get_kv_param_res_pairs(
+    const std::shared_ptr<ov::Model>& model, const std::map<std::string, std::string>& kv_param_res_names) {
+    ov::pass::MakeStateful::ParamResPairs pairs;
+    const auto& params = model->get_parameters();
+    const auto& results = model->get_results();
+
+    for (const auto& param_res : kv_param_res_names) {
+        const auto& param_name = param_res.first;
+        const auto& res_name = param_res.second;
+
+        auto param_it = std::find_if(params.begin(), params.end(), [&](const std::shared_ptr<ov::op::v0::Parameter>& node) {
+            return node->get_friendly_name() == param_name;
+        });
+
+        OPENVINO_ASSERT(param_it != params.end(), "The tensor name ", param_name,
+                        " is not associated with any of "
+                        "Parameters in the network.");
+
+        auto res_it = std::find_if(results.begin(), results.end(), [&](const std::shared_ptr<ov::op::v0::Result>& node) {
+            return node->get_friendly_name() == res_name;
+        });
+
+        OPENVINO_ASSERT(res_it != results.end(), "The tensor name ", res_name,
+                        " is not associated with any of "
+                        "Results in the network.");
+
+        std::shared_ptr<ov::op::v0::Parameter> param = *param_it;
+        std::shared_ptr<ov::op::v0::Result> res = *res_it;
+        pairs.emplace_back(param, res);
+    }
+    return pairs;
+}
+}  // namespace
+
 TranslateSession::TranslateSession(const frontend::InputModel::Ptr& input_model,
                                    const std::unordered_map& translator_map)
     : m_input_model(input_model),
@@ -88,25 +128,26 @@ std::shared_ptr<ov::Model> TranslateSession::translate_graph(const frontend::InputMo
         results.push_back(result);
     }
 
-    ov::ParameterVector used_params;
-    for (const auto& param : params) {
-        if (!param->output(0).get_target_inputs().empty()) {
-            used_params.push_back(param);
-        }
-    }
-    if (getenv("GGML_OPENVINO_PROFILING")) {
-        if (auto diff = params.size() - used_params.size()) {
-            std::cout << diff << " parameters are not used in the model." << std::endl;
-        }
-    }
-    resulting_model = std::make_shared<ov::Model>(results, used_params);
+    resulting_model = std::make_shared<ov::Model>(results, params);
+
+    apply_transformations(resulting_model);
+    return resulting_model;
+}
+
+void TranslateSession::apply_transformations(const std::shared_ptr<ov::Model>& model) {
+    auto ggml_model_decoder = std::dynamic_pointer_cast<InputModel>(m_input_model)->get_model_decoder();
 
     ov::pass::Manager manager;
     manager.set_per_pass_validation(true);
     manager.register_pass();
-    manager.run_passes(resulting_model);
 
-    return resulting_model;
+    if (!ggml_model_decoder->is_static()) {
+        const auto kv_param_res_names = ggml_model_decoder->get_kv_param_res_names();
+        const auto kv_param_res_pairs = get_kv_param_res_pairs(model, kv_param_res_names);
+        manager.register_pass<ov::pass::MakeStateful>(kv_param_res_pairs);
+    }
+
+    manager.run_passes(model);
 }
 
 }  // namespace ggml
diff --git a/ggml/src/ggml-openvino/openvino/translate_session.hpp b/ggml/src/ggml-openvino/openvino/translate_session.hpp
index 5c7a9d464d..9167b55fe5 100644
--- a/ggml/src/ggml-openvino/openvino/translate_session.hpp
+++ b/ggml/src/ggml-openvino/openvino/translate_session.hpp
@@ -16,7 +16,7 @@ public:
     std::shared_ptr<ov::Model> translate_graph(const frontend::InputModel::Ptr& input_model);
 
 private:
-    void print_model_topology();
+    void apply_transformations(const std::shared_ptr<ov::Model>& model);
     const frontend::InputModel::Ptr m_input_model;
     const std::unordered_map& m_translator_map;
     std::shared_ptr<ov::Model> m_ov_model;
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index d20e671064..2620fa5615 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -9,10 +9,13 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
+#include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -28,11 +31,15 @@ std::shared_ptr<GgmlOvDecoder> get_ggml_decoder(struct ggml_cgraph* cgraph, bool
 }
 
 ov::Tensor convert_ggml_input_to_ov(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string& name) {
-    auto* input_data = ggml_decoder->get_input_ggml_tensor(name)->data;
-    ov::Tensor input_tensor;
-    ov::Shape input_shape = ggml_decoder->get_input_shape(name).to_shape();
-    std::vector input_stride = ggml_decoder->get_input_stride(name);
-    input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data);
+    const auto* ggml_tensor = ggml_decoder->get_input_ggml_tensor(name);
+    auto* input_data = ggml_tensor->data;
+    ov::Shape input_shape;
+    if (name.find("cache_k") == 0 || name.find("cache_v") == 0) {
+        input_shape = ggml_decoder->get_graph_input_shape(ggml_tensor).to_shape();
+    } else {
+        input_shape = ggml_decoder->get_input_shape(name).to_shape();
+    }
+    auto input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data);
     return input_tensor;
 }
 
@@ -82,41 +89,37 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
         core.set_property(ov::cache_dir(cache_dir));
     }
 
-    // CPU and GPU will only use cache_prefill
-    using CachedItem = std::pair<std::shared_ptr<ov::Model>, ov::CompiledModel>;
-    static std::unordered_map<struct ggml_cgraph*, CachedItem> compiled_cache_prefill;
-    static std::unordered_map<struct ggml_cgraph*, CachedItem> compiled_cache_kvcache;
+    static std::unordered_map<struct ggml_cgraph*, std::shared_ptr<ov::InferRequest>> infer_request_cache;
+    static std::unordered_map<struct ggml_cgraph*, std::vector<std::string>> ov_input_names_cache;
+    static std::unordered_map<struct ggml_cgraph*, std::vector<std::string>> ov_output_names_cache;
+    // For NPU, store the kvcache model, since we cannot create two infer_requests
+    static std::unordered_map<struct ggml_cgraph*, ov::CompiledModel> compiled_model_cache;
 
     std::shared_ptr<GgmlOvDecoder> ggml_decoder;
-    std::shared_ptr<ov::Model> model;
-    ov::CompiledModel compiled_model;
+    ov::InferRequest infer_request;
 
     int64_t decoder_end_time;
     int64_t conversion_end_time;
     int64_t compile_end_time;
 
-    bool is_first_token = is_prefill(cgraph);
-
-    auto it = compiled_cache_prefill.find(cgraph);
-    if (it != compiled_cache_prefill.end()) {
+    auto it = infer_request_cache.find(cgraph);
+    if (it != infer_request_cache.end()) {
         ggml_decoder = get_ggml_decoder(cgraph, is_static, false);
         decoder_end_time = ggml_time_us();
 
-        if (is_static) {
-            if (is_first_token) {
-                model = compiled_cache_prefill[cgraph].first;
-                compiled_model = compiled_cache_prefill[cgraph].second;
-            } else {
-                model = compiled_cache_kvcache[cgraph].first;
-                compiled_model = compiled_cache_kvcache[cgraph].second;
-            }
-        } else {
-            model = it->second.first;
-            compiled_model = it->second.second;
+        // For NPU: the first time the kvcache model is called, pop its compiled model from the cache
+        if (is_static && compiled_model_cache.find(cgraph) != compiled_model_cache.end()) {
+            infer_request_cache[cgraph] =
+                std::make_shared<ov::InferRequest>(compiled_model_cache[cgraph].create_infer_request());
+            compiled_model_cache.erase(cgraph);
         }
+        infer_request = *infer_request_cache[cgraph];
+
         conversion_end_time = ggml_time_us();
         compile_end_time = conversion_end_time;
     } else {
+        std::shared_ptr<ov::Model> model;
+
         if (is_static) {
             ggml_decoder = get_ggml_decoder(cgraph, is_static, true);
             auto ggml_decoder_kvcache = get_ggml_decoder(cgraph, is_static, false);
@@ -129,12 +132,13 @@
             auto model_kvcache = ov::frontend::ggml::FrontEnd::convert(input_model_kvcache);
             conversion_end_time = ggml_time_us();
 
-            compiled_model = core.compile_model(model, device, config);
+            auto compiled_model = core.compile_model(model, device, config);
             auto compiled_model_kvcache = core.compile_model(model_kvcache, device, config);
+            compiled_model_cache[cgraph] = compiled_model_kvcache;
             compile_end_time = ggml_time_us();
 
-            compiled_cache_prefill[cgraph] = std::make_pair(model, compiled_model);
-            compiled_cache_kvcache[cgraph] = std::make_pair(model_kvcache, compiled_model_kvcache);
+            infer_request_cache[cgraph] = std::make_shared<ov::InferRequest>(compiled_model.create_infer_request());
+            infer_request = *infer_request_cache[cgraph];
 
             if (getenv("GGML_OPENVINO_DUMP_IR")) {
                 char timestamped_filename[64];
@@ -152,9 +156,10 @@
             model = ov::frontend::ggml::FrontEnd::convert(input_model);
             conversion_end_time = ggml_time_us();
 
-            compiled_model = core.compile_model(model, device, config);
+            auto compiled_model = core.compile_model(model, device, config);
             compile_end_time = ggml_time_us();
-            compiled_cache_prefill[cgraph] = std::make_pair(model, compiled_model);
+            infer_request_cache[cgraph] = std::make_shared<ov::InferRequest>(compiled_model.create_infer_request());
+            infer_request = *infer_request_cache[cgraph];
 
             if (getenv("GGML_OPENVINO_DUMP_IR")) {
                 char timestamped_filename[64];
@@ -163,12 +168,23 @@
                 ov::serialize(model, timestamped_filename);
             }
         }
-    }
 
-    auto infer_request = compiled_model.create_infer_request();
-    auto ov_params = model->get_parameters();
-    for (size_t i = 0; i < ov_params.size(); i++) {
-        auto param_name = ov_params[i]->get_friendly_name();
+        std::vector<std::string> ov_input_names;
+        std::vector<std::string> ov_output_names;
+        for (const auto& ov_param : model->get_parameters()) {
+            ov_input_names.push_back(ov_param->get_friendly_name());
+        }
+        for (const auto& ov_output : model->get_results()) {
+            ov_output_names.push_back(ov_output->get_friendly_name());
+        }
+        ov_input_names_cache[cgraph] = ov_input_names;
+        ov_output_names_cache[cgraph] = ov_output_names;
+    }
+
+    const auto& ov_input_names = ov_input_names_cache[cgraph];
+    const auto& ov_output_names = ov_output_names_cache[cgraph];
+    for (size_t i = 0; i < ov_input_names.size(); i++) {
+        auto param_name = ov_input_names[i];
         auto input_tensor = get_ov_input_tensor(ggml_decoder, param_name);
         infer_request.set_input_tensor(i, input_tensor);
 
@@ -181,14 +197,15 @@
     infer_request.infer();
     auto infer_end_time = ggml_time_us();
 
-    auto output_names = ggml_decoder->get_model_output_names();
-    auto output_tensors = get_ggml_graph_output_dst(ggml_decoder);
-    for (size_t i = 0; i < output_names.size(); i++) {
-        auto output_tensor = infer_request.get_output_tensor(i);
-        std::memcpy(output_tensors[output_names[i]], output_tensor.data(), output_tensor.get_byte_size());
+    auto gguf_tensor_addrs = get_ggml_graph_output_dst(ggml_decoder);
+    for (size_t i = 0; i < ov_output_names.size(); i++) {
+        auto result_name = ov_output_names[i];
+        const auto output_tensor = infer_request.get_output_tensor(i);
+
+        std::memcpy(gguf_tensor_addrs[result_name], output_tensor.data(), output_tensor.get_byte_size());
 
         if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) {
-            print_output_tensor_info(output_names[i], output_tensor, output_tensors);
+            print_output_tensor_info(result_name, output_tensor, gguf_tensor_addrs);
        }
     }
     auto end_time = ggml_time_us();
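
Note: the transformation registered in translate_session.cpp is OpenVINO's stock ov::pass::MakeStateful, which replaces each matched Parameter/Result pair with ReadValue/Assign operations backed by an internal variable, so the KV-cache tensors live inside the model between inferences instead of being fed in and copied out on every call; the per-cgraph ov::InferRequest cached in utils.cpp is what keeps that state alive. A minimal, self-contained sketch of the same pattern (the toy graph, names, and shapes below are illustrative only, not what GgmlOvDecoder emits):

    #include <openvino/op/parameter.hpp>
    #include <openvino/op/relu.hpp>
    #include <openvino/op/result.hpp>
    #include <openvino/openvino.hpp>
    #include <openvino/pass/make_stateful.hpp>
    #include <openvino/pass/manager.hpp>

    int main() {
        // Toy stand-in for one KV-cache slot: a parameter whose next value is
        // produced by a result carrying a matching friendly name.
        auto cache = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{32, 8, 64});
        cache->set_friendly_name("cache_k_l0");
        auto next = std::make_shared<ov::op::v0::Relu>(cache);  // placeholder for the real cache-update subgraph
        auto res = std::make_shared<ov::op::v0::Result>(next);
        res->set_friendly_name("cache_k_l0");
        auto model = std::make_shared<ov::Model>(ov::ResultVector{res}, ov::ParameterVector{cache});

        // Pair each parameter with the result that computes its next value, then
        // run the pass; the pair disappears from the model I/O and becomes state.
        ov::pass::MakeStateful::ParamResPairs pairs{{cache, res}};
        ov::pass::Manager manager;
        manager.register_pass<ov::pass::MakeStateful>(pairs);
        manager.run_passes(model);
        return 0;
    }

After compilation, the state persists across calls on the same ov::InferRequest and can be inspected or reset through infer_request.query_state(), which is why the cache-hit path above reuses one request per cgraph rather than creating a fresh one each call.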