This commit is contained in:
Yu, Zijun 2025-07-23 15:37:58 +08:00 committed by Mustafa Cavus
parent 44f4cf34b1
commit 6dc4b90635
4 changed files with 56 additions and 52 deletions

View File

@ -7,7 +7,6 @@ PointerAlignment: Left
Cpp11BracedListStyle: true Cpp11BracedListStyle: true
AccessModifierOffset: -4 AccessModifierOffset: -4
BinPackArguments: false BinPackArguments: false
BinPackParameters: false
BreakBeforeBraces: Attach BreakBeforeBraces: Attach
Language: Cpp Language: Cpp
@ -31,6 +30,7 @@ AllowShortIfStatementsOnASingleLine: Never
AllowShortLambdasOnASingleLine: Inline AllowShortLambdasOnASingleLine: Inline
AllowShortLoopsOnASingleLine: false AllowShortLoopsOnASingleLine: false
AlwaysBreakBeforeMultilineStrings: true AlwaysBreakBeforeMultilineStrings: true
BinPackParameters: true
BitFieldColonSpacing: Both BitFieldColonSpacing: Both
# BreakAdjacentStringLiterals: true # BreakAdjacentStringLiterals: true
BreakAfterAttributes: Never BreakAfterAttributes: Never

View File

@ -31,47 +31,45 @@
GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token, GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token,
int context_size, int num_heads, int num_heads_kv, int head_size) : int context_size, int num_heads, int num_heads_kv, int head_size) :
GgmlOvDecoder::GgmlOvDecoder(node, cgraph, is_static, is_first_token) {
m_context_size = context_size;
m_num_heads = num_heads;
m_num_heads_kv = num_heads_kv;
m_head_size = head_size;
}
GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static,
bool is_first_token) :
m_cgraph(cgraph), m_cgraph(cgraph),
m_node(node), m_node(node),
m_op_name(m_node ? std::string(m_node->name) : "NONE_OP"), m_op_name(std::string(node->name)),
m_context_size(context_size),
m_num_heads(num_heads),
m_num_heads_kv(num_heads_kv),
m_head_size(head_size),
m_is_static(is_static), m_is_static(is_static),
m_is_first_token(is_first_token) { m_is_first_token(is_first_token) {
if (m_node) { set_input_output(node);
set_input_output(m_node); }
} else {
if (is_first_token && getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) {
print_tensor_address_map(cgraph);
}
if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph,
auto timestamp = (long long) ggml_time_us(); std::map<std::string, std::shared_ptr<ov::Node>>& model_weights, bool is_static,
std::string filename = "cgraph_" + std::to_string(timestamp) + ".txt"; bool is_first_token) :
dump_cgraph(cgraph, filename); m_cgraph(cgraph),
} m_op_name(m_node ? std::string(m_node->name) : ""),
m_model_weights(model_weights),
set_llm_params(); m_is_static(is_static),
m_is_first_token(is_first_token) {
if (is_first_token) { if (is_first_token && getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) {
add_weight_const_parallel(m_model_weights); print_tensor_address_map(cgraph);
}
for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
auto* cur_node = cgraph->nodes[node_n];
m_nodes.push_back(cur_node);
set_input_output(cur_node);
}
add_extra_inputs();
} }
if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) {
auto timestamp = (long long) ggml_time_us();
std::string filename = "cgraph_" + std::to_string(timestamp) + ".txt";
dump_cgraph(cgraph, filename);
}
set_llm_params();
for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
auto* cur_node = cgraph->nodes[node_n];
m_nodes.push_back(cur_node);
set_input_output(cur_node);
}
add_extra_inputs();
} }
GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph) { GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph) {
@ -334,10 +332,11 @@ std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const
return kv_param_res_names; return kv_param_res_names;
} }
void GgmlOvDecoder::add_weight_const_parallel(std::map<std::string, std::shared_ptr<ov::Node>>& model_weights) { std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(struct ggml_cgraph* cgraph) {
std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
static std::mutex weights_mutex; static std::mutex weights_mutex;
auto* nodes = m_cgraph->nodes; auto* nodes = cgraph->nodes;
auto n_nodes = m_cgraph->n_nodes; auto n_nodes = cgraph->n_nodes;
std::for_each(std::execution::par, nodes, nodes + n_nodes, [&](ggml_tensor* node) { std::for_each(std::execution::par, nodes, nodes + n_nodes, [&](ggml_tensor* node) {
for (int i = 0; i < GGML_MAX_SRC; i++) { for (int i = 0; i < GGML_MAX_SRC; i++) {
auto* src = node->src[i]; auto* src = node->src[i];
@ -369,6 +368,7 @@ void GgmlOvDecoder::add_weight_const_parallel(std::map<std::string, std::shared_
} }
} }
}); });
return model_weights;
} }
std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) { std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) {

View File

@ -11,12 +11,17 @@
class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
public: public:
GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token); // Graph decoder
GgmlOvDecoder(struct ggml_cgraph* cgraph, std::map<std::string, std::shared_ptr<ov::Node>>& model_weights,
bool is_static, bool is_first_token);
// Node decoder, called in GgmlOvDecoder::visit_subgraph
GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token, GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token,
int context_size, int num_heads, int num_heads_kv, int head_size); int context_size, int num_heads, int num_heads_kv, int head_size);
// Naive decoder // Naive graph decoder
GgmlOvDecoder(struct ggml_cgraph* cgraph); GgmlOvDecoder(struct ggml_cgraph* cgraph);
virtual ov::Any get_attribute(const std::string& name) const override { virtual ov::Any get_attribute(const std::string& name) const override {
return nullptr; return nullptr;
GGML_UNUSED(name); GGML_UNUSED(name);
@ -110,6 +115,8 @@ public:
ov::PartialShape get_graph_input_shape(const ggml_tensor* src) const; ov::PartialShape get_graph_input_shape(const ggml_tensor* src) const;
static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor);
static std::map<std::string, std::shared_ptr<ov::Node>> create_weight_nodes(struct ggml_cgraph* cgraph);
void clear_model_weights() { m_model_weights.clear(); } void clear_model_weights() { m_model_weights.clear(); }
private: private:
@ -123,9 +130,6 @@ private:
// set context_size, num_heads, etc // set context_size, num_heads, etc
void set_llm_params(); void set_llm_params();
static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor);
void add_weight_const_parallel(std::map<std::string, std::shared_ptr<ov::Node>>& model_weights);
struct ggml_cgraph* m_cgraph = nullptr; struct ggml_cgraph* m_cgraph = nullptr;
ggml_tensor* m_node = nullptr; ggml_tensor* m_node = nullptr;
std::vector<ggml_tensor*> m_nodes; std::vector<ggml_tensor*> m_nodes;

View File

@ -26,10 +26,6 @@
#include "openvino/frontend.hpp" #include "openvino/frontend.hpp"
#include "openvino/input_model.hpp" #include "openvino/input_model.hpp"
std::shared_ptr<GgmlOvDecoder> get_ggml_decoder(struct ggml_cgraph* cgraph, bool is_static, bool is_first_token) {
return std::make_shared<GgmlOvDecoder>(nullptr, cgraph, is_static, is_first_token);
}
ov::Tensor convert_ggml_input_to_ov(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string& name) { ov::Tensor convert_ggml_input_to_ov(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string& name) {
const auto* ggml_tensor = ggml_decoder->get_input_ggml_tensor(name); const auto* ggml_tensor = ggml_decoder->get_input_ggml_tensor(name);
auto* input_data = ggml_tensor->data; auto* input_data = ggml_tensor->data;
@ -111,7 +107,8 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
auto it = infer_request_cache.find(cgraph); auto it = infer_request_cache.find(cgraph);
if (it != infer_request_cache.end()) { if (it != infer_request_cache.end()) {
ggml_decoder = get_ggml_decoder(cgraph, is_static, false); std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, model_weights, is_static, false);
decoder_end_time = ggml_time_us(); decoder_end_time = ggml_time_us();
// For NPU, the first time we call the kvcache model, pop the compiled kvcache model from the cache
@ -126,17 +123,20 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
compile_end_time = conversion_end_time; compile_end_time = conversion_end_time;
} else { } else {
std::shared_ptr<ov::Model> model; std::shared_ptr<ov::Model> model;
auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);
if (is_static) { if (is_static) {
ggml_decoder = get_ggml_decoder(cgraph, is_static, true); ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, model_weights, is_static, true);
auto ggml_decoder_kvcache = get_ggml_decoder(cgraph, is_static, false); auto ggml_decoder_kvcache = std::make_shared<GgmlOvDecoder>(cgraph, model_weights, is_static, false);
decoder_end_time = ggml_time_us(); decoder_end_time = ggml_time_us();
auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder); auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder);
auto input_model_kvcache = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder_kvcache); auto input_model_kvcache = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder_kvcache);
model = ov::frontend::ggml::FrontEnd::convert(input_model); model = ov::frontend::ggml::FrontEnd::convert(input_model);
ggml_decoder->clear_model_weights();
auto model_kvcache = ov::frontend::ggml::FrontEnd::convert(input_model_kvcache); auto model_kvcache = ov::frontend::ggml::FrontEnd::convert(input_model_kvcache);
ggml_decoder_kvcache->clear_model_weights();
conversion_end_time = ggml_time_us(); conversion_end_time = ggml_time_us();
auto compiled_model = core.compile_model(model, device, config); auto compiled_model = core.compile_model(model, device, config);
@ -157,7 +157,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
ov::serialize(model_kvcache, timestamped_filename); ov::serialize(model_kvcache, timestamped_filename);
} }
} else { } else {
ggml_decoder = get_ggml_decoder(cgraph, is_static, true); ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, model_weights, is_static, true);
decoder_end_time = ggml_time_us(); decoder_end_time = ggml_time_us();
auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder); auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder);