This commit is contained in:
Yu, Zijun 2025-07-23 15:37:58 +08:00 committed by Mustafa Cavus
parent 44f4cf34b1
commit 6dc4b90635
4 changed files with 56 additions and 52 deletions

View File

@ -7,7 +7,6 @@ PointerAlignment: Left
Cpp11BracedListStyle: true
AccessModifierOffset: -4
BinPackArguments: false
BinPackParameters: false
BreakBeforeBraces: Attach
Language: Cpp
@ -31,6 +30,7 @@ AllowShortIfStatementsOnASingleLine: Never
AllowShortLambdasOnASingleLine: Inline
AllowShortLoopsOnASingleLine: false
AlwaysBreakBeforeMultilineStrings: true
BinPackParameters: true
BitFieldColonSpacing: Both
# BreakAdjacentStringLiterals: true
BreakAfterAttributes: Never

View File

@ -31,47 +31,45 @@
GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token,
int context_size, int num_heads, int num_heads_kv, int head_size) :
GgmlOvDecoder::GgmlOvDecoder(node, cgraph, is_static, is_first_token) {
m_context_size = context_size;
m_num_heads = num_heads;
m_num_heads_kv = num_heads_kv;
m_head_size = head_size;
}
GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static,
bool is_first_token) :
m_cgraph(cgraph),
m_node(node),
m_op_name(m_node ? std::string(m_node->name) : "NONE_OP"),
m_op_name(std::string(node->name)),
m_context_size(context_size),
m_num_heads(num_heads),
m_num_heads_kv(num_heads_kv),
m_head_size(head_size),
m_is_static(is_static),
m_is_first_token(is_first_token) {
if (m_node) {
set_input_output(m_node);
} else {
if (is_first_token && getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) {
print_tensor_address_map(cgraph);
}
set_input_output(node);
}
if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) {
auto timestamp = (long long) ggml_time_us();
std::string filename = "cgraph_" + std::to_string(timestamp) + ".txt";
dump_cgraph(cgraph, filename);
}
set_llm_params();
if (is_first_token) {
add_weight_const_parallel(m_model_weights);
}
for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
auto* cur_node = cgraph->nodes[node_n];
m_nodes.push_back(cur_node);
set_input_output(cur_node);
}
add_extra_inputs();
GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph,
std::map<std::string, std::shared_ptr<ov::Node>>& model_weights, bool is_static,
bool is_first_token) :
m_cgraph(cgraph),
m_op_name(m_node ? std::string(m_node->name) : ""),
m_model_weights(model_weights),
m_is_static(is_static),
m_is_first_token(is_first_token) {
if (is_first_token && getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) {
print_tensor_address_map(cgraph);
}
if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) {
auto timestamp = (long long) ggml_time_us();
std::string filename = "cgraph_" + std::to_string(timestamp) + ".txt";
dump_cgraph(cgraph, filename);
}
set_llm_params();
for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
auto* cur_node = cgraph->nodes[node_n];
m_nodes.push_back(cur_node);
set_input_output(cur_node);
}
add_extra_inputs();
}
GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph) {
@ -334,10 +332,11 @@ std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const
return kv_param_res_names;
}
void GgmlOvDecoder::add_weight_const_parallel(std::map<std::string, std::shared_ptr<ov::Node>>& model_weights) {
std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(struct ggml_cgraph* cgraph) {
std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
static std::mutex weights_mutex;
auto* nodes = m_cgraph->nodes;
auto n_nodes = m_cgraph->n_nodes;
auto* nodes = cgraph->nodes;
auto n_nodes = cgraph->n_nodes;
std::for_each(std::execution::par, nodes, nodes + n_nodes, [&](ggml_tensor* node) {
for (int i = 0; i < GGML_MAX_SRC; i++) {
auto* src = node->src[i];
@ -369,6 +368,7 @@ void GgmlOvDecoder::add_weight_const_parallel(std::map<std::string, std::shared_
}
}
});
return model_weights;
}
std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) {

View File

@ -11,12 +11,17 @@
class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
public:
GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token);
// Graph decoder
GgmlOvDecoder(struct ggml_cgraph* cgraph, std::map<std::string, std::shared_ptr<ov::Node>>& model_weights,
bool is_static, bool is_first_token);
// Node decoder, called in GgmlOvDecoder::visit_subgraph
GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token,
int context_size, int num_heads, int num_heads_kv, int head_size);
// Naive decoder
// Naive graph decoder
GgmlOvDecoder(struct ggml_cgraph* cgraph);
virtual ov::Any get_attribute(const std::string& name) const override {
return nullptr;
GGML_UNUSED(name);
@ -110,6 +115,8 @@ public:
ov::PartialShape get_graph_input_shape(const ggml_tensor* src) const;
static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor);
static std::map<std::string, std::shared_ptr<ov::Node>> create_weight_nodes(struct ggml_cgraph* cgraph);
void clear_model_weights() { m_model_weights.clear(); }
private:
@ -123,9 +130,6 @@ private:
    // set context_size, num_heads, etc.
void set_llm_params();
static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor);
void add_weight_const_parallel(std::map<std::string, std::shared_ptr<ov::Node>>& model_weights);
struct ggml_cgraph* m_cgraph = nullptr;
ggml_tensor* m_node = nullptr;
std::vector<ggml_tensor*> m_nodes;

View File

@ -26,10 +26,6 @@
#include "openvino/frontend.hpp"
#include "openvino/input_model.hpp"
std::shared_ptr<GgmlOvDecoder> get_ggml_decoder(struct ggml_cgraph* cgraph, bool is_static, bool is_first_token) {
return std::make_shared<GgmlOvDecoder>(nullptr, cgraph, is_static, is_first_token);
}
ov::Tensor convert_ggml_input_to_ov(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string& name) {
const auto* ggml_tensor = ggml_decoder->get_input_ggml_tensor(name);
auto* input_data = ggml_tensor->data;
@ -111,7 +107,8 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
auto it = infer_request_cache.find(cgraph);
if (it != infer_request_cache.end()) {
ggml_decoder = get_ggml_decoder(cgraph, is_static, false);
std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, model_weights, is_static, false);
decoder_end_time = ggml_time_us();
    // For NPU, the first time we call the kvcache model, pop the compiled kvcache model from the cache
@ -126,17 +123,20 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
compile_end_time = conversion_end_time;
} else {
std::shared_ptr<ov::Model> model;
auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);
if (is_static) {
ggml_decoder = get_ggml_decoder(cgraph, is_static, true);
auto ggml_decoder_kvcache = get_ggml_decoder(cgraph, is_static, false);
ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, model_weights, is_static, true);
auto ggml_decoder_kvcache = std::make_shared<GgmlOvDecoder>(cgraph, model_weights, is_static, false);
decoder_end_time = ggml_time_us();
auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder);
auto input_model_kvcache = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder_kvcache);
model = ov::frontend::ggml::FrontEnd::convert(input_model);
ggml_decoder->clear_model_weights();
auto model_kvcache = ov::frontend::ggml::FrontEnd::convert(input_model_kvcache);
ggml_decoder_kvcache->clear_model_weights();
conversion_end_time = ggml_time_us();
auto compiled_model = core.compile_model(model, device, config);
@ -157,7 +157,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
ov::serialize(model_kvcache, timestamped_filename);
}
} else {
ggml_decoder = get_ggml_decoder(cgraph, is_static, true);
ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, model_weights, is_static, true);
decoder_end_time = ggml_time_us();
auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder);