Fix NPU
This commit is contained in:
parent
44f4cf34b1
commit
6dc4b90635
|
|
@ -7,7 +7,6 @@ PointerAlignment: Left
|
||||||
Cpp11BracedListStyle: true
|
Cpp11BracedListStyle: true
|
||||||
AccessModifierOffset: -4
|
AccessModifierOffset: -4
|
||||||
BinPackArguments: false
|
BinPackArguments: false
|
||||||
BinPackParameters: false
|
|
||||||
BreakBeforeBraces: Attach
|
BreakBeforeBraces: Attach
|
||||||
|
|
||||||
Language: Cpp
|
Language: Cpp
|
||||||
|
|
@ -31,6 +30,7 @@ AllowShortIfStatementsOnASingleLine: Never
|
||||||
AllowShortLambdasOnASingleLine: Inline
|
AllowShortLambdasOnASingleLine: Inline
|
||||||
AllowShortLoopsOnASingleLine: false
|
AllowShortLoopsOnASingleLine: false
|
||||||
AlwaysBreakBeforeMultilineStrings: true
|
AlwaysBreakBeforeMultilineStrings: true
|
||||||
|
BinPackParameters: true
|
||||||
BitFieldColonSpacing: Both
|
BitFieldColonSpacing: Both
|
||||||
# BreakAdjacentStringLiterals: true
|
# BreakAdjacentStringLiterals: true
|
||||||
BreakAfterAttributes: Never
|
BreakAfterAttributes: Never
|
||||||
|
|
|
||||||
|
|
@ -31,47 +31,45 @@
|
||||||
|
|
||||||
GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token,
|
GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token,
|
||||||
int context_size, int num_heads, int num_heads_kv, int head_size) :
|
int context_size, int num_heads, int num_heads_kv, int head_size) :
|
||||||
GgmlOvDecoder::GgmlOvDecoder(node, cgraph, is_static, is_first_token) {
|
|
||||||
m_context_size = context_size;
|
|
||||||
m_num_heads = num_heads;
|
|
||||||
m_num_heads_kv = num_heads_kv;
|
|
||||||
m_head_size = head_size;
|
|
||||||
}
|
|
||||||
|
|
||||||
GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static,
|
|
||||||
bool is_first_token) :
|
|
||||||
m_cgraph(cgraph),
|
m_cgraph(cgraph),
|
||||||
m_node(node),
|
m_node(node),
|
||||||
m_op_name(m_node ? std::string(m_node->name) : "NONE_OP"),
|
m_op_name(std::string(node->name)),
|
||||||
|
m_context_size(context_size),
|
||||||
|
m_num_heads(num_heads),
|
||||||
|
m_num_heads_kv(num_heads_kv),
|
||||||
|
m_head_size(head_size),
|
||||||
m_is_static(is_static),
|
m_is_static(is_static),
|
||||||
m_is_first_token(is_first_token) {
|
m_is_first_token(is_first_token) {
|
||||||
if (m_node) {
|
set_input_output(node);
|
||||||
set_input_output(m_node);
|
}
|
||||||
} else {
|
|
||||||
if (is_first_token && getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) {
|
|
||||||
print_tensor_address_map(cgraph);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) {
|
GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph,
|
||||||
auto timestamp = (long long) ggml_time_us();
|
std::map<std::string, std::shared_ptr<ov::Node>>& model_weights, bool is_static,
|
||||||
std::string filename = "cgraph_" + std::to_string(timestamp) + ".txt";
|
bool is_first_token) :
|
||||||
dump_cgraph(cgraph, filename);
|
m_cgraph(cgraph),
|
||||||
}
|
m_op_name(m_node ? std::string(m_node->name) : ""),
|
||||||
|
m_model_weights(model_weights),
|
||||||
set_llm_params();
|
m_is_static(is_static),
|
||||||
|
m_is_first_token(is_first_token) {
|
||||||
if (is_first_token) {
|
if (is_first_token && getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) {
|
||||||
add_weight_const_parallel(m_model_weights);
|
print_tensor_address_map(cgraph);
|
||||||
}
|
|
||||||
|
|
||||||
for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
|
|
||||||
auto* cur_node = cgraph->nodes[node_n];
|
|
||||||
m_nodes.push_back(cur_node);
|
|
||||||
set_input_output(cur_node);
|
|
||||||
}
|
|
||||||
|
|
||||||
add_extra_inputs();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) {
|
||||||
|
auto timestamp = (long long) ggml_time_us();
|
||||||
|
std::string filename = "cgraph_" + std::to_string(timestamp) + ".txt";
|
||||||
|
dump_cgraph(cgraph, filename);
|
||||||
|
}
|
||||||
|
|
||||||
|
set_llm_params();
|
||||||
|
|
||||||
|
for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
|
||||||
|
auto* cur_node = cgraph->nodes[node_n];
|
||||||
|
m_nodes.push_back(cur_node);
|
||||||
|
set_input_output(cur_node);
|
||||||
|
}
|
||||||
|
|
||||||
|
add_extra_inputs();
|
||||||
}
|
}
|
||||||
|
|
||||||
GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph) {
|
GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph) {
|
||||||
|
|
@ -334,10 +332,11 @@ std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const
|
||||||
return kv_param_res_names;
|
return kv_param_res_names;
|
||||||
}
|
}
|
||||||
|
|
||||||
void GgmlOvDecoder::add_weight_const_parallel(std::map<std::string, std::shared_ptr<ov::Node>>& model_weights) {
|
std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(struct ggml_cgraph* cgraph) {
|
||||||
|
std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
|
||||||
static std::mutex weights_mutex;
|
static std::mutex weights_mutex;
|
||||||
auto* nodes = m_cgraph->nodes;
|
auto* nodes = cgraph->nodes;
|
||||||
auto n_nodes = m_cgraph->n_nodes;
|
auto n_nodes = cgraph->n_nodes;
|
||||||
std::for_each(std::execution::par, nodes, nodes + n_nodes, [&](ggml_tensor* node) {
|
std::for_each(std::execution::par, nodes, nodes + n_nodes, [&](ggml_tensor* node) {
|
||||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||||
auto* src = node->src[i];
|
auto* src = node->src[i];
|
||||||
|
|
@ -369,6 +368,7 @@ void GgmlOvDecoder::add_weight_const_parallel(std::map<std::string, std::shared_
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
return model_weights;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) {
|
std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) {
|
||||||
|
|
|
||||||
|
|
@ -11,12 +11,17 @@
|
||||||
|
|
||||||
class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
|
class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
|
||||||
public:
|
public:
|
||||||
GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token);
|
// Graph decoder
|
||||||
|
GgmlOvDecoder(struct ggml_cgraph* cgraph, std::map<std::string, std::shared_ptr<ov::Node>>& model_weights,
|
||||||
|
bool is_static, bool is_first_token);
|
||||||
|
|
||||||
|
// Node decoder, called in GgmlOvDecoder::visit_subgraph
|
||||||
GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token,
|
GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token,
|
||||||
int context_size, int num_heads, int num_heads_kv, int head_size);
|
int context_size, int num_heads, int num_heads_kv, int head_size);
|
||||||
|
|
||||||
// Naive decoder
|
// Naive graph decoder
|
||||||
GgmlOvDecoder(struct ggml_cgraph* cgraph);
|
GgmlOvDecoder(struct ggml_cgraph* cgraph);
|
||||||
|
|
||||||
virtual ov::Any get_attribute(const std::string& name) const override {
|
virtual ov::Any get_attribute(const std::string& name) const override {
|
||||||
return nullptr;
|
return nullptr;
|
||||||
GGML_UNUSED(name);
|
GGML_UNUSED(name);
|
||||||
|
|
@ -110,6 +115,8 @@ public:
|
||||||
|
|
||||||
ov::PartialShape get_graph_input_shape(const ggml_tensor* src) const;
|
ov::PartialShape get_graph_input_shape(const ggml_tensor* src) const;
|
||||||
|
|
||||||
|
static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor);
|
||||||
|
static std::map<std::string, std::shared_ptr<ov::Node>> create_weight_nodes(struct ggml_cgraph* cgraph);
|
||||||
void clear_model_weights() { m_model_weights.clear(); }
|
void clear_model_weights() { m_model_weights.clear(); }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
@ -123,9 +130,6 @@ private:
|
||||||
// set context_size, num_heads, etc
|
// set context_size, num_heads, etc
|
||||||
void set_llm_params();
|
void set_llm_params();
|
||||||
|
|
||||||
static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor);
|
|
||||||
void add_weight_const_parallel(std::map<std::string, std::shared_ptr<ov::Node>>& model_weights);
|
|
||||||
|
|
||||||
struct ggml_cgraph* m_cgraph = nullptr;
|
struct ggml_cgraph* m_cgraph = nullptr;
|
||||||
ggml_tensor* m_node = nullptr;
|
ggml_tensor* m_node = nullptr;
|
||||||
std::vector<ggml_tensor*> m_nodes;
|
std::vector<ggml_tensor*> m_nodes;
|
||||||
|
|
|
||||||
|
|
@ -26,10 +26,6 @@
|
||||||
#include "openvino/frontend.hpp"
|
#include "openvino/frontend.hpp"
|
||||||
#include "openvino/input_model.hpp"
|
#include "openvino/input_model.hpp"
|
||||||
|
|
||||||
std::shared_ptr<GgmlOvDecoder> get_ggml_decoder(struct ggml_cgraph* cgraph, bool is_static, bool is_first_token) {
|
|
||||||
return std::make_shared<GgmlOvDecoder>(nullptr, cgraph, is_static, is_first_token);
|
|
||||||
}
|
|
||||||
|
|
||||||
ov::Tensor convert_ggml_input_to_ov(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string& name) {
|
ov::Tensor convert_ggml_input_to_ov(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string& name) {
|
||||||
const auto* ggml_tensor = ggml_decoder->get_input_ggml_tensor(name);
|
const auto* ggml_tensor = ggml_decoder->get_input_ggml_tensor(name);
|
||||||
auto* input_data = ggml_tensor->data;
|
auto* input_data = ggml_tensor->data;
|
||||||
|
|
@ -111,7 +107,8 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
|
||||||
|
|
||||||
auto it = infer_request_cache.find(cgraph);
|
auto it = infer_request_cache.find(cgraph);
|
||||||
if (it != infer_request_cache.end()) {
|
if (it != infer_request_cache.end()) {
|
||||||
ggml_decoder = get_ggml_decoder(cgraph, is_static, false);
|
std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
|
||||||
|
ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, model_weights, is_static, false);
|
||||||
decoder_end_time = ggml_time_us();
|
decoder_end_time = ggml_time_us();
|
||||||
|
|
||||||
// For NPU, the first time we call the kvcache model, pop the compiled kvcache model from the cache
|
// For NPU, the first time we call the kvcache model, pop the compiled kvcache model from the cache
|
||||||
|
|
@ -126,17 +123,20 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
|
||||||
compile_end_time = conversion_end_time;
|
compile_end_time = conversion_end_time;
|
||||||
} else {
|
} else {
|
||||||
std::shared_ptr<ov::Model> model;
|
std::shared_ptr<ov::Model> model;
|
||||||
|
auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);
|
||||||
|
|
||||||
if (is_static) {
|
if (is_static) {
|
||||||
ggml_decoder = get_ggml_decoder(cgraph, is_static, true);
|
ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, model_weights, is_static, true);
|
||||||
auto ggml_decoder_kvcache = get_ggml_decoder(cgraph, is_static, false);
|
auto ggml_decoder_kvcache = std::make_shared<GgmlOvDecoder>(cgraph, model_weights, is_static, false);
|
||||||
decoder_end_time = ggml_time_us();
|
decoder_end_time = ggml_time_us();
|
||||||
|
|
||||||
auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder);
|
auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder);
|
||||||
auto input_model_kvcache = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder_kvcache);
|
auto input_model_kvcache = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder_kvcache);
|
||||||
|
|
||||||
model = ov::frontend::ggml::FrontEnd::convert(input_model);
|
model = ov::frontend::ggml::FrontEnd::convert(input_model);
|
||||||
|
ggml_decoder->clear_model_weights();
|
||||||
auto model_kvcache = ov::frontend::ggml::FrontEnd::convert(input_model_kvcache);
|
auto model_kvcache = ov::frontend::ggml::FrontEnd::convert(input_model_kvcache);
|
||||||
|
ggml_decoder_kvcache->clear_model_weights();
|
||||||
conversion_end_time = ggml_time_us();
|
conversion_end_time = ggml_time_us();
|
||||||
|
|
||||||
auto compiled_model = core.compile_model(model, device, config);
|
auto compiled_model = core.compile_model(model, device, config);
|
||||||
|
|
@ -157,7 +157,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
|
||||||
ov::serialize(model_kvcache, timestamped_filename);
|
ov::serialize(model_kvcache, timestamped_filename);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
ggml_decoder = get_ggml_decoder(cgraph, is_static, true);
|
ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, model_weights, is_static, true);
|
||||||
decoder_end_time = ggml_time_us();
|
decoder_end_time = ggml_time_us();
|
||||||
|
|
||||||
auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder);
|
auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder);
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue