Fix NPU
This commit is contained in:
parent
44f4cf34b1
commit
6dc4b90635
|
|
@ -7,7 +7,6 @@ PointerAlignment: Left
|
|||
Cpp11BracedListStyle: true
|
||||
AccessModifierOffset: -4
|
||||
BinPackArguments: false
|
||||
BinPackParameters: false
|
||||
BreakBeforeBraces: Attach
|
||||
|
||||
Language: Cpp
|
||||
|
|
@ -31,6 +30,7 @@ AllowShortIfStatementsOnASingleLine: Never
|
|||
AllowShortLambdasOnASingleLine: Inline
|
||||
AllowShortLoopsOnASingleLine: false
|
||||
AlwaysBreakBeforeMultilineStrings: true
|
||||
BinPackParameters: true
|
||||
BitFieldColonSpacing: Both
|
||||
# BreakAdjacentStringLiterals: true
|
||||
BreakAfterAttributes: Never
|
||||
|
|
|
|||
|
|
@ -31,47 +31,45 @@
|
|||
|
||||
GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token,
|
||||
int context_size, int num_heads, int num_heads_kv, int head_size) :
|
||||
GgmlOvDecoder::GgmlOvDecoder(node, cgraph, is_static, is_first_token) {
|
||||
m_context_size = context_size;
|
||||
m_num_heads = num_heads;
|
||||
m_num_heads_kv = num_heads_kv;
|
||||
m_head_size = head_size;
|
||||
}
|
||||
|
||||
GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static,
|
||||
bool is_first_token) :
|
||||
m_cgraph(cgraph),
|
||||
m_node(node),
|
||||
m_op_name(m_node ? std::string(m_node->name) : "NONE_OP"),
|
||||
m_op_name(std::string(node->name)),
|
||||
m_context_size(context_size),
|
||||
m_num_heads(num_heads),
|
||||
m_num_heads_kv(num_heads_kv),
|
||||
m_head_size(head_size),
|
||||
m_is_static(is_static),
|
||||
m_is_first_token(is_first_token) {
|
||||
if (m_node) {
|
||||
set_input_output(m_node);
|
||||
} else {
|
||||
if (is_first_token && getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) {
|
||||
print_tensor_address_map(cgraph);
|
||||
}
|
||||
set_input_output(node);
|
||||
}
|
||||
|
||||
if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) {
|
||||
auto timestamp = (long long) ggml_time_us();
|
||||
std::string filename = "cgraph_" + std::to_string(timestamp) + ".txt";
|
||||
dump_cgraph(cgraph, filename);
|
||||
}
|
||||
|
||||
set_llm_params();
|
||||
|
||||
if (is_first_token) {
|
||||
add_weight_const_parallel(m_model_weights);
|
||||
}
|
||||
|
||||
for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
|
||||
auto* cur_node = cgraph->nodes[node_n];
|
||||
m_nodes.push_back(cur_node);
|
||||
set_input_output(cur_node);
|
||||
}
|
||||
|
||||
add_extra_inputs();
|
||||
GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph,
|
||||
std::map<std::string, std::shared_ptr<ov::Node>>& model_weights, bool is_static,
|
||||
bool is_first_token) :
|
||||
m_cgraph(cgraph),
|
||||
m_op_name(m_node ? std::string(m_node->name) : ""),
|
||||
m_model_weights(model_weights),
|
||||
m_is_static(is_static),
|
||||
m_is_first_token(is_first_token) {
|
||||
if (is_first_token && getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) {
|
||||
print_tensor_address_map(cgraph);
|
||||
}
|
||||
|
||||
if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) {
|
||||
auto timestamp = (long long) ggml_time_us();
|
||||
std::string filename = "cgraph_" + std::to_string(timestamp) + ".txt";
|
||||
dump_cgraph(cgraph, filename);
|
||||
}
|
||||
|
||||
set_llm_params();
|
||||
|
||||
for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
|
||||
auto* cur_node = cgraph->nodes[node_n];
|
||||
m_nodes.push_back(cur_node);
|
||||
set_input_output(cur_node);
|
||||
}
|
||||
|
||||
add_extra_inputs();
|
||||
}
|
||||
|
||||
GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph) {
|
||||
|
|
@ -334,10 +332,11 @@ std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const
|
|||
return kv_param_res_names;
|
||||
}
|
||||
|
||||
void GgmlOvDecoder::add_weight_const_parallel(std::map<std::string, std::shared_ptr<ov::Node>>& model_weights) {
|
||||
std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(struct ggml_cgraph* cgraph) {
|
||||
std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
|
||||
static std::mutex weights_mutex;
|
||||
auto* nodes = m_cgraph->nodes;
|
||||
auto n_nodes = m_cgraph->n_nodes;
|
||||
auto* nodes = cgraph->nodes;
|
||||
auto n_nodes = cgraph->n_nodes;
|
||||
std::for_each(std::execution::par, nodes, nodes + n_nodes, [&](ggml_tensor* node) {
|
||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||
auto* src = node->src[i];
|
||||
|
|
@ -369,6 +368,7 @@ void GgmlOvDecoder::add_weight_const_parallel(std::map<std::string, std::shared_
|
|||
}
|
||||
}
|
||||
});
|
||||
return model_weights;
|
||||
}
|
||||
|
||||
std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) {
|
||||
|
|
|
|||
|
|
@ -11,12 +11,17 @@
|
|||
|
||||
class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
|
||||
public:
|
||||
GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token);
|
||||
// Graph decoder
|
||||
GgmlOvDecoder(struct ggml_cgraph* cgraph, std::map<std::string, std::shared_ptr<ov::Node>>& model_weights,
|
||||
bool is_static, bool is_first_token);
|
||||
|
||||
// Node decoder, called in GgmlOvDecoder::visit_subgraph
|
||||
GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token,
|
||||
int context_size, int num_heads, int num_heads_kv, int head_size);
|
||||
|
||||
// Naive decoder
|
||||
// Naive graph decoder
|
||||
GgmlOvDecoder(struct ggml_cgraph* cgraph);
|
||||
|
||||
virtual ov::Any get_attribute(const std::string& name) const override {
|
||||
return nullptr;
|
||||
GGML_UNUSED(name);
|
||||
|
|
@ -110,6 +115,8 @@ public:
|
|||
|
||||
ov::PartialShape get_graph_input_shape(const ggml_tensor* src) const;
|
||||
|
||||
static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor);
|
||||
static std::map<std::string, std::shared_ptr<ov::Node>> create_weight_nodes(struct ggml_cgraph* cgraph);
|
||||
void clear_model_weights() { m_model_weights.clear(); }
|
||||
|
||||
private:
|
||||
|
|
@ -123,9 +130,6 @@ private:
|
|||
// set context_size, num_heads, etc
|
||||
void set_llm_params();
|
||||
|
||||
static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor);
|
||||
void add_weight_const_parallel(std::map<std::string, std::shared_ptr<ov::Node>>& model_weights);
|
||||
|
||||
struct ggml_cgraph* m_cgraph = nullptr;
|
||||
ggml_tensor* m_node = nullptr;
|
||||
std::vector<ggml_tensor*> m_nodes;
|
||||
|
|
|
|||
|
|
@ -26,10 +26,6 @@
|
|||
#include "openvino/frontend.hpp"
|
||||
#include "openvino/input_model.hpp"
|
||||
|
||||
std::shared_ptr<GgmlOvDecoder> get_ggml_decoder(struct ggml_cgraph* cgraph, bool is_static, bool is_first_token) {
|
||||
return std::make_shared<GgmlOvDecoder>(nullptr, cgraph, is_static, is_first_token);
|
||||
}
|
||||
|
||||
ov::Tensor convert_ggml_input_to_ov(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string& name) {
|
||||
const auto* ggml_tensor = ggml_decoder->get_input_ggml_tensor(name);
|
||||
auto* input_data = ggml_tensor->data;
|
||||
|
|
@ -111,7 +107,8 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
|
|||
|
||||
auto it = infer_request_cache.find(cgraph);
|
||||
if (it != infer_request_cache.end()) {
|
||||
ggml_decoder = get_ggml_decoder(cgraph, is_static, false);
|
||||
std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
|
||||
ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, model_weights, is_static, false);
|
||||
decoder_end_time = ggml_time_us();
|
||||
|
||||
// For NPU, the first time we call the kvcache model, pop the compiled kvcache model from the cache
|
||||
|
|
@ -126,17 +123,20 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
|
|||
compile_end_time = conversion_end_time;
|
||||
} else {
|
||||
std::shared_ptr<ov::Model> model;
|
||||
auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);
|
||||
|
||||
if (is_static) {
|
||||
ggml_decoder = get_ggml_decoder(cgraph, is_static, true);
|
||||
auto ggml_decoder_kvcache = get_ggml_decoder(cgraph, is_static, false);
|
||||
ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, model_weights, is_static, true);
|
||||
auto ggml_decoder_kvcache = std::make_shared<GgmlOvDecoder>(cgraph, model_weights, is_static, false);
|
||||
decoder_end_time = ggml_time_us();
|
||||
|
||||
auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder);
|
||||
auto input_model_kvcache = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder_kvcache);
|
||||
|
||||
model = ov::frontend::ggml::FrontEnd::convert(input_model);
|
||||
ggml_decoder->clear_model_weights();
|
||||
auto model_kvcache = ov::frontend::ggml::FrontEnd::convert(input_model_kvcache);
|
||||
ggml_decoder_kvcache->clear_model_weights();
|
||||
conversion_end_time = ggml_time_us();
|
||||
|
||||
auto compiled_model = core.compile_model(model, device, config);
|
||||
|
|
@ -157,7 +157,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
|
|||
ov::serialize(model_kvcache, timestamped_filename);
|
||||
}
|
||||
} else {
|
||||
ggml_decoder = get_ggml_decoder(cgraph, is_static, true);
|
||||
ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, model_weights, is_static, true);
|
||||
decoder_end_time = ggml_time_us();
|
||||
|
||||
auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder);
|
||||
|
|
|
|||
Loading…
Reference in New Issue