Add initial NPU support
This commit is contained in:
parent
3051d5ae07
commit
7fec223334
|
|
@ -14,6 +14,7 @@
|
|||
#include <memory>
|
||||
#include <openvino/core/dimension.hpp>
|
||||
#include <openvino/core/node.hpp>
|
||||
#include <openvino/core/partial_shape.hpp>
|
||||
#include <openvino/core/type/float16.hpp>
|
||||
#include <openvino/op/constant.hpp>
|
||||
#include <openvino/op/parameter.hpp>
|
||||
|
|
@ -25,14 +26,16 @@
|
|||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-backend.h"
|
||||
|
||||
GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph)
|
||||
GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token)
|
||||
: m_cgraph(cgraph),
|
||||
m_node(node),
|
||||
m_op_name(m_node ? std::string(m_node->name) : "NONE_OP") {
|
||||
m_op_name(m_node ? std::string(m_node->name) : "NONE_OP"),
|
||||
m_is_static(is_static),
|
||||
m_is_first_token(is_first_token) {
|
||||
static std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
|
||||
|
||||
if (m_node) {
|
||||
set_input_output(m_node, model_weights);
|
||||
set_input_output(m_node);
|
||||
} else {
|
||||
static bool printed = false;
|
||||
if (!printed && getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) {
|
||||
|
|
@ -47,7 +50,7 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap
|
|||
set_max_token_len();
|
||||
|
||||
static bool weight_created = false;
|
||||
if (!getenv("GGML_OPENVINO_WEIGHT_AS_INPUT") && !weight_created) {
|
||||
if (!weight_created) {
|
||||
add_weight_const_parallel(model_weights);
|
||||
weight_created = true;
|
||||
}
|
||||
|
|
@ -55,7 +58,7 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap
|
|||
for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) {
|
||||
auto* cur_node = m_cgraph->nodes[node_n];
|
||||
m_nodes.push_back(cur_node);
|
||||
set_input_output(cur_node, model_weights);
|
||||
set_input_output(cur_node);
|
||||
}
|
||||
m_model_weights = model_weights;
|
||||
|
||||
|
|
@ -65,8 +68,7 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap
|
|||
|
||||
// Called in GgmlOvDecoder constructor. Two cases: 1. constructing a decoder for the whole graph;
|
||||
// 2. constructing a decoder for a node.
|
||||
void GgmlOvDecoder::set_input_output(ggml_tensor* node,
|
||||
std::map<std::string, std::shared_ptr<ov::Node>>& model_weights) {
|
||||
void GgmlOvDecoder::set_input_output(ggml_tensor* node) {
|
||||
std::string node_name;
|
||||
if (node->op == GGML_OP_CPY) {
|
||||
// CPY updates the input tensor in place. For later ov op that uses the
|
||||
|
|
@ -95,21 +97,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node,
|
|||
if (!m_node && !src->view_src) {
|
||||
ggml_backend_buffer* buffer = src->buffer;
|
||||
|
||||
if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
|
||||
bool weight_as_input = getenv("GGML_OPENVINO_WEIGHT_AS_INPUT");
|
||||
auto& weights_map = weight_as_input ? m_model_inputs : model_weights;
|
||||
if (weights_map.find(src_name) != weights_map.end()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
std::shared_ptr<ov::Node> weight_node =
|
||||
weight_as_input
|
||||
? std::make_shared<ov::op::v0::Parameter>(get_ov_type(src), ov::Shape{get_shape(src)})
|
||||
: create_weight_node(src);
|
||||
weight_node->set_friendly_name(src_name);
|
||||
weights_map[src_name] = weight_node;
|
||||
|
||||
} else if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || src->flags & GGML_TENSOR_FLAG_INPUT) {
|
||||
if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || src->flags & GGML_TENSOR_FLAG_INPUT) {
|
||||
// GGML_BACKEND_BUFFER_USAGE_ANY are kv caches
|
||||
if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY) {
|
||||
assert(src_name.find("cache_k") == 0 || src_name.find("cache_v") == 0);
|
||||
|
|
@ -119,10 +107,24 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node,
|
|||
}
|
||||
ov::PartialShape input_shape;
|
||||
if (std::string(src->name) == "inp_tokens" || std::string(src->name) == "inp_pos") {
|
||||
input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_max_token_len)};
|
||||
if (m_is_static) {
|
||||
input_shape = ov::PartialShape(get_shape(src));
|
||||
// if (m_is_first_token) {
|
||||
// input_shape = ov::PartialShape{1, 1, m_max_token_len};
|
||||
// } else {
|
||||
// input_shape = ov::PartialShape{1, 1, 1};
|
||||
// }
|
||||
} else {
|
||||
input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_max_token_len)};
|
||||
}
|
||||
} else if (std::string(src->name).find("KQ_mask") == 0) {
|
||||
auto max_token_len = GGML_PAD(m_max_token_len, GGML_KQ_MASK_PAD);
|
||||
input_shape = ov::PartialShape{1, ov::Dimension(1, max_token_len), ov::Dimension(1, max_token_len)};
|
||||
if (m_is_static) {
|
||||
input_shape = ov::PartialShape(get_shape(src));
|
||||
} else {
|
||||
auto max_token_len = GGML_PAD(m_max_token_len, GGML_KQ_MASK_PAD);
|
||||
input_shape =
|
||||
ov::PartialShape{1, ov::Dimension(1, max_token_len), ov::Dimension(1, max_token_len)};
|
||||
}
|
||||
} else {
|
||||
input_shape = ov::Shape{get_shape(src)};
|
||||
}
|
||||
|
|
@ -510,7 +512,7 @@ int32_t* GgmlOvDecoder::get_output_op_params(const std::string& name) const {
|
|||
|
||||
// Calls node_visitor once for every tensor stored in m_nodes, passing it a newly
// constructed per-node GgmlOvDecoder that shares this decoder's m_cgraph.
// NOTE(review): this span is a rendered diff hunk (@ -510,7 +512,7), so the two
// `auto decoder` lines below are presumably the removed and the added side of the
// same statement; only the four-argument form, which forwards m_is_static and
// m_is_first_token to the per-node decoder, should exist in the real file — confirm.
void GgmlOvDecoder::visit_subgraph(std::function<void(std::shared_ptr<GgmlDecoder>)> node_visitor) const {
|
||||
for (const auto& node : m_nodes) {
|
||||
auto decoder = std::make_shared<GgmlOvDecoder>(node, m_cgraph);
|
||||
auto decoder = std::make_shared<GgmlOvDecoder>(node, m_cgraph, m_is_static, m_is_first_token);
|
||||
node_visitor(decoder);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
|
|||
public:
|
||||
using ov::frontend::ggml::GgmlDecoder::GgmlDecoder;
|
||||
|
||||
GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph);
|
||||
GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token);
|
||||
|
||||
virtual ov::Any get_attribute(const std::string& name) const override {
|
||||
return nullptr;
|
||||
|
|
@ -89,8 +89,15 @@ public:
|
|||
return m_model_output_names;
|
||||
}
|
||||
|
||||
// Returns the m_is_static flag, which the constructor initializes from its
// is_static argument. Overrides the pure virtual GgmlDecoder::is_static().
virtual bool is_static() const override {
|
||||
return m_is_static;
|
||||
}
|
||||
// Returns the m_is_first_token flag, which the constructor initializes from its
// is_first_token argument.
// NOTE(review): not marked `override`; the part of the GgmlDecoder interface visible
// in this change only declares is_static(), so this looks like a new virtual
// introduced on GgmlOvDecoder itself — confirm against the base class.
virtual bool is_first_token() const {
|
||||
return m_is_first_token;
|
||||
}
|
||||
|
||||
private:
|
||||
void set_input_output(ggml_tensor* node, std::map<std::string, std::shared_ptr<ov::Node>>& model_weights);
|
||||
void set_input_output(ggml_tensor* node);
|
||||
void add_extra_inputs();
|
||||
static void dump_cgraph(const struct ggml_cgraph* cgraph);
|
||||
static std::vector<size_t> get_shape(const ggml_tensor* tensor);
|
||||
|
|
@ -119,6 +126,8 @@ private:
|
|||
std::map<std::string, std::shared_ptr<ov::Tensor>> m_model_extra_input_values;
|
||||
std::map<std::string, std::shared_ptr<ov::Node>> m_model_weights;
|
||||
std::vector<std::string> m_model_output_names;
|
||||
bool m_is_static;
|
||||
bool m_is_first_token;
|
||||
};
|
||||
|
||||
void print_tensor_address_map(const struct ggml_cgraph* cgraph);
|
||||
|
|
|
|||
|
|
@ -55,6 +55,8 @@ public:
|
|||
virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_extra_inputs() const = 0;
|
||||
virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_weights() const = 0;
|
||||
virtual const std::vector<std::string>& get_model_output_names() const = 0;
|
||||
|
||||
virtual bool is_static() const = 0;
|
||||
};
|
||||
|
||||
} // namespace ggml
|
||||
|
|
|
|||
|
|
@ -84,6 +84,9 @@ public:
|
|||
// Forwards to get_op_case() of the wrapped decoder (m_decoder) and returns its result.
int get_op_case() const {
|
||||
return m_decoder->get_op_case();
|
||||
}
|
||||
// Forwards to is_static() of the wrapped decoder (m_decoder). The op translators in
// this change (translate_cpy, translate_mulmat) branch on this value to replace
// runtime-computed lengths with compile-time constants.
bool is_static() const {
|
||||
return m_decoder->is_static();
|
||||
}
|
||||
|
||||
private:
|
||||
std::shared_ptr<GgmlDecoder> m_decoder;
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@
|
|||
#include <openvino/core/node_output.hpp>
|
||||
#include <openvino/core/node_vector.hpp>
|
||||
#include <openvino/op/add.hpp>
|
||||
#include <openvino/op/broadcast.hpp>
|
||||
#include <openvino/op/concat.hpp>
|
||||
#include <openvino/op/constant.hpp>
|
||||
#include <openvino/op/convert_like.hpp>
|
||||
|
|
@ -12,6 +13,7 @@
|
|||
#include <openvino/op/reshape.hpp>
|
||||
#include <openvino/op/scatter_nd_update.hpp>
|
||||
#include <openvino/op/slice.hpp>
|
||||
#include <openvino/op/squeeze.hpp>
|
||||
#include <openvino/op/transpose.hpp>
|
||||
#include <openvino/op/unsqueeze.hpp>
|
||||
#include <vector>
|
||||
|
|
@ -57,6 +59,13 @@ OutputVector translate_cpy(const NodeContext& context) {
|
|||
token_len = std::make_shared<ov::op::v1::Reshape>(token_len,
|
||||
ov::op::v0::Constant::create(ov::element::i64, {0}, {}),
|
||||
false);
|
||||
|
||||
if (context.is_static()) {
|
||||
int32_t* op_params = context.get_input_op_params(1);
|
||||
int64_t past_token_len_val = op_params[0] / context.get_input_stride(1)[2] / num_heads / head_size;
|
||||
past_token_len = ov::op::v0::Constant::create(ov::element::i64, {}, {past_token_len_val});
|
||||
}
|
||||
|
||||
auto total_token_len = std::make_shared<ov::op::v1::Add>(past_token_len, token_len);
|
||||
std::shared_ptr<ov::Node> indices =
|
||||
std::make_shared<ov::op::v4::Range>(past_token_len, total_token_len, one, ov::element::i64);
|
||||
|
|
@ -67,39 +76,88 @@ OutputVector translate_cpy(const NodeContext& context) {
|
|||
res = std::make_shared<ov::op::v3::ScatterNDUpdate>(reshaped_src1, indices, src0);
|
||||
} else {
|
||||
// Write V to cache_v
|
||||
int64_t total_head_size = src0_shape[1];
|
||||
auto total_head_size_node = ov::op::v0::Constant::create(ov::element::i64, {1}, {total_head_size});
|
||||
|
||||
auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
|
||||
auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
|
||||
auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
|
||||
|
||||
auto zero_scalar = ov::op::v0::Constant::create(ov::element::i64, {}, {0});
|
||||
auto one_scalar = ov::op::v0::Constant::create(ov::element::i64, {}, {1});
|
||||
|
||||
int64_t total_head_size = src0_shape[1];
|
||||
auto total_head_size_node = ov::op::v0::Constant::create(ov::element::i64, {1}, {total_head_size});
|
||||
auto total_head_size_scalar = std::make_shared<ov::op::v0::Squeeze>(total_head_size_node, zero);
|
||||
|
||||
auto token_len = get_dimensions(src0.get_node_shared_ptr(), {2});
|
||||
past_token_len = std::make_shared<ov::op::v0::Unsqueeze>(past_token_len, zero);
|
||||
auto total_token_len = std::make_shared<ov::op::v1::Add>(past_token_len, token_len);
|
||||
auto token_len_scalar = std::make_shared<ov::op::v0::Squeeze>(token_len, zero);
|
||||
if (context.is_static()) {
|
||||
int32_t* op_params = context.get_input_op_params(1);
|
||||
int64_t past_token_len_val = op_params[0] / context.get_input_stride(1)[2];
|
||||
past_token_len = ov::op::v0::Constant::create(ov::element::i64, {}, {past_token_len_val});
|
||||
}
|
||||
auto total_token_len_scalar = std::make_shared<ov::op::v1::Add>(past_token_len, token_len_scalar);
|
||||
|
||||
// auto reshaped_src1 = std::make_shared<ov::op::v1::Reshape>(
|
||||
// src1,
|
||||
// ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{1, total_head_size, -1}),
|
||||
// false);
|
||||
|
||||
// auto src1_left = std::make_shared<ov::op::v8::Slice>(
|
||||
// reshaped_src1,
|
||||
// ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 0, 0}),
|
||||
// std::make_shared<ov::op::v0::Concat>(ov::OutputVector{one, total_head_size_node, past_token_len}, 0),
|
||||
// ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 1, 1}));
|
||||
|
||||
// auto src1_right = std::make_shared<ov::op::v8::Slice>(
|
||||
// reshaped_src1,
|
||||
// std::make_shared<ov::op::v0::Concat>(ov::OutputVector{zero, zero, total_token_len}, 0),
|
||||
// ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{1, total_head_size, INT_MAX}),
|
||||
// ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 1, 1}));
|
||||
|
||||
// auto reshaped_src0 = std::make_shared<ov::op::v1::Reshape>(
|
||||
// src0,
|
||||
// ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{1, total_head_size, -1}),
|
||||
// false);
|
||||
|
||||
// auto res = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{src1_left, reshaped_src0, src1_right}, 2);
|
||||
|
||||
// 1D tensor of shape [total_head_size], values starting from 0
|
||||
auto range_row =
|
||||
std::make_shared<ov::op::v4::Range>(zero_scalar, total_head_size_scalar, one_scalar, ov::element::i64);
|
||||
auto range_row_reshaped =
|
||||
std::make_shared<ov::op::v0::Unsqueeze>(range_row,
|
||||
ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 2}));
|
||||
auto row_indices = std::make_shared<ov::op::v3::Broadcast>(
|
||||
range_row_reshaped,
|
||||
std::make_shared<ov::op::v0::Concat>(ov::OutputVector{total_head_size_node, token_len, one}, 0));
|
||||
|
||||
// 1D tensor of shape [token_len], values starting from past_token_len
|
||||
auto range_col =
|
||||
std::make_shared<ov::op::v4::Range>(past_token_len, total_token_len_scalar, one_scalar, element::i64);
|
||||
auto range_col_reshaped =
|
||||
std::make_shared<ov::op::v0::Unsqueeze>(range_col,
|
||||
ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 2}));
|
||||
auto col_indices = std::make_shared<ov::op::v3::Broadcast>(
|
||||
range_col_reshaped,
|
||||
std::make_shared<ov::op::v0::Concat>(ov::OutputVector{total_head_size_node, token_len, one}, 0));
|
||||
|
||||
// Stack row_indices and col_indices along last axis: [total_head_size, token_len, 2]
|
||||
auto indices = std::make_shared<ov::op::v0::Concat>(OutputVector{row_indices, col_indices}, 2);
|
||||
auto indices_final = std::make_shared<ov::op::v1::Reshape>(
|
||||
indices,
|
||||
ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector<int64_t>{-1, 2}),
|
||||
false);
|
||||
|
||||
auto flattend_src0 =
|
||||
std::make_shared<ov::op::v1::Reshape>(src0,
|
||||
ov::op::v0::Constant::create(element::i64, Shape{1}, {-1}),
|
||||
false);
|
||||
auto reshaped_src1 = std::make_shared<ov::op::v1::Reshape>(
|
||||
src1,
|
||||
ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{1, total_head_size, -1}),
|
||||
ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector<int64_t>{total_head_size, -1}),
|
||||
false);
|
||||
|
||||
auto src1_left = std::make_shared<ov::op::v8::Slice>(
|
||||
reshaped_src1,
|
||||
ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 0, 0}),
|
||||
std::make_shared<ov::op::v0::Concat>(ov::OutputVector{one, total_head_size_node, past_token_len}, 0),
|
||||
ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 1, 1}));
|
||||
|
||||
auto src1_right = std::make_shared<ov::op::v8::Slice>(
|
||||
reshaped_src1,
|
||||
std::make_shared<ov::op::v0::Concat>(ov::OutputVector{zero, zero, total_token_len}, 0),
|
||||
ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{1, total_head_size, INT_MAX}),
|
||||
ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 1, 1}));
|
||||
|
||||
auto reshaped_src0 = std::make_shared<ov::op::v1::Reshape>(
|
||||
src0,
|
||||
ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{1, total_head_size, -1}),
|
||||
false);
|
||||
|
||||
res = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{src1_left, reshaped_src0, src1_right}, 2);
|
||||
auto updated = std::make_shared<ov::op::v3::ScatterNDUpdate>(reshaped_src1, indices_final, flattend_src0);
|
||||
res = std::make_shared<ov::op::v0::Unsqueeze>(updated, zero);
|
||||
}
|
||||
|
||||
return rename_outputs_with_suffix({res}, context.get_name());
|
||||
|
|
|
|||
|
|
@ -55,17 +55,21 @@ OutputVector translate_mulmat(const NodeContext& context) {
|
|||
ov::Output<ov::Node> A;
|
||||
ov::Output<ov::Node> B;
|
||||
|
||||
auto attention_size = context.get_input("attention_size");
|
||||
|
||||
auto src0 = context.get_input(0);
|
||||
auto src0_shape = context.get_input_shape(0).to_shape();
|
||||
auto src0_stride = context.get_input_stride(0);
|
||||
auto permuted = is_permuted(src0_stride);
|
||||
auto token_dim = permuted ? 0 : 2;
|
||||
|
||||
auto attention_size = context.get_input("attention_size");
|
||||
|
||||
auto src0_perm = argsort_descend(src0_stride);
|
||||
auto src0_original_shape_ = permute(src0_shape, src0_perm);
|
||||
std::vector<int64_t> src0_original_shape(src0_original_shape_.begin(), src0_original_shape_.end());
|
||||
|
||||
if (context.is_static()) {
|
||||
attention_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {src0_original_shape[token_dim]});
|
||||
}
|
||||
src0_original_shape[token_dim] = -1;
|
||||
|
||||
auto src0_slice_shape = src0_original_shape;
|
||||
|
|
|
|||
|
|
@ -1,8 +1,9 @@
|
|||
#include <memory>
|
||||
#include <openvino/op/add.hpp>
|
||||
#include <openvino/op/constant.hpp>
|
||||
#include <openvino/op/divide.hpp>
|
||||
#include <openvino/op/multiply.hpp>
|
||||
#include <openvino/op/reduce_sum.hpp>
|
||||
#include <openvino/op/reduce_mean.hpp>
|
||||
#include <openvino/op/sqrt.hpp>
|
||||
|
||||
#include "../node_context.hpp"
|
||||
|
|
@ -16,28 +17,24 @@ namespace op {
|
|||
// Builds the OpenVINO subgraph for an RMS-norm node with exactly one input:
//   res = x * (1 / sqrt(mean(x * x over axis 2, keep_dims = true) + eps))
// and returns it through rename_outputs_with_suffix using the node's name.
//
// NOTE(review): this span is a rendered diff hunk (@ -16,28 +17,24) with the +/-
// markers lost, so both the old and the new form of several statements appear:
//   - `reduce_sum` + Divide-by-input_shape[2] vs. a single ReduceMean for `mean`,
//   - `scale` (Constant of Shape{}) vs. `reciprocal` (Constant of Shape{1}),
//   - the two `auto res` lines.
// Presumably only the ReduceMean / `reciprocal` variants remain in the real file
// (the NPU compile config elsewhere in this change names ReduceMean) — confirm.
OutputVector translate_rms_norm(const NodeContext& context) {
|
||||
num_inputs_check(context, 1, 1);
|
||||
|
||||
ov::Shape input_shape = context.get_input_shape(0).to_shape();
|
||||
auto input_node = context.get_input(0);
|
||||
// Element-wise x * x.
auto square = std::make_shared<ov::op::v1::Multiply>(input_node, input_node);
|
||||
|
||||
// Sum of squares over axis 2, keeping the reduced dimension.
auto reduce_sum =
|
||||
std::make_shared<ov::op::v1::ReduceSum>(square,
|
||||
ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {2}),
|
||||
true);
|
||||
|
||||
// Mean computed as sum / input_shape[2]; this needs a fully known input shape.
auto mean = std::make_shared<ov::op::v1::Divide>(
|
||||
reduce_sum,
|
||||
ov::op::v0::Constant::create(ov::element::f32, ov::Shape{}, {static_cast<float>(input_shape[2])}));
|
||||
// Mean of squares over axis 2 in a single ReduceMean op, keeping the reduced dimension.
auto mean =
|
||||
std::make_shared<ov::op::v1::ReduceMean>(square,
|
||||
ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {2}),
|
||||
true);
|
||||
|
||||
// eps is stored as raw float bits at the start of the output op params; copy the
// bytes out with memcpy rather than casting the pointer.
float eps;
|
||||
memcpy(&eps, context.get_output_op_params(0), sizeof(float));
|
||||
|
||||
// rms = sqrt(mean + eps)
auto rms = std::make_shared<ov::op::v0::Sqrt>(
|
||||
std::make_shared<ov::op::v1::Add>(mean, ov::op::v0::Constant::create(ov::element::f32, ov::Shape{}, {eps})));
|
||||
|
||||
// 1 / rms, with the numerator as a scalar (Shape{}) constant.
auto scale =
|
||||
std::make_shared<ov::op::v1::Divide>(ov::op::v0::Constant::create(ov::element::f32, ov::Shape{}, {1.0f}), rms);
|
||||
// 1 / rms, with the numerator as a one-element (Shape{1}) constant.
auto reciprocal =
|
||||
std::make_shared<ov::op::v1::Divide>(ov::op::v0::Constant::create(ov::element::f32, ov::Shape{1}, {1.0f}), rms);
|
||||
|
||||
auto res = std::make_shared<ov::op::v1::Multiply>(input_node, scale);
|
||||
auto res = std::make_shared<ov::op::v1::Multiply>(input_node, reciprocal);
|
||||
|
||||
return rename_outputs_with_suffix({res}, context.get_name());
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <openvino/core/node.hpp>
|
||||
|
|
@ -23,6 +22,10 @@
|
|||
#include "../node_context.hpp"
|
||||
#include "../utils.hpp"
|
||||
|
||||
#ifndef M_PI
|
||||
# define M_PI 3.14159265358979323846
|
||||
#endif
|
||||
|
||||
#define GGML_ROPE_TYPE_NEOX 2
|
||||
|
||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
|
|
|
|||
|
|
@ -4,11 +4,13 @@
|
|||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
#include <memory>
|
||||
#include <openvino/core/any.hpp>
|
||||
#include <openvino/core/graph_util.hpp>
|
||||
#include <openvino/core/type/float16.hpp>
|
||||
#include <openvino/frontend/manager.hpp>
|
||||
#include <openvino/openvino.hpp>
|
||||
#include <openvino/runtime/compiled_model.hpp>
|
||||
#include <openvino/runtime/intel_npu/properties.hpp>
|
||||
#include <openvino/runtime/tensor.hpp>
|
||||
#include <unordered_map>
|
||||
|
||||
|
|
@ -17,8 +19,8 @@
|
|||
#include "openvino/frontend.hpp"
|
||||
#include "openvino/input_model.hpp"
|
||||
|
||||
// Creates a decoder for a whole compute graph. The node argument is passed as
// nullptr, which makes the GgmlOvDecoder constructor take its graph-level branch
// (the `else` of `if (m_node)`) instead of decoding a single tensor.
// is_static and is_first_token are forwarded unchanged to the constructor.
// NOTE(review): this span is a rendered diff hunk, so the two-argument signature and
// return line below are presumably the removed side and the four-argument ones the
// added side; only one pair should exist in the real file — confirm.
std::shared_ptr<GgmlOvDecoder> get_ggml_decoder(struct ggml_cgraph* cgraph) {
|
||||
return std::make_shared<GgmlOvDecoder>(nullptr, cgraph);
|
||||
std::shared_ptr<GgmlOvDecoder> get_ggml_decoder(struct ggml_cgraph* cgraph, bool is_static, bool is_first_token) {
|
||||
return std::make_shared<GgmlOvDecoder>(nullptr, cgraph, is_static, is_first_token);
|
||||
}
|
||||
|
||||
ov::Tensor get_ggml_graph_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, std::string& name) {
|
||||
|
|
@ -49,50 +51,63 @@ static ov::frontend::FrontEnd::Ptr get_ggml_frontend() {
|
|||
}
|
||||
|
||||
enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph) {
|
||||
static ov::Core core;
|
||||
static bool is_first_token = true;
|
||||
|
||||
static std::string device = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : "";
|
||||
if (device.empty()) {
|
||||
// Prefer GPU over CPU
|
||||
for (const auto& dev : core.get_available_devices()) {
|
||||
device = dev;
|
||||
if (device == "GPU")
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
bool is_static = device == "NPU" ? true : false;
|
||||
ov::AnyMap config;
|
||||
if (is_static) {
|
||||
config = {
|
||||
{"NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=ReduceMean"},
|
||||
{"NPU_USE_NPUW", "YES"},
|
||||
{"NPUW_DEVICES", "NPU"},
|
||||
{"NPUW_FOLD", "YES"},
|
||||
// {"NPU_COMPILER_TYPE", "MLIR"},
|
||||
};
|
||||
}
|
||||
|
||||
auto start_time = ggml_time_us();
|
||||
|
||||
static ov::Core core;
|
||||
auto* cache_dir = getenv("GGML_OPENVINO_CACHE_DIR");
|
||||
if (cache_dir) {
|
||||
if (cache_dir && !is_static) {
|
||||
core.set_property(ov::cache_dir(cache_dir));
|
||||
}
|
||||
|
||||
// auto devices = core.get_available_devices();
|
||||
// static auto front_end = get_ggml_frontend();
|
||||
// if (!front_end) {
|
||||
// GGML_LOG_ERROR("GGML FrontEnd is not initialized \n");
|
||||
// return GGML_STATUS_FAILED;
|
||||
// }
|
||||
|
||||
using CachedItem = std::pair<std::shared_ptr<ov::Model>, ov::CompiledModel>;
|
||||
// For CPU and GPU, there is only one compiled model, so only use the first element of the pair
|
||||
// For NPU, there are prefill model and kvcache model (This is the ideal approach, but not implemented yet,
|
||||
// currently recompile for every token)
|
||||
using CachedItem = std::pair<std::shared_ptr<ov::Model>, std::pair<ov::CompiledModel, ov::CompiledModel>>;
|
||||
static std::unordered_map<struct ggml_cgraph*, CachedItem> compiled_cache;
|
||||
|
||||
std::shared_ptr<ov::Model> model;
|
||||
ov::CompiledModel compiled_model;
|
||||
ov::CompiledModel compiled_model_prefill;
|
||||
ov::CompiledModel compiled_model_kvcache;
|
||||
int64_t decoder_end_time;
|
||||
int64_t conversion_end_time;
|
||||
int64_t compile_end_time;
|
||||
|
||||
auto ggml_decoder = get_ggml_decoder(cgraph);
|
||||
auto ggml_decoder = get_ggml_decoder(cgraph, is_static, is_first_token);
|
||||
decoder_end_time = ggml_time_us();
|
||||
|
||||
auto it = compiled_cache.find(cgraph);
|
||||
if (it != compiled_cache.end()) {
|
||||
if (it != compiled_cache.end() && !is_static) {
|
||||
model = it->second.first;
|
||||
conversion_end_time = ggml_time_us();
|
||||
|
||||
compiled_model = it->second.second;
|
||||
compiled_model_prefill = it->second.second.first;
|
||||
compiled_model_kvcache = it->second.second.second;
|
||||
compile_end_time = ggml_time_us();
|
||||
} else {
|
||||
// std::shared_ptr<ov::frontend::DecoderBase> graph_decoder = ggml_decoder;
|
||||
// ov::frontend::InputModel::Ptr input_model = front_end->load(graph_decoder);
|
||||
// if (!input_model) {
|
||||
// GGML_LOG_ERROR("Input Model is not loaded \n");
|
||||
// return GGML_STATUS_FAILED;
|
||||
// }
|
||||
|
||||
// model = front_end->convert(input_model);
|
||||
|
||||
ov::frontend::InputModel::Ptr input_model = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder);
|
||||
model = ov::frontend::ggml::FrontEnd::convert(input_model);
|
||||
|
||||
|
|
@ -105,16 +120,23 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
|
|||
ov::serialize(model, timestamped_filename);
|
||||
}
|
||||
|
||||
if (!model) {
|
||||
GGML_LOG_ERROR("Model is not converted \n");
|
||||
}
|
||||
compiled_model = core.compile_model(model, "CPU");
|
||||
compiled_model_prefill = core.compile_model(model, device, config);
|
||||
compile_end_time = ggml_time_us();
|
||||
|
||||
compiled_cache[cgraph] = std::make_pair(model, compiled_model);
|
||||
compiled_cache[cgraph] = std::make_pair(model, std::make_pair(compiled_model_prefill, compiled_model_kvcache));
|
||||
}
|
||||
|
||||
ov::InferRequest infer_request = compiled_model.create_infer_request();
|
||||
ov::InferRequest infer_request;
|
||||
if (!is_static) {
|
||||
infer_request = compiled_model_prefill.create_infer_request();
|
||||
} else {
|
||||
infer_request = compiled_model_prefill.create_infer_request();
|
||||
// if (is_first_token) {
|
||||
// infer_request = compiled_model_prefill.create_infer_request();
|
||||
// } else {
|
||||
// infer_request = compiled_model_kvcache.create_infer_request();
|
||||
// }
|
||||
}
|
||||
|
||||
auto ov_params = model->get_parameters();
|
||||
for (size_t i = 0; i < ov_params.size(); i++) {
|
||||
|
|
@ -148,6 +170,8 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
|
|||
}
|
||||
auto end_time = ggml_time_us();
|
||||
|
||||
is_first_token = false;
|
||||
|
||||
if (getenv("GGML_OPENVINO_PROFILING")) {
|
||||
GGML_LOG_INFO("GGML OpenVINO Backend: \n");
|
||||
GGML_LOG_INFO(" - Graph decoder Time: %ld ms \n", (decoder_end_time - start_time) / 1000);
|
||||
|
|
|
|||
Loading…
Reference in New Issue