draft NPU support version 2: prefill + kvcache
parent 7fec223334
commit 34531abce4
@@ -108,22 +108,25 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) {
     ov::PartialShape input_shape;
     if (std::string(src->name) == "inp_tokens" || std::string(src->name) == "inp_pos") {
         if (m_is_static) {
-            input_shape = ov::PartialShape(get_shape(src));
-            // if (m_is_first_token) {
-            //     input_shape = ov::PartialShape{1, 1, m_max_token_len};
-            // } else {
-            //     input_shape = ov::PartialShape{1, 1, 1};
-            // }
+            if (m_is_first_token) {
+                input_shape = ov::PartialShape{1, 1, m_max_token_len};
+            } else {
+                input_shape = ov::PartialShape{1, 1, 1};
+            }
         } else {
             input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_max_token_len)};
         }
-    } else if (std::string(src->name).find("KQ_mask") == 0) {
+    } else if (std::string(src->name) == "KQ_mask") {
         if (m_is_static) {
-            input_shape = ov::PartialShape(get_shape(src));
+            if (m_is_first_token) {
+                input_shape = ov::PartialShape{1, m_max_token_len, m_max_token_len};
+            } else {
+                input_shape = ov::PartialShape{1, 1, m_max_token_len};
+            }
         } else {
-            auto max_token_len = GGML_PAD(m_max_token_len, GGML_KQ_MASK_PAD);
+            auto max_mask_size = GGML_PAD(m_max_token_len, GGML_KQ_MASK_PAD);
             input_shape =
-                ov::PartialShape{1, ov::Dimension(1, max_token_len), ov::Dimension(1, max_token_len)};
+                ov::PartialShape{1, ov::Dimension(1, max_mask_size), ov::Dimension(1, max_mask_size)};
         }
     } else {
         input_shape = ov::Shape{get_shape(src)};
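Note: on NPU the model must compile with fully static shapes, so the decoder now distinguishes the prefill pass (whole padded prompt, {1, 1, m_max_token_len}) from the decode pass (one token, {1, 1, 1}), while CPU/GPU keep one dynamic dimension bounded by m_max_token_len. A minimal sketch of the same selection, assuming only ov::PartialShape (the function name is hypothetical):

    #include <openvino/core/partial_shape.hpp>

    // Sketch: shape selection for the token inputs, mirroring the hunk above.
    ov::PartialShape token_input_shape(bool is_static, bool is_first_token, int64_t max_token_len) {
        if (!is_static) {
            // CPU/GPU: one bounded dynamic dimension; the plugin can still plan memory.
            return ov::PartialShape{1, 1, ov::Dimension(1, max_token_len)};
        }
        // NPU: two fixed graphs - prefill sees the padded prompt, decode sees one token.
        return is_first_token ? ov::PartialShape{1, 1, max_token_len}
                              : ov::PartialShape{1, 1, 1};
    }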
@@ -208,6 +211,7 @@ void GgmlOvDecoder::set_max_token_len() {
 
 void GgmlOvDecoder::add_extra_inputs() {
     int64_t past_token_len;
+    // attention_size not used for NPU
    int64_t attention_size;
 
     for (const auto& node : m_nodes) {
@@ -231,8 +235,7 @@ void GgmlOvDecoder::add_extra_inputs() {
     for (const auto& node : m_nodes) {
         if (node->src[1] && std::string(node->src[1]->name).find("inp_tokens") == 0) {
             int64_t total_token_len = node->src[1]->ne[0] + past_token_len;
-            attention_size = (total_token_len + 31) / 32 * 32;
-
+            attention_size = GGML_PAD(total_token_len, 32);
             std::string name = "attention_size";
             auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
             param_node->set_friendly_name(name);
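Note: GGML_PAD(x, n) rounds x up to the next multiple of n, which is exactly what the hand-written (total_token_len + 31) / 32 * 32 computed; the macro form just states the intent. A self-contained check, assuming the definition from ggml.h:

    #include <cstdio>

    // GGML_PAD as defined in ggml.h: round x up to a multiple of n.
    #define GGML_PAD(x, n) (((x) + (n) - 1) / (n) * (n))

    int main() {
        printf("%d %d\n", GGML_PAD(37, 32), (37 + 31) / 32 * 32); // 64 64
        printf("%d\n", GGML_PAD(64, 32));                         // 64 (already aligned)
        return 0;
    }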
@@ -92,9 +92,12 @@ public:
     virtual bool is_static() const override {
         return m_is_static;
     }
-    virtual bool is_first_token() const {
+    virtual bool is_first_token() const override {
         return m_is_first_token;
     }
+    virtual int get_max_token_len() const override {
+        return m_max_token_len;
+    }
 
 private:
     void set_input_output(ggml_tensor* node);
@@ -106,7 +109,7 @@ private:
     static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor);
 
     void set_max_token_len();
-    int64_t m_max_token_len;
+    int m_max_token_len;
 
     void add_weight_const_parallel(std::map<std::string, std::shared_ptr<ov::Node>>& model_weights);
@@ -1,5 +1,6 @@
 #pragma once
 
+#include <cstdint>
 #include <map>
 #include <openvino/core/node.hpp>
 #include <openvino/frontend/decoder.hpp>
@@ -57,6 +58,8 @@ public:
     virtual const std::vector<std::string>& get_model_output_names() const = 0;
 
     virtual bool is_static() const = 0;
+    virtual bool is_first_token() const = 0;
+    virtual int get_max_token_len() const = 0;
 };
 
 }  // namespace ggml
@@ -1,5 +1,6 @@
 #pragma once
 
+#include <cstdint>
 #include <openvino/frontend/node_context.hpp>
 
 #include "decoder.hpp"
@@ -87,6 +88,12 @@ public:
     bool is_static() const {
         return m_decoder->is_static();
     }
+    bool is_first_token() const {
+        return m_decoder->is_first_token();
+    }
+    int get_max_token_len() const {
+        return m_decoder->get_max_token_len();
+    }
 
 private:
     std::shared_ptr<GgmlDecoder> m_decoder;
@@ -8,7 +8,7 @@
 #include <openvino/op/broadcast.hpp>
 #include <openvino/op/concat.hpp>
 #include <openvino/op/constant.hpp>
-#include <openvino/op/convert_like.hpp>
+#include <openvino/op/convert.hpp>
 #include <openvino/op/range.hpp>
 #include <openvino/op/reshape.hpp>
 #include <openvino/op/scatter_nd_update.hpp>
@@ -34,18 +34,26 @@ OutputVector translate_cpy(const NodeContext& context) {
 
     auto src0 = context.get_input(0);
     auto src1 = context.get_input(1);
-    auto past_token_len = context.get_input("past_token_len");
+    auto past_token_len_scalar = context.get_input("past_token_len");
 
+    src0 = std::make_shared<ov::op::v0::Convert>(src0, context.get_input_type(1));
+    ov::Output<Node> res;
+
+    if (context.is_static() && context.is_first_token()) {
+        res = src0;
+        return rename_outputs_with_suffix({res}, context.get_name());
+    }
+
     auto src0_shape = context.get_input_shape(0).to_shape();
     auto output_shape = context.get_output_shape(0).to_shape();
 
     std::vector<size_t> input0_strides = context.get_input_stride(0);
     std::vector<size_t> output_strides = context.get_output_stride(0);
 
-    auto one = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {1});
+    auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
+    auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
+    auto one_scalar = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {1});
 
-    src0 = std::make_shared<ov::op::v1::ConvertLike>(src0, src1);
     if (op_case == 1) {
         // Write K to cache_k
         int64_t head_size = src0_shape[2];
@@ -56,32 +64,29 @@ OutputVector translate_cpy(const NodeContext& context) {
         auto reshaped_src1 = std::make_shared<ov::op::v1::Reshape>(src1, reshaped_src1_shape, false);
 
         auto token_len = get_dimensions(src0.get_node_shared_ptr(), {0});
-        token_len = std::make_shared<ov::op::v1::Reshape>(token_len,
-                                                          ov::op::v0::Constant::create(ov::element::i64, {0}, {}),
-                                                          false);
+        auto token_len_scalar = std::make_shared<ov::op::v0::Squeeze>(token_len, zero);
 
+        std::shared_ptr<ov::Node> indices;
         if (context.is_static()) {
-            int32_t* op_params = context.get_input_op_params(1);
-            int64_t past_token_len_val = op_params[0] / context.get_input_stride(1)[2] / num_heads / head_size;
-            past_token_len = ov::op::v0::Constant::create(ov::element::i64, {}, {past_token_len_val});
+            indices = past_token_len_scalar.get_node_shared_ptr();
+            indices = std::make_shared<ov::op::v0::Unsqueeze>(
+                indices,
+                ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector<int64_t>{0, 1}));
+        } else {
+            auto total_token_len_scalar = std::make_shared<ov::op::v1::Add>(past_token_len_scalar, token_len_scalar);
+            indices = std::make_shared<ov::op::v4::Range>(past_token_len_scalar,
+                                                          total_token_len_scalar,
+                                                          one_scalar,
+                                                          ov::element::i64);
+            indices = std::make_shared<ov::op::v0::Unsqueeze>(indices, one);
         }
 
-        auto total_token_len = std::make_shared<ov::op::v1::Add>(past_token_len, token_len);
-        std::shared_ptr<ov::Node> indices =
-            std::make_shared<ov::op::v4::Range>(past_token_len, total_token_len, one, ov::element::i64);
-        indices = std::make_shared<ov::op::v0::Unsqueeze>(
-            indices,
-            ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{1}));
-
         res = std::make_shared<ov::op::v3::ScatterNDUpdate>(reshaped_src1, indices, src0);
     } else {
         // Write V to cache_v
-        auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
-        auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
         auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
 
         auto zero_scalar = ov::op::v0::Constant::create(ov::element::i64, {}, {0});
-        auto one_scalar = ov::op::v0::Constant::create(ov::element::i64, {}, {1});
 
         int64_t total_head_size = src0_shape[1];
         auto total_head_size_node = ov::op::v0::Constant::create(ov::element::i64, {1}, {total_head_size});
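Note: for the K cache (op_case 1) the fresh rows are written with ScatterNDUpdate. In the dynamic path, indices = Range(past_token_len, past_token_len + token_len) unsqueezed to shape [token_len, 1]; in the static (NPU) decode path token_len is always 1, so past_token_len itself, unsqueezed to [1, 1], is the single row index. A flat-buffer simulation of that update, with hypothetical sizes:

    #include <cstdio>
    #include <vector>

    int main() {
        // Hypothetical geometry: the cache holds max_tokens rows of row_size floats.
        const size_t max_tokens = 8, row_size = 4;
        std::vector<float> cache_k(max_tokens * row_size, 0.0f);

        // Two freshly computed K rows to append after 3 cached tokens.
        const size_t past_token_len = 3, token_len = 2;
        std::vector<float> new_k = {1, 2, 3, 4, 5, 6, 7, 8};

        // indices = Range(past_token_len, past_token_len + token_len):
        // ScatterNDUpdate overwrites whole rows at those row indices.
        for (size_t i = 0; i < token_len; ++i)
            for (size_t j = 0; j < row_size; ++j)
                cache_k[(past_token_len + i) * row_size + j] = new_k[i * row_size + j];

        printf("row 3: %.0f, row 4: %.0f\n", cache_k[3 * row_size], cache_k[4 * row_size]); // 1, 5
        return 0;
    }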
@@ -89,36 +94,6 @@ OutputVector translate_cpy(const NodeContext& context) {
 
         auto token_len = get_dimensions(src0.get_node_shared_ptr(), {2});
         auto token_len_scalar = std::make_shared<ov::op::v0::Squeeze>(token_len, zero);
-        if (context.is_static()) {
-            int32_t* op_params = context.get_input_op_params(1);
-            int64_t past_token_len_val = op_params[0] / context.get_input_stride(1)[2];
-            past_token_len = ov::op::v0::Constant::create(ov::element::i64, {}, {past_token_len_val});
-        }
-        auto total_token_len_scalar = std::make_shared<ov::op::v1::Add>(past_token_len, token_len_scalar);
-
-        // auto reshaped_src1 = std::make_shared<ov::op::v1::Reshape>(
-        //     src1,
-        //     ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{1, total_head_size, -1}),
-        //     false);
-
-        // auto src1_left = std::make_shared<ov::op::v8::Slice>(
-        //     reshaped_src1,
-        //     ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 0, 0}),
-        //     std::make_shared<ov::op::v0::Concat>(ov::OutputVector{one, total_head_size_node, past_token_len}, 0),
-        //     ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 1, 1}));
-
-        // auto src1_right = std::make_shared<ov::op::v8::Slice>(
-        //     reshaped_src1,
-        //     std::make_shared<ov::op::v0::Concat>(ov::OutputVector{zero, zero, total_token_len}, 0),
-        //     ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{1, total_head_size, INT_MAX}),
-        //     ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 1, 1}));
-
-        // auto reshaped_src0 = std::make_shared<ov::op::v1::Reshape>(
-        //     src0,
-        //     ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{1, total_head_size, -1}),
-        //     false);
-
-        // auto res = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{src1_left, reshaped_src0, src1_right}, 2);
-
         // 1D tensor of shape [total_head_size], values starting from 0
         auto range_row =
@@ -131,8 +106,19 @@ OutputVector translate_cpy(const NodeContext& context) {
             std::make_shared<ov::op::v0::Concat>(ov::OutputVector{total_head_size_node, token_len, one}, 0));
 
         // 1D tensor of shape [token_len], values starting from past_token_len
-        auto range_col =
-            std::make_shared<ov::op::v4::Range>(past_token_len, total_token_len_scalar, one_scalar, element::i64);
+        std::shared_ptr<ov::Node> range_col;
+        if (context.is_static()) {
+            range_col = past_token_len_scalar.get_node_shared_ptr();
+            range_col = std::make_shared<ov::op::v0::Unsqueeze>(
+                range_col,
+                ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{0}));
+        } else {
+            auto total_token_len_scalar = std::make_shared<ov::op::v1::Add>(past_token_len_scalar, token_len_scalar);
+            range_col = std::make_shared<ov::op::v4::Range>(past_token_len_scalar,
+                                                            total_token_len_scalar,
+                                                            one_scalar,
+                                                            ov::element::i64);
+        }
         auto range_col_reshaped =
             std::make_shared<ov::op::v0::Unsqueeze>(range_col,
                                                     ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 2}));
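Note: the V cache appears to be laid out transposed ([total_head_size, tokens]), so a token's values land in one column rather than one row; range_row enumerates every head-dimension row and range_col the destination token positions, and the two are combined into (row, col) index pairs for ScatterNDUpdate. In the static decode path range_col again collapses to the single position past_token_len. The effect, simulated on a small matrix with hypothetical sizes:

    #include <cstdio>
    #include <vector>

    int main() {
        // Hypothetical: total_head_size = 4 rows, capacity of 6 token columns.
        const size_t head = 4, max_tokens = 6;
        std::vector<float> cache_v(head * max_tokens, 0.0f);

        // Write one new token's values (token_len = 1) at column past_token_len.
        const size_t past_token_len = 2;
        std::vector<float> new_v = {10, 20, 30, 40}; // one value per head row

        // (row, col) pairs: row from range_row = [0, head), col = past_token_len.
        for (size_t r = 0; r < head; ++r)
            cache_v[r * max_tokens + past_token_len] = new_v[r];

        printf("cache_v[1][2] = %.0f\n", cache_v[1 * max_tokens + 2]); // 20
        return 0;
    }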
@@ -1,5 +1,7 @@
 #include "utils.h"
 
+#include <algorithm>
+#include <cmath>
 #include <cstddef>
 #include <cstdint>
 #include <cstdlib>
@@ -13,6 +15,7 @@
 #include <openvino/runtime/intel_npu/properties.hpp>
 #include <openvino/runtime/tensor.hpp>
+#include <unordered_map>
 #include <vector>
 
 #include "ggml-impl.h"
 #include "ggml.h"
@@ -52,7 +55,6 @@ static ov::frontend::FrontEnd::Ptr get_ggml_frontend() {
 
 enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph) {
     static ov::Core core;
-    static bool is_first_token = true;
 
     static std::string device = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : "";
     if (device.empty()) {
@@ -66,12 +68,16 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
 
     bool is_static = device == "NPU" ? true : false;
     ov::AnyMap config;
-    if (is_static) {
+    if (device == "NPU") {
         config = {
             {"NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=ReduceMean"},
             {"NPU_USE_NPUW", "YES"},
             {"NPUW_DEVICES", "NPU"},
             {"NPUW_FOLD", "YES"},
             {"NPUW_DQ", "YES"},
+            {"NPUW_FUNCALL_ASYNC", "YES"},
+            {"NPUW_HOST_GATHER", "YES"},
+            {"NPUW_WEIGHTS_BANK", "shared"},
+            // {"NPU_COMPILER_TYPE", "MLIR"},
         };
     }
@@ -83,69 +89,128 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
         core.set_property(ov::cache_dir(cache_dir));
     }
 
-    // For CPU and GPU, there is only one compiled model, so only use the first element of the pair
-    // For NPU, there are prefill model and kvcache model (This is the ideal approach, but not implemented yet,
-    // currently recompile for every token)
-    using CachedItem = std::pair<std::shared_ptr<ov::Model>, std::pair<ov::CompiledModel, ov::CompiledModel>>;
-    static std::unordered_map<struct ggml_cgraph*, CachedItem> compiled_cache;
+    // CPU and GPU will only use cache_prefill
+    using CachedItem = std::pair<std::shared_ptr<ov::Model>, ov::CompiledModel>;
+    static std::unordered_map<struct ggml_cgraph*, CachedItem> compiled_cache_prefill;
+    static std::unordered_map<struct ggml_cgraph*, CachedItem> compiled_cache_kvcache;
 
+    std::shared_ptr<GgmlOvDecoder> ggml_decoder;
     std::shared_ptr<ov::Model> model;
-    ov::CompiledModel compiled_model_prefill;
-    ov::CompiledModel compiled_model_kvcache;
+    ov::CompiledModel compiled_model;
 
     int64_t decoder_end_time;
     int64_t conversion_end_time;
     int64_t compile_end_time;
 
-    auto ggml_decoder = get_ggml_decoder(cgraph, is_static, is_first_token);
-    decoder_end_time = ggml_time_us();
-
-    auto it = compiled_cache.find(cgraph);
-    if (it != compiled_cache.end() && !is_static) {
-        model = it->second.first;
-        conversion_end_time = ggml_time_us();
-
-        compiled_model_prefill = it->second.second.first;
-        compiled_model_kvcache = it->second.second.second;
-        compile_end_time = ggml_time_us();
-    } else {
-        ov::frontend::InputModel::Ptr input_model = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder);
-        model = ov::frontend::ggml::FrontEnd::convert(input_model);
-
-        conversion_end_time = ggml_time_us();
-
-        if (getenv("GGML_OPENVINO_DUMP_IR")) {
-            char timestamped_filename[64];
-            auto timestamp = (long long)ggml_time_us();
-            snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp);
-            ov::serialize(model, timestamped_filename);
-        }
-
-        compiled_model_prefill = core.compile_model(model, device, config);
-        compile_end_time = ggml_time_us();
-
-        compiled_cache[cgraph] = std::make_pair(model, std::make_pair(compiled_model_prefill, compiled_model_kvcache));
-    }
-
-    ov::InferRequest infer_request;
-    if (!is_static) {
-        infer_request = compiled_model_prefill.create_infer_request();
-    } else {
-        infer_request = compiled_model_prefill.create_infer_request();
-        // if (is_first_token) {
-        //     infer_request = compiled_model_prefill.create_infer_request();
-        // } else {
-        //     infer_request = compiled_model_kvcache.create_infer_request();
-        // }
-    }
+    auto it = compiled_cache_prefill.find(cgraph);
+    bool is_first_token = it == compiled_cache_prefill.end();
+    if (!is_first_token) {
+        ggml_decoder = get_ggml_decoder(cgraph, is_static, false);
+        decoder_end_time = ggml_time_us();
+
+        if (is_static) {
+            model = compiled_cache_kvcache[cgraph].first;
+            compiled_model = compiled_cache_kvcache[cgraph].second;
+        } else {
+            model = it->second.first;
+            compiled_model = it->second.second;
+        }
+        conversion_end_time = ggml_time_us();
+        compile_end_time = conversion_end_time;
+    } else {
+        if (is_static) {
+            ggml_decoder = get_ggml_decoder(cgraph, is_static, true);
+            auto ggml_decoder_kvcache = get_ggml_decoder(cgraph, is_static, false);
+            decoder_end_time = ggml_time_us();
+
+            auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder);
+            auto input_model_kvcache = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder_kvcache);
+
+            model = ov::frontend::ggml::FrontEnd::convert(input_model);
+            auto model_kvcache = ov::frontend::ggml::FrontEnd::convert(input_model_kvcache);
+            conversion_end_time = ggml_time_us();
+
+            compiled_model = core.compile_model(model, device, config);
+            auto compiled_model_kvcache = core.compile_model(model_kvcache, device, config);
+            compile_end_time = ggml_time_us();
+
+            compiled_cache_prefill[cgraph] = std::make_pair(model, compiled_model);
+            compiled_cache_kvcache[cgraph] = std::make_pair(model_kvcache, compiled_model_kvcache);
+
+            if (getenv("GGML_OPENVINO_DUMP_IR")) {
+                char timestamped_filename[64];
+                auto timestamp = (long long)ggml_time_us();
+                snprintf(timestamped_filename, sizeof(timestamped_filename), "model_prefill_%lld.xml", timestamp);
+                ov::serialize(model, timestamped_filename);
+                snprintf(timestamped_filename, sizeof(timestamped_filename), "model_kvcache_%lld.xml", timestamp);
+                ov::serialize(model_kvcache, timestamped_filename);
+            }
+        } else {
+            ggml_decoder = get_ggml_decoder(cgraph, is_static, true);
+            decoder_end_time = ggml_time_us();
+
+            auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder);
+            model = ov::frontend::ggml::FrontEnd::convert(input_model);
+            conversion_end_time = ggml_time_us();
+
+            compiled_model = core.compile_model(model, device, config);
+            compile_end_time = ggml_time_us();
+            compiled_cache_prefill[cgraph] = std::make_pair(model, compiled_model);
+
+            if (getenv("GGML_OPENVINO_DUMP_IR")) {
+                char timestamped_filename[64];
+                auto timestamp = (long long)ggml_time_us();
+                snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp);
+                ov::serialize(model, timestamped_filename);
+            }
+        }
+    }
+    auto infer_request = compiled_model.create_infer_request();
 
     auto ov_params = model->get_parameters();
     for (size_t i = 0; i < ov_params.size(); i++) {
         auto param_name = ov_params[i]->get_friendly_name();
         ov::Tensor input_tensor;
 
         if (ggml_decoder->get_model_extra_inputs().find(param_name) != ggml_decoder->get_model_extra_inputs().end()) {
             input_tensor = *ggml_decoder->get_model_extra_input_values().at(param_name);
-        } else {
+        } else if (!is_static) {
             input_tensor = get_ggml_graph_input_tensor(ggml_decoder, param_name);
+        } else {
+            if (param_name == "inp_tokens" || param_name == "inp_pos") {
+                if (is_first_token) {
+                    size_t max_token_len = ggml_decoder->get_max_token_len();
+                    const auto* input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name);
+                    std::vector<int32_t> padded_data = pad_input<int32_t>(input_tensor_ggml, 1, max_token_len, 0);
+                    input_tensor = ov::Tensor(ov::element::i32, ov::Shape{1, 1, max_token_len});
+                    auto* data_ptr = input_tensor.data<int32_t>();
+                    std::copy(padded_data.begin(), padded_data.end(), data_ptr);
+                } else {
+                    input_tensor = get_ggml_graph_input_tensor(ggml_decoder, param_name);
+                }
+
+            } else if (param_name == "KQ_mask") {
+                size_t max_token_len = ggml_decoder->get_max_token_len();
+                const auto* input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name);
+                if (is_first_token) {
+                    std::vector<float> padded_data =
+                        pad_input<float>(input_tensor_ggml, max_token_len, max_token_len, -INFINITY);
+                    set_zero_diagonal(padded_data, max_token_len);
+                    input_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, max_token_len, max_token_len});
+                    auto* data_ptr = input_tensor.data<float>();
+                    std::copy(padded_data.begin(), padded_data.end(), data_ptr);
+                } else {
+                    std::vector<float> padded_data = pad_input<float>(input_tensor_ggml, 1, max_token_len, -INFINITY);
+                    input_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, 1, max_token_len});
+                    auto* data_ptr = input_tensor.data<float>();
+                    std::copy(padded_data.begin(), padded_data.end(), data_ptr);
+                }
+
+            } else {
+                input_tensor = get_ggml_graph_input_tensor(ggml_decoder, param_name);
+            }
+        }
         infer_request.set_input_tensor(i, input_tensor);
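Note: is_first_token is no longer a static flag; it is derived from a cache miss on compiled_cache_prefill. The first time a graph is seen, NPU converts and compiles both a prefill model (padded-prompt shapes) and a kvcache model (single-token shapes); every later token is a pure lookup. A stripped-down sketch of that control flow, with the OpenVINO types replaced by stand-ins:

    #include <cstdio>
    #include <unordered_map>

    // Sketch only: stand-in for ov::CompiledModel.
    struct Compiled { int id; };
    static std::unordered_map<const void*, Compiled> cache_prefill;
    static std::unordered_map<const void*, Compiled> cache_kvcache;

    Compiled get_model(const void* cgraph, bool is_static) {
        auto it = cache_prefill.find(cgraph);
        bool is_first_token = (it == cache_prefill.end()); // miss => prefill pass
        if (is_first_token) {
            cache_prefill[cgraph] = Compiled{1};                 // padded-prompt graph
            if (is_static) cache_kvcache[cgraph] = Compiled{2};  // single-token graph
            return cache_prefill[cgraph];
        }
        return is_static ? cache_kvcache[cgraph] : it->second;
    }

    int main() {
        int graph = 0;
        printf("%d\n", get_model(&graph, true).id); // 1: first call compiles prefill
        printf("%d\n", get_model(&graph, true).id); // 2: later tokens use the kvcache model
        return 0;
    }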
@@ -234,3 +299,9 @@ void print_output_tensor_info(const std::string& name,
         break;
     }
 }
+
+void set_zero_diagonal(std::vector<float>& matrix, size_t dim) {
+    for (size_t i = 0; i < dim; ++i) {
+        matrix[i * dim + i] = 0.0f;
+    }
+}
@@ -1,12 +1,37 @@
+#include <algorithm>
+
 #include "ggml-backend-impl.h"
 #include "ggml-decoder.h"
 
 enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph);
 
+std::shared_ptr<GgmlOvDecoder> get_ggml_decoder(struct ggml_cgraph* cgraph, bool is_static, bool is_first_token);
+
+ov::Tensor get_ggml_graph_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, std::string& name);
+
+std::map<std::string, void*> get_ggml_graph_output_dst(std::shared_ptr<GgmlOvDecoder> ggml_decoder);
+
 size_t checksum(const void* data, size_t size);
 
 void print_input_tensor_info(const std::string& name, const ov::Tensor& tensor);
 
 void print_output_tensor_info(const std::string& name,
                               const ov::Tensor& tensor,
-                              std::map<std::string, void*>& output_dst);
+                              std::map<std::string, void*>& output_dst);
+
+template <typename T>
+std::vector<T> pad_input(const ggml_tensor* tensor, size_t padded_rows, size_t padded_cols, T pad_value) {
+    std::vector<T> padded_data(padded_rows * padded_cols, pad_value);
+    size_t rows = tensor->ne[1];
+    size_t cols = tensor->ne[0];
+    T* data = static_cast<T*>(tensor->data);
+
+    for (size_t i = 0; i < std::min(rows, padded_rows); ++i) {
+        for (size_t j = 0; j < std::min(cols, padded_cols); ++j) {
+            padded_data[i * padded_cols + j] = data[i * cols + j];
+        }
+    }
+    return padded_data;
+}
+
+void set_zero_diagonal(std::vector<float>& matrix, size_t dim);
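Note: pad_input copies a rows x cols ggml tensor into a padded buffer pre-filled with pad_value. For the prefill KQ_mask the pad value is -INFINITY, and set_zero_diagonal then zeroes the diagonal so the padding query rows still attend to themselves, presumably to avoid softmax over an all -inf row. The same logic on plain vectors, with hypothetical sizes:

    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
        // Hypothetical: 2 real tokens, max_token_len = 4.
        const size_t rows = 2, cols = 2, pad = 4;
        std::vector<float> mask = {0, -INFINITY, 0, 0}; // causal 2x2 mask
        std::vector<float> padded(pad * pad, -INFINITY);

        for (size_t i = 0; i < rows; ++i)           // what pad_input<float> does
            for (size_t j = 0; j < cols; ++j)
                padded[i * pad + j] = mask[i * cols + j];

        for (size_t i = 0; i < pad; ++i)            // what set_zero_diagonal does
            padded[i * pad + i] = 0.0f;

        printf("padded[3][3] = %.0f\n", padded[3 * pad + 3]); // 0, not -inf
        return 0;
    }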