Refactor: clean, fix warning
parent 42d4240937
commit 593484ce5f
@@ -140,7 +140,7 @@ int main(int argc, char ** argv) {
        std::string s(buf, n);
        printf("%s", s.c_str());
    }
    printf("\n");

    // prepare a batch for the prompt

    llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
@@ -247,8 +247,6 @@ set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
"ggml: sycl device architecture")

option(GGML_OPENVINO "ggml: use OPENVINO" OFF)
option(GGML_OPENVINO_DEBUG "ggml: enable OPENVINO debugging" OFF)
option(GGML_OV_FRONTEND "ggml: OPENVINO frontend path" ON)

option(GGML_OPENCL "ggml: use OpenCL" OFF)
option(GGML_OPENCL_PROFILING "ggml: use OpenCL profiling (increases overhead)" OFF)
@@ -0,0 +1,4 @@
+---
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+ReferenceAlignment: Left
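The three keys in this new clang-format file only pin alignment behavior. Roughly, their effect on C++ declarations looks like this (illustrative snippet, not part of the commit):

    // AlignConsecutiveAssignments/Declarations: false keeps single spaces
    // instead of padding names and '=' signs into columns:
    int64_t past_token_len = -1;
    int64_t attention_size = -1;
    // ReferenceAlignment: Left attaches '&' to the type rather than the name:
    void num_inputs_check(const NodeContext& context, size_t min_inputs, size_t max_inputs);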
@@ -225,9 +225,9 @@ void GgmlOvDecoder::set_max_token_len() {
}

void GgmlOvDecoder::add_extra_inputs() {
-    int64_t past_token_len;
+    int64_t past_token_len = -1;
    // attention_size not used for NPU
-    int64_t attention_size;
+    int64_t attention_size = -1;

    for (const auto& node : m_nodes) {
        if (node->op == GGML_OP_CPY && ggml_is_contiguous(node)) {
@@ -247,6 +247,9 @@ void GgmlOvDecoder::add_extra_inputs() {
            break;
        }
    }
+    if (past_token_len == -1) {
+        throw std::runtime_error("Failed to find input \"cache_k\" in the graph");
+    }
    for (const auto& node : m_nodes) {
        if (node->src[1] && std::string(node->src[1]->name).find("inp_tokens") == 0) {
            int64_t total_token_len = node->src[1]->ne[0] + past_token_len;
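The sentinel initialization above pairs with this new check: past_token_len starts at -1, and if the scan over m_nodes never finds the cache_k input, the function now throws instead of reading an uninitialized local (likely the warning the commit message refers to). A condensed sketch of the pattern, with the loop body elided:

    int64_t past_token_len = -1;  // sentinel: not found yet
    for (const auto& node : m_nodes) {
        // ... sets past_token_len when the "cache_k" input is found ...
    }
    if (past_token_len == -1) {
        throw std::runtime_error("Failed to find input \"cache_k\" in the graph");
    }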
@@ -61,11 +61,11 @@ public:

    virtual void visit_subgraph(std::function<void(std::shared_ptr<GgmlDecoder>)> node_visitor) const override;

-    const ggml_tensor* get_input_ggml_tensor(std::string& name) const {
+    const ggml_tensor* get_input_ggml_tensor(const std::string& name) const {
        return m_inputs.at(name);
    }

-    const ggml_tensor* get_output_ggml_tensor(std::string& name) const {
+    const ggml_tensor* get_output_ggml_tensor(const std::string& name) const {
        return m_outputs.at(name);
    }

@@ -1,22 +0,0 @@
-#include <openvino/op/add.hpp>
-
-#include "../node_context.hpp"
-#include "../utils.hpp"
-
-namespace ov {
-namespace frontend {
-namespace ggml {
-namespace op {
-
-OutputVector translate_add(const NodeContext& context) {
-    num_inputs_check(context, 2, 2);
-
-    auto res = std::make_shared<ov::op::v1::Add>(context.get_input(0), context.get_input(1));
-
-    return rename_outputs_with_suffix({res}, context.get_name());
-}
-
-} // namespace op
-} // namespace ggml
-} // namespace frontend
-} // namespace ov
@@ -7,6 +7,7 @@
#include <vector>

#include "../node_context.hpp"
+#include "../op_table.hpp"
#include "../utils.hpp"

namespace ov {
@@ -19,6 +19,7 @@
#include <vector>

#include "../node_context.hpp"
+#include "../op_table.hpp"
#include "../utils.hpp"

namespace ov {
@@ -7,6 +7,7 @@
#include <vector>

#include "../node_context.hpp"
+#include "../op_table.hpp"
#include "../utils.hpp"

namespace ov {
@@ -1,21 +0,0 @@
-#include <openvino/op/multiply.hpp>
-
-#include "../node_context.hpp"
-#include "../utils.hpp"
-
-namespace ov {
-namespace frontend {
-namespace ggml {
-namespace op {
-
-OutputVector translate_mul(const NodeContext& context) {
-    num_inputs_check(context, 2, 2);
-
-    auto res = std::make_shared<ov::op::v1::Multiply>(context.get_input(0), context.get_input(1));
-    return rename_outputs_with_suffix({res}, context.get_name());
-}
-
-} // namespace op
-} // namespace ggml
-} // namespace frontend
-} // namespace ov
@@ -13,6 +13,7 @@
#include <vector>

#include "../node_context.hpp"
+#include "../op_table.hpp"
#include "../utils.hpp"

namespace ov {
@@ -9,6 +9,7 @@
#include <openvino/op/transpose.hpp>

#include "../node_context.hpp"
+#include "../op_table.hpp"
#include "../utils.hpp"

namespace ov {
@@ -25,9 +26,8 @@ OutputVector translate_permute(const NodeContext& context) {

    if (op_case == 1) {
        auto perm = argsort_descend(context.get_output_stride(0));
-        auto res = std::make_shared<ov::op::v1::Transpose>(context.get_input(0),
-                                                           ov::op::v0::Constant::create(ov::element::i64, {3}, perm));
-        return rename_outputs_with_suffix({res}, context.get_name());
+        res = std::make_shared<ov::op::v1::Transpose>(context.get_input(0),
+                                                      ov::op::v0::Constant::create(ov::element::i64, { 3 }, perm));
    } else {
        auto src = context.get_input(0);
        auto attention_size = context.get_input("attention_size");
@@ -70,8 +70,8 @@ OutputVector translate_permute(const NodeContext& context) {
        } else {
            res = src_slice;
        }
-        return rename_outputs_with_suffix({res}, context.get_name());
    }
+    return rename_outputs_with_suffix({ res }, context.get_name());
}

} // namespace op
@@ -8,6 +8,7 @@
#include <vector>

#include "../node_context.hpp"
+#include "../op_table.hpp"
#include "../utils.hpp"

namespace ov {
@@ -7,6 +7,7 @@
#include <openvino/op/sqrt.hpp>

#include "../node_context.hpp"
+#include "../op_table.hpp"
#include "../utils.hpp"

namespace ov {
@@ -20,6 +20,7 @@
#include <vector>

#include "../node_context.hpp"
+#include "../op_table.hpp"
#include "../utils.hpp"

#ifndef M_PI
@@ -36,21 +37,19 @@ namespace frontend {
namespace ggml {
namespace op {

-static float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) {
-    return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float)M_PI)) / (2 * logf(base));
+namespace {
+float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) {
+    return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float) M_PI)) / (2 * logf(base));
}

-void ggml_rope_yarn_corr_dims(int n_dims,
-                              int n_ctx_orig,
-                              float freq_base,
-                              float beta_fast,
-                              float beta_slow,
+void ggml_rope_yarn_corr_dims(int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow,
                              float dims[2]) {
    float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base));
    float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base));
    dims[0] = MAX(0, start);
    dims[1] = MIN(n_dims - 1, end);
}
+} // namespace

OutputVector translate_rope(const NodeContext& context) {
    num_inputs_check(context, 2, 3);
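Moving the helpers from static into an unnamed namespace preserves their internal linkage; both spellings hide the symbols from other translation units, and the namespace form also extends to types. A minimal illustration (hypothetical helper, not from the commit):

    namespace {
    // visible only inside this .cpp file, exactly as a 'static' function would be
    float scaled(float x) { return 2.0f * x; }
    } // namespace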
@@ -67,7 +66,12 @@ OutputVector translate_rope(const NodeContext& context) {

    auto output_shape = context.get_output_shape(0);

-    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+    float freq_base;
+    float freq_scale;
+    float ext_factor;
+    float attn_factor;
+    float beta_fast;
+    float beta_slow;
    int32_t* op_params = context.get_output_op_params(0);
    const int n_dims = op_params[1];
    const int mode = op_params[2];
@@ -3,6 +3,7 @@
#include <vector>

#include "../node_context.hpp"
+#include "../op_table.hpp"
#include "../utils.hpp"

namespace ov {
@@ -1,5 +1,3 @@

#include <cstdint>
#include <memory>
#include <openvino/core/node.hpp>
#include <openvino/core/node_output.hpp>
@@ -13,6 +11,7 @@
#include <vector>

#include "../node_context.hpp"
+#include "../op_table.hpp"
#include "../utils.hpp"

namespace ov {
@@ -28,18 +27,18 @@ OutputVector translate_soft_max(const NodeContext& context) {

    float scale = 1.0f;
    float max_bias = 0.0f;
-    auto op_params = context.get_output_op_params(0);
+    auto * op_params = context.get_output_op_params(0);
    memcpy(&scale, (float*)op_params + 0, sizeof(float));
    memcpy(&max_bias, (float*)op_params + 1, sizeof(float));

-    const uint32_t n_head = context.get_input_shape(0)[0].get_length();
-    const uint32_t n_head_log2 = 1u << (uint32_t)floor(log2(n_head));
+    // const uint32_t n_head = context.get_input_shape(0)[0].get_length();
+    // const uint32_t n_head_log2 = 1u << (uint32_t)floor(log2(n_head));

    // const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
    // const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
-    const float slope = (max_bias > 0.0f) ? 1.0f : 1.0f;
+    // const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1)
+    //                   : 1.0f;
+    const float slope = 1.0;

    if (scale != 1.0f) {
        auto scale_node =
@@ -1,6 +1,7 @@
#include <openvino/op/transpose.hpp>

#include "../node_context.hpp"
+#include "../op_table.hpp"
#include "../utils.hpp"

namespace ov {
@@ -3,6 +3,7 @@
#include <openvino/op/sigmoid.hpp>

#include "../node_context.hpp"
+#include "../op_table.hpp"
#include "../utils.hpp"

namespace ov {
@@ -1,3 +1,4 @@
+#include "../op_table.hpp"
#include "../utils.hpp"

namespace ov {
@@ -9,55 +9,31 @@

#include "utils.hpp"

-using namespace ov::op;
namespace ov {
namespace frontend {
namespace ggml {

-namespace op {
-
-#define GGML_OP_CONVERTER(op) OutputVector op(const NodeContext& node)
-
-GGML_OP_CONVERTER(translate_add);
-GGML_OP_CONVERTER(translate_cont);
-GGML_OP_CONVERTER(translate_cpy);
-GGML_OP_CONVERTER(translate_get_rows);
-GGML_OP_CONVERTER(translate_mul);
-GGML_OP_CONVERTER(translate_mulmat);
-GGML_OP_CONVERTER(translate_permute);
-GGML_OP_CONVERTER(translate_reshape);
-GGML_OP_CONVERTER(translate_rms_norm);
-GGML_OP_CONVERTER(translate_rope);
-GGML_OP_CONVERTER(translate_scale);
-GGML_OP_CONVERTER(translate_unary_silu);
-GGML_OP_CONVERTER(translate_soft_max);
-GGML_OP_CONVERTER(translate_transpose);
-GGML_OP_CONVERTER(translate_unary);
-GGML_OP_CONVERTER(translate_view);
-
-} // namespace op
-
std::unordered_map<std::string, CreatorFunction> get_supported_ops() {
-    return {{"GGML_OP_ADD", op::translate_1to1_match_2_inputs<v1::Add>},
-            {"GGML_OP_ADD1", op::translate_1to1_match_2_inputs<v1::Add>},
-            {"GGML_OP_CONT", op::translate_cont},
-            {"GGML_OP_CPY", op::translate_cpy},
-            {"GGML_OP_DIV", op::translate_1to1_match_2_inputs<v1::Divide>},
-            {"GGML_OP_GET_ROWS", op::translate_get_rows},
-            // {"GGML_OP_MUL", op::translate_1to1_match_2_inputs<v1::Multiply>},
-            {"GGML_OP_MUL", op::translate_mul},
-            {"GGML_OP_MUL_MAT", op::translate_mulmat},
-            {"GGML_OP_PERMUTE", op::translate_permute},
-            {"GGML_OP_RESHAPE", op::translate_reshape},
-            {"GGML_OP_RMS_NORM", op::translate_rms_norm},
-            {"GGML_OP_ROPE", op::translate_rope},
-            {"GGML_OP_SCALE", op::translate_scale},
-            {"GGML_OP_SOFT_MAX", op::translate_soft_max},
-            {"GGML_OP_SUB", op::translate_1to1_match_2_inputs<v1::Subtract>},
-            {"GGML_OP_TRANSPOSE", op::translate_transpose},
-            {"GGML_UNARY_OP_SILU", op::translate_unary_silu},
-            {"GGML_OP_VIEW", op::translate_view}};
-};
+    using namespace ov::op;
+    return {{"GGML_OP_ADD", op::translate_1to1_match_2_inputs<v1::Add>},
+            {"GGML_OP_ADD1", op::translate_1to1_match_2_inputs<v1::Add>},
+            {"GGML_OP_CONT", op::translate_cont},
+            {"GGML_OP_CPY", op::translate_cpy},
+            {"GGML_OP_DIV", op::translate_1to1_match_2_inputs<v1::Divide>},
+            {"GGML_OP_GET_ROWS", op::translate_get_rows},
+            {"GGML_OP_MUL", op::translate_1to1_match_2_inputs<v1::Multiply>},
+            {"GGML_OP_MUL_MAT", op::translate_mulmat},
+            {"GGML_OP_PERMUTE", op::translate_permute},
+            {"GGML_OP_RESHAPE", op::translate_reshape},
+            {"GGML_OP_RMS_NORM", op::translate_rms_norm},
+            {"GGML_OP_ROPE", op::translate_rope},
+            {"GGML_OP_SCALE", op::translate_scale},
+            {"GGML_OP_SOFT_MAX", op::translate_soft_max},
+            {"GGML_OP_SUB", op::translate_1to1_match_2_inputs<v1::Subtract>},
+            {"GGML_OP_TRANSPOSE", op::translate_transpose},
+            {"GGML_UNARY_OP_SILU", op::translate_unary_silu},
+            {"GGML_OP_VIEW", op::translate_view}};
+}

} // namespace ggml
} // namespace frontend
@@ -6,6 +6,29 @@ namespace ov {
namespace frontend {
namespace ggml {

+namespace op {
+
+#define GGML_OP_CONVERTER(op) OutputVector op(const NodeContext& context)
+
+GGML_OP_CONVERTER(translate_add);
+GGML_OP_CONVERTER(translate_cont);
+GGML_OP_CONVERTER(translate_cpy);
+GGML_OP_CONVERTER(translate_get_rows);
+GGML_OP_CONVERTER(translate_mul);
+GGML_OP_CONVERTER(translate_mulmat);
+GGML_OP_CONVERTER(translate_permute);
+GGML_OP_CONVERTER(translate_reshape);
+GGML_OP_CONVERTER(translate_rms_norm);
+GGML_OP_CONVERTER(translate_rope);
+GGML_OP_CONVERTER(translate_scale);
+GGML_OP_CONVERTER(translate_unary_silu);
+GGML_OP_CONVERTER(translate_soft_max);
+GGML_OP_CONVERTER(translate_transpose);
+GGML_OP_CONVERTER(translate_unary);
+GGML_OP_CONVERTER(translate_view);
+
+} // namespace op
+
std::unordered_map<std::string, CreatorFunction> get_supported_ops();

} // namespace ggml
@@ -8,7 +8,9 @@ namespace ov {
namespace frontend {
namespace ggml {

-void dump_ov_model(const std::shared_ptr<ov::Model> model);
+std::string getCurrentTime();
+
+void dump_ov_model(std::shared_ptr<ov::Model> model);

void num_inputs_check(const NodeContext& context, size_t min_inputs, size_t max_inputs);
@@ -52,7 +54,8 @@ std::vector<T> permute(const std::vector<T>& x, const std::vector<int>& perm) {
    return result;
}

-std::shared_ptr<ov::Node> get_dimensions(const std::shared_ptr<op::v3::ShapeOf>& shape, const std::vector<int>& dims);
+std::shared_ptr<ov::Node> get_dimensions(const std::shared_ptr<ov::op::v3::ShapeOf>& shape,
+                                         const std::vector<int>& dims);
std::shared_ptr<ov::Node> get_dimensions(const std::shared_ptr<ov::Node>& node, const std::vector<int>& dims);

OutputVector rename_outputs_with_suffix(const OutputVector& outputs, const std::string& suffix);
@@ -61,7 +64,8 @@ namespace op {
template <typename T>
OutputVector translate_1to1_match_2_inputs(const NodeContext& context) {
    num_inputs_check(context, 2, 2);
-    return {std::make_shared<T>(context.get_input(0), context.get_input(1))};
+    auto res = std::make_shared<T>(context.get_input(0), context.get_input(1));
+    return rename_outputs_with_suffix({ res }, context.get_name());
}
} // namespace op

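Returning rename_outputs_with_suffix from translate_1to1_match_2_inputs is what lets the dedicated add/mul translators deleted above collapse into plain op-table entries. A sketch of what one instantiation effectively expands to, assuming the surrounding frontend types (the function name here is hypothetical):

    // {"GGML_OP_DIV", op::translate_1to1_match_2_inputs<v1::Divide>} behaves like:
    OutputVector translate_div_expanded(const NodeContext& context) {
        num_inputs_check(context, 2, 2);  // binary op: exactly two inputs
        auto res = std::make_shared<ov::op::v1::Divide>(context.get_input(0), context.get_input(1));
        return rename_outputs_with_suffix({ res }, context.get_name());
    }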
@@ -27,13 +27,15 @@ std::shared_ptr<GgmlOvDecoder> get_ggml_decoder(struct ggml_cgraph* cgraph, bool
    return std::make_shared<GgmlOvDecoder>(nullptr, cgraph, is_static, is_first_token);
}

-ov::Tensor get_ggml_graph_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, std::string& name) {
-    auto* input_data = ggml_decoder->get_input_ggml_tensor(name)->data;
-    ov::Tensor input_tensor;
-    ov::Shape input_shape = ggml_decoder->get_input_shape(name).to_shape();
-    std::vector<size_t> input_stride = ggml_decoder->get_input_stride(name);
-    input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data);
-    return input_tensor;
+ov::Tensor convert_ggml_input_to_ov(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
+                                    const std::string& name) {
+    auto *input_data = ggml_decoder->get_input_ggml_tensor(name)->data;
+    ov::Tensor input_tensor;
+    ov::Shape input_shape = ggml_decoder->get_input_shape(name).to_shape();
+    std::vector<size_t> input_stride = ggml_decoder->get_input_stride(name);
+    input_tensor =
+        ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data);
+    return input_tensor;
}

std::map<std::string, void*> get_ggml_graph_output_dst(std::shared_ptr<GgmlOvDecoder> ggml_decoder) {
@@ -59,30 +61,21 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c

    static std::string device = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : "";
    if (device.empty()) {
-        // Prefer GPU over CPU
-        for (const auto& dev : core.get_available_devices()) {
-            device = dev;
-            if (device == "GPU")
-                break;
+        const std::vector<std::string> preferred_device = {"GPU", "CPU", "NPU"};
+        const auto available_devices = core.get_available_devices();
+        for (const auto& dev : preferred_device) {
+            if (std::find(available_devices.begin(), available_devices.end(),
+                          dev) != available_devices.end()) {
+                device = dev;
+                break;
+            }
        }
    }

    bool is_static = device == "NPU" ? true : false;
    ov::AnyMap config;
    if (device == "NPU") {
-        config = {
-            { "NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=ReduceMean" },
-            { "NPU_USE_NPUW", "YES" },
-            { "NPUW_DEVICES", "NPU" },
-            { "NPUW_FOLD", "YES" },
-            { "NPUW_HOST_GATHER", "YES" },
-            { "NPUW_DQ", "YES" },
-            { "NPUW_FUNCALL_ASYNC", "YES" },
-            { "NPUW_WEIGHTS_BANK", "shared" },
-            // Option 'CACHE_DIR' is not supported with MLIR compiler type
-            // {"NPUW_CACHE_DIR", getenv("GGML_OPENVINO_CACHE_DIR") ? getenv("GGML_OPENVINO_CACHE_DIR") : ""},
-            { "NPU_COMPILER_TYPE", "MLIR" },
-        };
+        config = get_npu_config();
    }

    auto start_time = ggml_time_us();
@@ -179,48 +172,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
    auto ov_params = model->get_parameters();
    for (size_t i = 0; i < ov_params.size(); i++) {
        auto param_name = ov_params[i]->get_friendly_name();
-        ov::Tensor input_tensor;
-
-        if (ggml_decoder->get_model_extra_inputs().find(param_name) != ggml_decoder->get_model_extra_inputs().end()) {
-            input_tensor = *ggml_decoder->get_model_extra_input_values().at(param_name);
-
-        } else if (!is_static) {
-            input_tensor = get_ggml_graph_input_tensor(ggml_decoder, param_name);
-
-        } else {
-            if (param_name == "inp_tokens" || param_name == "inp_pos") {
-                if (is_first_token) {
-                    size_t max_token_len = ggml_decoder->get_max_token_len();
-                    const auto* input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name);
-                    std::vector<int32_t> padded_data = pad_input<int32_t>(input_tensor_ggml, 1, max_token_len, 0);
-                    input_tensor = ov::Tensor(ov::element::i32, ov::Shape{1, 1, max_token_len});
-                    auto* data_ptr = input_tensor.data<int32_t>();
-                    std::copy(padded_data.begin(), padded_data.end(), data_ptr);
-                } else {
-                    input_tensor = get_ggml_graph_input_tensor(ggml_decoder, param_name);
-                }
-
-            } else if (param_name == "KQ_mask") {
-                size_t max_token_len = ggml_decoder->get_max_token_len();
-                const auto* input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name);
-                if (is_first_token) {
-                    std::vector<float> padded_data =
-                        pad_input<float>(input_tensor_ggml, max_token_len, max_token_len, -INFINITY);
-                    set_zero_diagonal(padded_data, max_token_len);
-                    input_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, max_token_len, max_token_len});
-                    auto* data_ptr = input_tensor.data<float>();
-                    std::copy(padded_data.begin(), padded_data.end(), data_ptr);
-                } else {
-                    std::vector<float> padded_data = pad_input<float>(input_tensor_ggml, 1, max_token_len, -INFINITY);
-                    input_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, 1, max_token_len});
-                    auto* data_ptr = input_tensor.data<float>();
-                    std::copy(padded_data.begin(), padded_data.end(), data_ptr);
-                }
-
-            } else {
-                input_tensor = get_ggml_graph_input_tensor(ggml_decoder, param_name);
-            }
-        }
+        auto input_tensor = get_ov_input_tensor(ggml_decoder, param_name);
        infer_request.set_input_tensor(i, input_tensor);

        if (getenv("GGML_OPENVINO_DEBUG_INPUT")) {
@@ -258,6 +210,80 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
    GGML_UNUSED(backend);
}

+ov::AnyMap get_npu_config() {
+    ov::AnyMap config = {
+        { "NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=ReduceMean" },
+        { "NPU_USE_NPUW", "YES" },
+        { "NPUW_DEVICES", "NPU" },
+        { "NPUW_FOLD", "YES" },
+        { "NPUW_HOST_GATHER", "YES" },
+        { "NPUW_DQ", "YES" },
+        { "NPUW_FUNCALL_ASYNC", "YES" },
+        { "NPUW_WEIGHTS_BANK", "shared" },
+        // Option 'CACHE_DIR' is not supported with MLIR compiler type
+        // {"NPUW_CACHE_DIR", getenv("GGML_OPENVINO_CACHE_DIR") ? getenv("GGML_OPENVINO_CACHE_DIR") : ""},
+        { "NPU_COMPILER_TYPE", "MLIR" },
+    };
+    return config;
+}
+
+ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
+                               const std::string& param_name) {
+    bool is_static = ggml_decoder->is_static();
+    bool is_first_token = ggml_decoder->is_first_token();
+
+    ov::Tensor input_tensor;
+    if (ggml_decoder->get_model_extra_inputs().find(param_name) !=
+        ggml_decoder->get_model_extra_inputs().end()) {
+        input_tensor = *ggml_decoder->get_model_extra_input_values().at(param_name);
+
+    } else if (!is_static) {
+        input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name);
+
+    } else {
+        if (param_name == "inp_tokens" || param_name == "inp_pos") {
+            if (is_first_token) {
+                size_t max_token_len = ggml_decoder->get_max_token_len();
+                const auto *input_tensor_ggml =
+                    ggml_decoder->get_input_ggml_tensor(param_name);
+                std::vector<int32_t> padded_data =
+                    pad_input<int32_t>(input_tensor_ggml, 1, max_token_len, 0);
+                input_tensor =
+                    ov::Tensor(ov::element::i32, ov::Shape{1, 1, max_token_len});
+                auto *data_ptr = input_tensor.data<int32_t>();
+                std::copy(padded_data.begin(), padded_data.end(), data_ptr);
+            } else {
+                input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name);
+            }
+
+        } else if (param_name == "KQ_mask") {
+            size_t max_token_len = ggml_decoder->get_max_token_len();
+            const auto *input_tensor_ggml =
+                ggml_decoder->get_input_ggml_tensor(param_name);
+            if (is_first_token) {
+                std::vector<float> padded_data = pad_input<float>(
+                    input_tensor_ggml, max_token_len, max_token_len, -INFINITY);
+                set_zero_diagonal(padded_data, max_token_len);
+                input_tensor = ov::Tensor(ov::element::f32,
+                                          ov::Shape{1, max_token_len, max_token_len});
+                auto *data_ptr = input_tensor.data<float>();
+                std::copy(padded_data.begin(), padded_data.end(), data_ptr);
+            } else {
+                std::vector<float> padded_data =
+                    pad_input<float>(input_tensor_ggml, 1, max_token_len, -INFINITY);
+                input_tensor =
+                    ov::Tensor(ov::element::f32, ov::Shape{1, 1, max_token_len});
+                auto *data_ptr = input_tensor.data<float>();
+                std::copy(padded_data.begin(), padded_data.end(), data_ptr);
+            }
+
+        } else {
+            input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name);
+        }
+    }
+    return input_tensor;
+}
+
size_t checksum(const void* data, size_t size) {
    const uint8_t* bytes = static_cast<const uint8_t*>(data);
    size_t sum = 0;
@@ -268,22 +294,27 @@ size_t checksum(const void* data, size_t size) {
    return sum;
}

+// Suppress deprecation warning for ov::Tensor::data()
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+
void print_input_tensor_info(const std::string& name, const ov::Tensor& tensor) {
    std::cout << "Input name: " << name << ", Input shape: " << tensor.get_shape() << ", Address: " << tensor.data()
              << std::endl;
    switch (tensor.get_element_type()) {
    case ov::element::f32:
-        std::cout << *(float*)(tensor.data()) << std::endl;
-        break;
+        std::cout << *(tensor.data<float>()) << std::endl;
+        break;
    case ov::element::f16:
-        std::cout << ov::float16::from_bits(*(uint16_t*)(tensor.data())) << std::endl;
-        break;
+        std::cout << ov::float16::from_bits(*(tensor.data<uint16_t>()))
+                  << std::endl;
+        break;
    case ov::element::i32:
-        std::cout << *(int32_t*)(tensor.data()) << std::endl;
-        break;
+        std::cout << *(tensor.data<int32_t>()) << std::endl;
+        break;
    case ov::element::i64:
-        std::cout << *(int64_t*)(tensor.data()) << std::endl;
-        break;
+        std::cout << *(tensor.data<int64_t>()) << std::endl;
+        break;
    default:
        break;
    }
@@ -296,18 +327,21 @@ void print_output_tensor_info(const std::string& name,
              << ", Address: " << output_dst[name] << std::endl;
    switch (tensor.get_element_type()) {
    case ov::element::f32:
-        std::cout << *(float*)(tensor.data()) << std::endl;
-        std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl;
-        break;
+        std::cout << *(tensor.data<float>()) << std::endl;
+        std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl;
+        break;
    case ov::element::f16:
-        std::cout << ov::float16::from_bits(*(uint16_t*)(tensor.data())) << std::endl;
-        std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl;
-        break;
+        std::cout << ov::float16::from_bits(*(tensor.data<uint16_t>()))
+                  << std::endl;
+        std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl;
+        break;
    default:
        break;
    }
}

+#pragma GCC diagnostic pop
+
void set_zero_diagonal(std::vector<float>& matrix, size_t dim) {
    for (size_t i = 0; i < dim; ++i) {
        matrix[i * dim + i] = 0.0f;
@@ -8,7 +8,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c

std::shared_ptr<GgmlOvDecoder> get_ggml_decoder(struct ggml_cgraph* cgraph, bool is_static, bool is_first_token);

-ov::Tensor get_ggml_graph_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, std::string& name);
+ov::Tensor convert_ggml_input_to_ov(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string& name);

std::map<std::string, void*> get_ggml_graph_output_dst(std::shared_ptr<GgmlOvDecoder> ggml_decoder);
@@ -38,3 +38,7 @@ std::vector<T> pad_input(const ggml_tensor* tensor, size_t padded_rows, size_t p
void set_zero_diagonal(std::vector<float>& matrix, size_t dim);

bool is_prefill(struct ggml_cgraph * cgraph);
+
+ov::AnyMap get_npu_config();
+
+ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string& param_name);