diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp
index 9e6c678e83..d09771d104 100644
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -140,7 +140,7 @@ int main(int argc, char ** argv) {
         std::string s(buf, n);
         printf("%s", s.c_str());
     }
-    printf("\n");
+
     // prepare a batch for the prompt
     llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index 2fa05ab90c..4c2d79a723 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -247,8 +247,6 @@ set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING "ggml: sycl device architecture")

 option(GGML_OPENVINO            "ggml: use OPENVINO"                              OFF)
-option(GGML_OPENVINO_DEBUG      "ggml: enable OPENVINO debugging"                 OFF)
-option(GGML_OV_FRONTEND         "ggml: OPENVINO frontend path"                    ON)
 option(GGML_OPENCL              "ggml: use OpenCL"                                OFF)
 option(GGML_OPENCL_PROFILING    "ggml: use OpenCL profiling (increases overhead)" OFF)
diff --git a/ggml/src/ggml-openvino/.clang-format b/ggml/src/ggml-openvino/.clang-format
new file mode 100644
index 0000000000..8491f4e5c6
--- /dev/null
+++ b/ggml/src/ggml-openvino/.clang-format
@@ -0,0 +1,4 @@
+---
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+ReferenceAlignment: Left
diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index 7b62f4487c..04f68a4950 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -225,9 +225,9 @@ void GgmlOvDecoder::set_max_token_len() {
 }

 void GgmlOvDecoder::add_extra_inputs() {
-    int64_t past_token_len;
+    int64_t past_token_len = -1;
     // attention_size not used for NPU
-    int64_t attention_size;
+    int64_t attention_size = -1;

     for (const auto& node : m_nodes) {
         if (node->op == GGML_OP_CPY && ggml_is_contiguous(node)) {
@@ -247,6 +247,9 @@ void GgmlOvDecoder::add_extra_inputs() {
             break;
         }
     }
+    if (past_token_len == -1) {
+        throw std::runtime_error("Failed to find input \"cache_k\" in the graph");
+    }
     for (const auto& node : m_nodes) {
         if (node->src[1] && std::string(node->src[1]->name).find("inp_tokens") == 0) {
             int64_t total_token_len = node->src[1]->ne[0] + past_token_len;
diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h
index 2c89d06267..b6b13d1f11 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.h
+++ b/ggml/src/ggml-openvino/ggml-decoder.h
@@ -61,11 +61,11 @@ public:

     virtual void visit_subgraph(std::function)> node_visitor) const override;

-    const ggml_tensor* get_input_ggml_tensor(std::string& name) const {
+    const ggml_tensor* get_input_ggml_tensor(const std::string& name) const {
         return m_inputs.at(name);
     }

-    const ggml_tensor* get_output_ggml_tensor(std::string& name) const {
+    const ggml_tensor* get_output_ggml_tensor(const std::string& name) const {
         return m_outputs.at(name);
     }

diff --git a/ggml/src/ggml-openvino/openvino/op/add.cpp b/ggml/src/ggml-openvino/openvino/op/add.cpp
deleted file mode 100644
index 5a75ff2148..0000000000
--- a/ggml/src/ggml-openvino/openvino/op/add.cpp
+++ /dev/null
@@ -1,22 +0,0 @@
-#include
-
-#include "../node_context.hpp"
-#include "../utils.hpp"
-
-namespace ov {
-namespace frontend {
-namespace ggml {
-namespace op {
-
-OutputVector translate_add(const NodeContext& context) {
-    num_inputs_check(context, 2, 2);
-
-    auto res = std::make_shared(context.get_input(0), context.get_input(1));
-
-    return rename_outputs_with_suffix({res}, context.get_name());
-}
-
-} // namespace op
-} // namespace ggml
-} // namespace frontend
-} // namespace ov
diff --git a/ggml/src/ggml-openvino/openvino/op/cont.cpp b/ggml/src/ggml-openvino/openvino/op/cont.cpp
index 7cdfba051e..5c6953caff 100644
--- a/ggml/src/ggml-openvino/openvino/op/cont.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/cont.cpp
@@ -7,6 +7,7 @@
 #include

 #include "../node_context.hpp"
+#include "../op_table.hpp"
 #include "../utils.hpp"

 namespace ov {
diff --git a/ggml/src/ggml-openvino/openvino/op/cpy.cpp b/ggml/src/ggml-openvino/openvino/op/cpy.cpp
index 4973645024..d27f4babb4 100644
--- a/ggml/src/ggml-openvino/openvino/op/cpy.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/cpy.cpp
@@ -19,6 +19,7 @@
 #include

 #include "../node_context.hpp"
+#include "../op_table.hpp"
 #include "../utils.hpp"

 namespace ov {
diff --git a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp
index ca36548d9f..9ed5f4deaf 100644
--- a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp
@@ -7,6 +7,7 @@
 #include

 #include "../node_context.hpp"
+#include "../op_table.hpp"
 #include "../utils.hpp"

 namespace ov {
diff --git a/ggml/src/ggml-openvino/openvino/op/mul.cpp b/ggml/src/ggml-openvino/openvino/op/mul.cpp
deleted file mode 100644
index 40caf4331e..0000000000
--- a/ggml/src/ggml-openvino/openvino/op/mul.cpp
+++ /dev/null
@@ -1,21 +0,0 @@
-#include
-
-#include "../node_context.hpp"
-#include "../utils.hpp"
-
-namespace ov {
-namespace frontend {
-namespace ggml {
-namespace op {
-
-OutputVector translate_mul(const NodeContext& context) {
-    num_inputs_check(context, 2, 2);
-
-    auto res = std::make_shared(context.get_input(0), context.get_input(1));
-    return rename_outputs_with_suffix({res}, context.get_name());
-}
-
-} // namespace op
-} // namespace ggml
-} // namespace frontend
-} // namespace ov
diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp
index b94f327a1f..d5a6ba2f03 100644
--- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp
@@ -13,6 +13,7 @@
 #include

 #include "../node_context.hpp"
+#include "../op_table.hpp"
 #include "../utils.hpp"

 namespace ov {
diff --git a/ggml/src/ggml-openvino/openvino/op/permute.cpp b/ggml/src/ggml-openvino/openvino/op/permute.cpp
index 8b246f75cd..09d15da427 100644
--- a/ggml/src/ggml-openvino/openvino/op/permute.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/permute.cpp
@@ -9,6 +9,7 @@
 #include

 #include "../node_context.hpp"
+#include "../op_table.hpp"
 #include "../utils.hpp"

 namespace ov {
@@ -25,9 +26,8 @@ OutputVector translate_permute(const NodeContext& context) {

     if (op_case == 1) {
         auto perm = argsort_descend(context.get_output_stride(0));
-        auto res = std::make_shared(context.get_input(0),
-                                    ov::op::v0::Constant::create(ov::element::i64, {3}, perm));
-        return rename_outputs_with_suffix({res}, context.get_name());
+        res = std::make_shared(context.get_input(0),
+                               ov::op::v0::Constant::create(ov::element::i64, { 3 }, perm));
     } else {
         auto src = context.get_input(0);
         auto attention_size = context.get_input("attention_size");
@@ -70,8 +70,8 @@ OutputVector translate_permute(const NodeContext& context) {
         } else {
             res = src_slice;
         }
-        return rename_outputs_with_suffix({res}, context.get_name());
     }
+    return rename_outputs_with_suffix({ res }, context.get_name());
 }

 } // namespace op
diff --git a/ggml/src/ggml-openvino/openvino/op/reshape.cpp b/ggml/src/ggml-openvino/openvino/op/reshape.cpp
index 49551eb815..3a695683bf 100644
--- a/ggml/src/ggml-openvino/openvino/op/reshape.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/reshape.cpp
@@ -8,6 +8,7 @@
 #include

 #include "../node_context.hpp"
+#include "../op_table.hpp"
 #include "../utils.hpp"

 namespace ov {
diff --git a/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp b/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp
index 4b230ad630..211692a3c7 100644
--- a/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp
@@ -7,6 +7,7 @@
 #include

 #include "../node_context.hpp"
+#include "../op_table.hpp"
 #include "../utils.hpp"

 namespace ov {
diff --git a/ggml/src/ggml-openvino/openvino/op/rope.cpp b/ggml/src/ggml-openvino/openvino/op/rope.cpp
index b47b8a6a54..78523e5781 100644
--- a/ggml/src/ggml-openvino/openvino/op/rope.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/rope.cpp
@@ -20,6 +20,7 @@
 #include

 #include "../node_context.hpp"
+#include "../op_table.hpp"
 #include "../utils.hpp"

 #ifndef M_PI
@@ -36,21 +37,19 @@ namespace frontend {
 namespace ggml {
 namespace op {

-static float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) {
-    return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float)M_PI)) / (2 * logf(base));
+namespace {
+float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) {
+    return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float) M_PI)) / (2 * logf(base));
 }

-void ggml_rope_yarn_corr_dims(int n_dims,
-                              int n_ctx_orig,
-                              float freq_base,
-                              float beta_fast,
-                              float beta_slow,
+void ggml_rope_yarn_corr_dims(int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow,
                               float dims[2]) {
     float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base));
     float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base));
     dims[0] = MAX(0, start);
     dims[1] = MIN(n_dims - 1, end);
 }
+} // namespace

 OutputVector translate_rope(const NodeContext& context) {
     num_inputs_check(context, 2, 3);
@@ -67,7 +66,12 @@ OutputVector translate_rope(const NodeContext& context) {

     auto output_shape = context.get_output_shape(0);

-    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+    float freq_base;
+    float freq_scale;
+    float ext_factor;
+    float attn_factor;
+    float beta_fast;
+    float beta_slow;
     int32_t* op_params = context.get_output_op_params(0);
     const int n_dims = op_params[1];
     const int mode = op_params[2];
diff --git a/ggml/src/ggml-openvino/openvino/op/scale.cpp b/ggml/src/ggml-openvino/openvino/op/scale.cpp
index 8f0999432c..783440ebd9 100644
--- a/ggml/src/ggml-openvino/openvino/op/scale.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/scale.cpp
@@ -3,6 +3,7 @@
 #include

 #include "../node_context.hpp"
+#include "../op_table.hpp"
 #include "../utils.hpp"

 namespace ov {
diff --git a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp
index bb6b002395..aeca9b3be5 100644
--- a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp
@@ -1,5 +1,3 @@
-
-#include
 #include
 #include
 #include
@@ -13,6 +11,7 @@
 #include

 #include "../node_context.hpp"
+#include "../op_table.hpp"
 #include "../utils.hpp"

 namespace ov {
@@ -28,18 +27,18 @@ OutputVector translate_soft_max(const NodeContext& context) {
     float scale = 1.0f;
     float max_bias = 0.0f;

-    auto op_params = context.get_output_op_params(0);
+    auto * op_params = context.get_output_op_params(0);
     memcpy(&scale, (float*)op_params + 0, sizeof(float));
     memcpy(&max_bias, (float*)op_params + 1, sizeof(float));

-    const uint32_t n_head = context.get_input_shape(0)[0].get_length();
-    const uint32_t n_head_log2 = 1u << (uint32_t)floor(log2(n_head));
+    // const uint32_t n_head = context.get_input_shape(0)[0].get_length();
+    // const uint32_t n_head_log2 = 1u << (uint32_t)floor(log2(n_head));
     // const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
     // const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

-    const float slope = (max_bias > 0.0f) ? 1.0f : 1.0f;
     // const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1)
     //                                       : 1.0f;
+    const float slope = 1.0;

     if (scale != 1.0f) {
         auto scale_node =
diff --git a/ggml/src/ggml-openvino/openvino/op/transpose.cpp b/ggml/src/ggml-openvino/openvino/op/transpose.cpp
index 99178a1944..b35f1fb861 100644
--- a/ggml/src/ggml-openvino/openvino/op/transpose.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/transpose.cpp
@@ -1,6 +1,7 @@
 #include

 #include "../node_context.hpp"
+#include "../op_table.hpp"
 #include "../utils.hpp"

 namespace ov {
diff --git a/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp b/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp
index 6c73653ca4..2b27c0be12 100644
--- a/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp
@@ -3,6 +3,7 @@
 #include

 #include "../node_context.hpp"
+#include "../op_table.hpp"
 #include "../utils.hpp"

 namespace ov {
diff --git a/ggml/src/ggml-openvino/openvino/op/view.cpp b/ggml/src/ggml-openvino/openvino/op/view.cpp
index fcfb9f732c..58143e667c 100644
--- a/ggml/src/ggml-openvino/openvino/op/view.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/view.cpp
@@ -1,3 +1,4 @@
+#include "../op_table.hpp"
 #include "../utils.hpp"

 namespace ov {
diff --git a/ggml/src/ggml-openvino/openvino/op_table.cpp b/ggml/src/ggml-openvino/openvino/op_table.cpp
index d588b2bff0..11d1c773c3 100644
--- a/ggml/src/ggml-openvino/openvino/op_table.cpp
+++ b/ggml/src/ggml-openvino/openvino/op_table.cpp
@@ -9,55 +9,31 @@

 #include "utils.hpp"

-using namespace ov::op;

 namespace ov {
 namespace frontend {
 namespace ggml {
-namespace op {
-
-#define GGML_OP_CONVERTER(op) OutputVector op(const NodeContext& node)
-
-GGML_OP_CONVERTER(translate_add);
-GGML_OP_CONVERTER(translate_cont);
-GGML_OP_CONVERTER(translate_cpy);
-GGML_OP_CONVERTER(translate_get_rows);
-GGML_OP_CONVERTER(translate_mul);
-GGML_OP_CONVERTER(translate_mulmat);
-GGML_OP_CONVERTER(translate_permute);
-GGML_OP_CONVERTER(translate_reshape);
-GGML_OP_CONVERTER(translate_rms_norm);
-GGML_OP_CONVERTER(translate_rope);
-GGML_OP_CONVERTER(translate_scale);
-GGML_OP_CONVERTER(translate_unary_silu);
-GGML_OP_CONVERTER(translate_soft_max);
-GGML_OP_CONVERTER(translate_transpose);
-GGML_OP_CONVERTER(translate_unary);
-GGML_OP_CONVERTER(translate_view);
-
-} // namespace op
-
 std::unordered_map get_supported_ops() {
-    return {{"GGML_OP_ADD", op::translate_1to1_match_2_inputs},
-            {"GGML_OP_ADD1", op::translate_1to1_match_2_inputs},
-            {"GGML_OP_CONT", op::translate_cont},
-            {"GGML_OP_CPY", op::translate_cpy},
-            {"GGML_OP_DIV", op::translate_1to1_match_2_inputs},
-            {"GGML_OP_GET_ROWS", op::translate_get_rows},
-            // {"GGML_OP_MUL", op::translate_1to1_match_2_inputs},
-            {"GGML_OP_MUL", op::translate_mul},
-            {"GGML_OP_MUL_MAT", op::translate_mulmat},
-            {"GGML_OP_PERMUTE", op::translate_permute},
-            {"GGML_OP_RESHAPE", op::translate_reshape},
-            {"GGML_OP_RMS_NORM", op::translate_rms_norm},
-            {"GGML_OP_ROPE", op::translate_rope},
-            {"GGML_OP_SCALE", op::translate_scale},
-            {"GGML_OP_SOFT_MAX", op::translate_soft_max},
-            {"GGML_OP_SUB", op::translate_1to1_match_2_inputs},
-            {"GGML_OP_TRANSPOSE", op::translate_transpose},
-            {"GGML_UNARY_OP_SILU", op::translate_unary_silu},
-            {"GGML_OP_VIEW", op::translate_view}};
-};
+    using namespace ov::op;
+    return {{"GGML_OP_ADD", op::translate_1to1_match_2_inputs},
+            {"GGML_OP_ADD1", op::translate_1to1_match_2_inputs},
+            {"GGML_OP_CONT", op::translate_cont},
+            {"GGML_OP_CPY", op::translate_cpy},
+            {"GGML_OP_DIV", op::translate_1to1_match_2_inputs},
+            {"GGML_OP_GET_ROWS", op::translate_get_rows},
+            {"GGML_OP_MUL", op::translate_1to1_match_2_inputs},
+            {"GGML_OP_MUL_MAT", op::translate_mulmat},
+            {"GGML_OP_PERMUTE", op::translate_permute},
+            {"GGML_OP_RESHAPE", op::translate_reshape},
+            {"GGML_OP_RMS_NORM", op::translate_rms_norm},
+            {"GGML_OP_ROPE", op::translate_rope},
+            {"GGML_OP_SCALE", op::translate_scale},
+            {"GGML_OP_SOFT_MAX", op::translate_soft_max},
+            {"GGML_OP_SUB", op::translate_1to1_match_2_inputs},
+            {"GGML_OP_TRANSPOSE", op::translate_transpose},
+            {"GGML_UNARY_OP_SILU", op::translate_unary_silu},
+            {"GGML_OP_VIEW", op::translate_view}};
+}

 } // namespace ggml
 } // namespace frontend
diff --git a/ggml/src/ggml-openvino/openvino/op_table.hpp b/ggml/src/ggml-openvino/openvino/op_table.hpp
index 1a71a06c18..d576c2a135 100644
--- a/ggml/src/ggml-openvino/openvino/op_table.hpp
+++ b/ggml/src/ggml-openvino/openvino/op_table.hpp
@@ -6,6 +6,29 @@
 namespace ov {
 namespace frontend {
 namespace ggml {
+namespace op {
+
+#define GGML_OP_CONVERTER(op) OutputVector op(const NodeContext& context)
+
+GGML_OP_CONVERTER(translate_add);
+GGML_OP_CONVERTER(translate_cont);
+GGML_OP_CONVERTER(translate_cpy);
+GGML_OP_CONVERTER(translate_get_rows);
+GGML_OP_CONVERTER(translate_mul);
+GGML_OP_CONVERTER(translate_mulmat);
+GGML_OP_CONVERTER(translate_permute);
+GGML_OP_CONVERTER(translate_reshape);
+GGML_OP_CONVERTER(translate_rms_norm);
+GGML_OP_CONVERTER(translate_rope);
+GGML_OP_CONVERTER(translate_scale);
+GGML_OP_CONVERTER(translate_unary_silu);
+GGML_OP_CONVERTER(translate_soft_max);
+GGML_OP_CONVERTER(translate_transpose);
+GGML_OP_CONVERTER(translate_unary);
+GGML_OP_CONVERTER(translate_view);
+
+} // namespace op
+
 std::unordered_map get_supported_ops();

 } // namespace ggml
diff --git a/ggml/src/ggml-openvino/openvino/utils.hpp b/ggml/src/ggml-openvino/openvino/utils.hpp
index e0fe250789..1896f81427 100644
--- a/ggml/src/ggml-openvino/openvino/utils.hpp
+++ b/ggml/src/ggml-openvino/openvino/utils.hpp
@@ -8,7 +8,9 @@ namespace ov {
 namespace frontend {
 namespace ggml {

-void dump_ov_model(const std::shared_ptr model);
+std::string getCurrentTime();
+
+void dump_ov_model(std::shared_ptr model);

 void num_inputs_check(const NodeContext& context, size_t min_inputs, size_t max_inputs);

@@ -52,7 +54,8 @@ std::vector permute(const std::vector& x, const std::vector& perm) {
     return result;
 }

-std::shared_ptr get_dimensions(const std::shared_ptr& shape, const std::vector& dims);
+std::shared_ptr get_dimensions(const std::shared_ptr& shape,
+                               const std::vector& dims);
 std::shared_ptr get_dimensions(const std::shared_ptr& node, const std::vector& dims);

 OutputVector rename_outputs_with_suffix(const OutputVector& outputs, const std::string& suffix);
@@ -61,7 +64,8 @@ namespace op {
 template
 OutputVector translate_1to1_match_2_inputs(const NodeContext& context) {
     num_inputs_check(context, 2, 2);
-    return {std::make_shared(context.get_input(0), context.get_input(1))};
+    auto res = std::make_shared(context.get_input(0), context.get_input(1));
+    return rename_outputs_with_suffix({ res }, context.get_name());
 }

 } // namespace op
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index 44356209ce..ebcf8fdd75 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -27,13 +27,15 @@ std::shared_ptr get_ggml_decoder(struct ggml_cgraph* cgraph, bool
     return std::make_shared(nullptr, cgraph, is_static, is_first_token);
 }

-ov::Tensor get_ggml_graph_input_tensor(std::shared_ptr ggml_decoder, std::string& name) {
-    auto* input_data = ggml_decoder->get_input_ggml_tensor(name)->data;
-    ov::Tensor input_tensor;
-    ov::Shape input_shape = ggml_decoder->get_input_shape(name).to_shape();
-    std::vector input_stride = ggml_decoder->get_input_stride(name);
-    input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data);
-    return input_tensor;
+ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder,
+                                    const std::string& name) {
+    auto *input_data = ggml_decoder->get_input_ggml_tensor(name)->data;
+    ov::Tensor input_tensor;
+    ov::Shape input_shape = ggml_decoder->get_input_shape(name).to_shape();
+    std::vector input_stride = ggml_decoder->get_input_stride(name);
+    input_tensor =
+        ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data);
+    return input_tensor;
 }

 std::map get_ggml_graph_output_dst(std::shared_ptr ggml_decoder) {
@@ -59,30 +61,21 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c

     static std::string device = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : "";
     if (device.empty()) {
-        // Prefer GPU over CPU
-        for (const auto& dev : core.get_available_devices()) {
-            device = dev;
-            if (device == "GPU")
-                break;
+        const std::vector preferred_device = {"GPU", "CPU", "NPU"};
+        const auto available_devices = core.get_available_devices();
+        for (const auto& dev : preferred_device) {
+            if (std::find(available_devices.begin(), available_devices.end(),
+                          dev) != available_devices.end()) {
+                device = dev;
+                break;
             }
+        }
     }

     bool is_static = device == "NPU" ? true : false;
     ov::AnyMap config;
     if (device == "NPU") {
-        config = {
-            { "NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=ReduceMean" },
-            { "NPU_USE_NPUW", "YES" },
-            { "NPUW_DEVICES", "NPU" },
-            { "NPUW_FOLD", "YES" },
-            { "NPUW_HOST_GATHER", "YES" },
-            { "NPUW_DQ", "YES" },
-            { "NPUW_FUNCALL_ASYNC", "YES" },
-            { "NPUW_WEIGHTS_BANK", "shared" },
-            // Option 'CACHE_DIR' is not supported with MLIR compiler type
-            // {"NPUW_CACHE_DIR", getenv("GGML_OPENVINO_CACHE_DIR") ? getenv("GGML_OPENVINO_CACHE_DIR") : ""},
-            { "NPU_COMPILER_TYPE", "MLIR" },
-        };
+        config = get_npu_config();
     }

     auto start_time = ggml_time_us();
@@ -179,48 +172,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
     auto ov_params = model->get_parameters();
     for (size_t i = 0; i < ov_params.size(); i++) {
         auto param_name = ov_params[i]->get_friendly_name();
-        ov::Tensor input_tensor;
-
-        if (ggml_decoder->get_model_extra_inputs().find(param_name) != ggml_decoder->get_model_extra_inputs().end()) {
-            input_tensor = *ggml_decoder->get_model_extra_input_values().at(param_name);
-
-        } else if (!is_static) {
-            input_tensor = get_ggml_graph_input_tensor(ggml_decoder, param_name);
-
-        } else {
-            if (param_name == "inp_tokens" || param_name == "inp_pos") {
-                if (is_first_token) {
-                    size_t max_token_len = ggml_decoder->get_max_token_len();
-                    const auto* input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name);
-                    std::vector padded_data = pad_input(input_tensor_ggml, 1, max_token_len, 0);
-                    input_tensor = ov::Tensor(ov::element::i32, ov::Shape{1, 1, max_token_len});
-                    auto* data_ptr = input_tensor.data();
-                    std::copy(padded_data.begin(), padded_data.end(), data_ptr);
-                } else {
-                    input_tensor = get_ggml_graph_input_tensor(ggml_decoder, param_name);
-                }
-
-            } else if (param_name == "KQ_mask") {
-                size_t max_token_len = ggml_decoder->get_max_token_len();
-                const auto* input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name);
-                if (is_first_token) {
-                    std::vector padded_data =
-                        pad_input(input_tensor_ggml, max_token_len, max_token_len, -INFINITY);
-                    set_zero_diagonal(padded_data, max_token_len);
-                    input_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, max_token_len, max_token_len});
-                    auto* data_ptr = input_tensor.data();
-                    std::copy(padded_data.begin(), padded_data.end(), data_ptr);
-                } else {
-                    std::vector padded_data = pad_input(input_tensor_ggml, 1, max_token_len, -INFINITY);
-                    input_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, 1, max_token_len});
-                    auto* data_ptr = input_tensor.data();
-                    std::copy(padded_data.begin(), padded_data.end(), data_ptr);
-                }
-
-            } else {
-                input_tensor = get_ggml_graph_input_tensor(ggml_decoder, param_name);
-            }
-        }
+        auto input_tensor = get_ov_input_tensor(ggml_decoder, param_name);
         infer_request.set_input_tensor(i, input_tensor);

         if (getenv("GGML_OPENVINO_DEBUG_INPUT")) {
@@ -258,6 +210,80 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
     GGML_UNUSED(backend);
 }

+ov::AnyMap get_npu_config() {
+    ov::AnyMap config = {
+        { "NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=ReduceMean" },
+        { "NPU_USE_NPUW", "YES" },
+        { "NPUW_DEVICES", "NPU" },
+        { "NPUW_FOLD", "YES" },
+        { "NPUW_HOST_GATHER", "YES" },
+        { "NPUW_DQ", "YES" },
+        { "NPUW_FUNCALL_ASYNC", "YES" },
+        { "NPUW_WEIGHTS_BANK", "shared" },
+        // Option 'CACHE_DIR' is not supported with MLIR compiler type
+        // {"NPUW_CACHE_DIR", getenv("GGML_OPENVINO_CACHE_DIR") ? getenv("GGML_OPENVINO_CACHE_DIR") : ""},
+        { "NPU_COMPILER_TYPE", "MLIR" },
+    };
+    return config;
+}
+
+ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder,
+                               const std::string& param_name) {
+    bool is_static = ggml_decoder->is_static();
+    bool is_first_token = ggml_decoder->is_first_token();
+
+    ov::Tensor input_tensor;
+    if (ggml_decoder->get_model_extra_inputs().find(param_name) !=
+        ggml_decoder->get_model_extra_inputs().end()) {
+        input_tensor = *ggml_decoder->get_model_extra_input_values().at(param_name);
+
+    } else if (!is_static) {
+        input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name);
+
+    } else {
+        if (param_name == "inp_tokens" || param_name == "inp_pos") {
+            if (is_first_token) {
+                size_t max_token_len = ggml_decoder->get_max_token_len();
+                const auto *input_tensor_ggml =
+                    ggml_decoder->get_input_ggml_tensor(param_name);
+                std::vector padded_data =
+                    pad_input(input_tensor_ggml, 1, max_token_len, 0);
+                input_tensor =
+                    ov::Tensor(ov::element::i32, ov::Shape{1, 1, max_token_len});
+                auto *data_ptr = input_tensor.data();
+                std::copy(padded_data.begin(), padded_data.end(), data_ptr);
+            } else {
+                input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name);
+            }
+
+        } else if (param_name == "KQ_mask") {
+            size_t max_token_len = ggml_decoder->get_max_token_len();
+            const auto *input_tensor_ggml =
+                ggml_decoder->get_input_ggml_tensor(param_name);
+            if (is_first_token) {
+                std::vector padded_data = pad_input(
+                    input_tensor_ggml, max_token_len, max_token_len, -INFINITY);
+                set_zero_diagonal(padded_data, max_token_len);
+                input_tensor = ov::Tensor(ov::element::f32,
+                                          ov::Shape{1, max_token_len, max_token_len});
+                auto *data_ptr = input_tensor.data();
+                std::copy(padded_data.begin(), padded_data.end(), data_ptr);
+            } else {
+                std::vector padded_data =
+                    pad_input(input_tensor_ggml, 1, max_token_len, -INFINITY);
+                input_tensor =
+                    ov::Tensor(ov::element::f32, ov::Shape{1, 1, max_token_len});
+                auto *data_ptr = input_tensor.data();
+                std::copy(padded_data.begin(), padded_data.end(), data_ptr);
+            }
+
+        } else {
+            input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name);
+        }
+    }
+    return input_tensor;
+}
+
 size_t checksum(const void* data, size_t size) {
     const uint8_t* bytes = static_cast(data);
     size_t sum = 0;
@@ -268,22 +294,27 @@ size_t checksum(const void* data, size_t size) {
     return sum;
 }

+// Suppress deprecation warning for ov::Tensor::data()
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+
 void print_input_tensor_info(const std::string& name, const ov::Tensor& tensor) {
     std::cout << "Input name: " << name << ", Input shape: " << tensor.get_shape()
               << ", Address: " << tensor.data() << std::endl;
     switch (tensor.get_element_type()) {
     case ov::element::f32:
-        std::cout << *(float*)(tensor.data()) << std::endl;
-        break;
+        std::cout << *(tensor.data()) << std::endl;
+        break;
     case ov::element::f16:
-        std::cout << ov::float16::from_bits(*(uint16_t*)(tensor.data())) << std::endl;
-        break;
+        std::cout << ov::float16::from_bits(*(tensor.data()))
+                  << std::endl;
+        break;
     case ov::element::i32:
-        std::cout << *(int32_t*)(tensor.data()) << std::endl;
-        break;
+        std::cout << *(tensor.data()) << std::endl;
+        break;
     case ov::element::i64:
-        std::cout << *(int64_t*)(tensor.data()) << std::endl;
-        break;
+        std::cout << *(tensor.data()) << std::endl;
+        break;
     default:
         break;
     }
@@ -296,18 +327,21 @@ void print_output_tensor_info(const std::string& name,
               << ", Address: " << output_dst[name] << std::endl;
     switch (tensor.get_element_type()) {
     case ov::element::f32:
-        std::cout << *(float*)(tensor.data()) << std::endl;
-        std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl;
-        break;
+        std::cout << *(tensor.data()) << std::endl;
+        std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl;
+        break;
     case ov::element::f16:
-        std::cout << ov::float16::from_bits(*(uint16_t*)(tensor.data())) << std::endl;
-        std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl;
-        break;
+        std::cout << ov::float16::from_bits(*(tensor.data()))
+                  << std::endl;
+        std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl;
+        break;
     default:
         break;
     }
 }

+#pragma GCC diagnostic pop
+
 void set_zero_diagonal(std::vector& matrix, size_t dim) {
     for (size_t i = 0; i < dim; ++i) {
         matrix[i * dim + i] = 0.0f;
diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h
index 2427b0b1ce..1d23e28522 100644
--- a/ggml/src/ggml-openvino/utils.h
+++ b/ggml/src/ggml-openvino/utils.h
@@ -8,7 +8,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c

 std::shared_ptr get_ggml_decoder(struct ggml_cgraph* cgraph, bool is_static, bool is_first_token);

-ov::Tensor get_ggml_graph_input_tensor(std::shared_ptr ggml_decoder, std::string& name);
+ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, const std::string& name);

 std::map get_ggml_graph_output_dst(std::shared_ptr ggml_decoder);

@@ -38,3 +38,7 @@ std::vector pad_input(const ggml_tensor* tensor, size_t padded_rows, size_t p
 void set_zero_diagonal(std::vector& matrix, size_t dim);

 bool is_prefill(struct ggml_cgraph * cgraph);
+
+ov::AnyMap get_npu_config();
+
+ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, const std::string& param_name);
diff --git a/setup.sh b/setup.sh
deleted file mode 100755
index 697639dd14..0000000000
--- a/setup.sh
+++ /dev/null
@@ -1,2 +0,0 @@
-cmake --build build --parallel $(nproc)
-