diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index f7846382b9..2f7ae333e7 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -76,6 +76,9 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph) { m_cgraph = cgraph; for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { auto* cur_node = cgraph->nodes[node_n]; + if (cur_node->op == GGML_OP_NONE) { + continue; + } m_nodes.push_back(cur_node); set_input_output(cur_node, true); } diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 7edd4667d9..8c700445b2 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -258,12 +258,25 @@ static bool is_op_unsupported_case(const ggml_tensor* op) { } } + if (op->op == GGML_OP_PERMUTE) { + if (op->type == GGML_TYPE_BF16) { + // err msg: [GPU] Could not find a suitable kernel for transpose + GGML_LOG_WARN("OpenVINO backend does not support PERMUTE with BF16 type\n"); + return true; + } + } + if (op->op == GGML_OP_MUL_MAT) { if ((op->src[0]->view_src && op->src[0]->op != GGML_OP_PERMUTE) || (op->src[1]->view_src && op->src[1]->op != GGML_OP_PERMUTE)) { GGML_LOG_WARN("OpenVINO backend does not support MUL_MAT with view_src tensors that are not PERMUTE\n"); return true; } + if (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F16) { + // Has accuracy issue, try enabling this and see `test-backend-ops -o "MUL_MAT"` + GGML_LOG_WARN("OpenVINO backend does not support MUL_MAT with two F16 tensors\n"); + return true; + } } if (op->op == GGML_OP_ROPE) { diff --git a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp index 046cb93c8b..e072658ecb 100644 --- a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp +++ b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp @@ -53,7 +53,7 @@ OutputVector translate_soft_max(const NodeContext& context) { auto mask_node = context.get_input(1); - auto token_len = context.get_input("token_len"); + auto token_len = context.has_input("token_len") ? context.get_input("token_len") : get_dimensions(input_node, {1}); auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); std::shared_ptr mask_node_sliced = diff --git a/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.hpp b/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.hpp index 163422bf33..b40eaf4205 100644 --- a/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.hpp +++ b/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.hpp @@ -24,6 +24,6 @@ class TRANSFORMATIONS_API MarkCompressedFloatConstants; class ov::pass::MarkCompressedFloatConstants : public MatcherPass { public: - OPENVINO_MATCHER_PASS_RTTI("MarkCompressedFloatConstants"); + OPENVINO_MATCHER_PASS_RTTI("MarkCompressedFloatConstants") MarkCompressedFloatConstants(); }; diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 45ed73499f..a64637f950 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -81,7 +81,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c config = get_npu_config(); } - if (cgraph->n_nodes == 1) { + if (is_naive(cgraph)) { return naive_compute(cgraph, core, device, config); } @@ -250,11 +250,16 @@ ov::AnyMap get_npu_config() { return config; } +bool is_naive(struct ggml_cgraph* cgraph) { + constexpr int naive_graph_size_threshold = 20; + return cgraph->n_nodes < naive_graph_size_threshold; +} + enum ggml_status naive_compute(struct ggml_cgraph* cgraph, ov::Core& core, const std::string& device, const ov::AnyMap& config) { - if (cgraph->nodes[0]->op == GGML_OP_NONE) { + if (cgraph->n_nodes == 1 && cgraph->nodes[0]->op == GGML_OP_NONE) { return GGML_STATUS_SUCCESS; } @@ -264,8 +269,6 @@ enum ggml_status naive_compute(struct ggml_cgraph* cgraph, auto model = ov::frontend::ggml::FrontEnd::convert(input_model, naive); auto infer_request = core.compile_model(model, device, config).create_infer_request(); - ov::serialize(model, "IR.xml"); - auto ov_params = model->get_parameters(); for (size_t i = 0; i < ov_params.size(); i++) { auto param_name = ov_params[i]->get_friendly_name(); diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 367b2829be..0d71963f53 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -44,5 +44,7 @@ ov::AnyMap get_npu_config(); ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, const std::string& param_name); +bool is_naive(struct ggml_cgraph* cgraph); + enum ggml_status naive_compute(struct ggml_cgraph* cgraph, ov::Core& core, const std::string& device, const ov::AnyMap& config);