diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index 09919c8505..0ee2338199 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -73,6 +73,11 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph,
 }
 GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph) {
+    if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) {
+        std::string filename = "cgraph.txt";
+        dump_cgraph(cgraph, filename);
+    }
+
     m_cgraph = cgraph;
     for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
         auto* cur_node = cgraph->nodes[node_n];
@@ -173,32 +178,33 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) {
         break;
     }
     case GGML_OP_CONT: {
-        if (ggml_nelements(node->src[0]) == ggml_nelements(node->src[0]->view_src)) {
-            // The input comes from a PERMUTE
+        if (node->src[0]->op == GGML_OP_PERMUTE) {
             m_op_case = 1;
-        } else {
+        } else if (node->src[0]->op == GGML_OP_TRANSPOSE) {
+            m_op_case = 2;
+        } else if (node->src[0]->op == GGML_OP_VIEW) {
             // The input comes from a VIEW which is subtensor
-            m_op_case = 2;
-        }
-        break;
-    }
-    case GGML_OP_SET_ROWS: {
-        if (std::string(node->name).find("cache_k") == 0) {
-            m_op_case = 1;
-        } else {
-            m_op_case = 2;
+            m_op_case = 3;
         }
         break;
     }
     case GGML_OP_PERMUTE: {
-        if (node->src[0]->view_src == nullptr) {
-            // Permute Qcur
+        if (node->src[0]->op != GGML_OP_VIEW) {
             m_op_case = 1;
         } else if (ggml_is_contiguous(node->src[0])) {
             // Permute cache_k (view)
             m_op_case = 2;
         } else {
-            // Permute cache_v (view)
+            // Permute cache_v (view), deprecated, cache_v will also fall to case 2
+            m_op_case = 3;
+        }
+        break;
+    }
+    case GGML_OP_MUL_MAT: {
+        if (node->src[0]->op == GGML_OP_CONT && node->src[0]->src[0]->op == GGML_OP_TRANSPOSE) {
+            m_op_case = 2;
+        } else if (node->src[0]->op == GGML_OP_VIEW && node->src[1]->op == GGML_OP_VIEW) {
+            // test-backend-ops case
             m_op_case = 3;
         }
         break;
@@ -206,16 +212,12 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) {
     case GGML_OP_GET_ROWS: {
         if (node->src[1]->op == GGML_OP_VIEW) {
             m_op_case = 2;
-        } else {
-            m_op_case = 1;
         }
         break;
     }
     case GGML_OP_ROPE: {
         if (node->src[0]->op == GGML_OP_VIEW) {
             m_op_case = 2;
-        } else {
-            m_op_case = 1;
         }
         break;
     }
@@ -270,19 +272,9 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co
     } else if (name.find("cache_k") == 0) {
         input_shape = ov::PartialShape{m_context_size, m_num_heads_kv, m_head_size};
     } else if (name.find("cache_v") == 0) {
-        input_shape = ov::PartialShape{m_num_heads_kv, m_head_size, m_context_size};
+        input_shape = ov::PartialShape{m_context_size, m_num_heads_kv, m_head_size};
     } else if (const auto* op = get_tensor_used_op(src); op && op->op == GGML_OP_SET_ROWS) {
-        input_shape = ov::PartialShape{1, 1, -1};
-        if (m_is_static) {
-            if (m_is_first_token) {
-                // Dummy static shape, since the indices are not used in this case
-                input_shape = ov::PartialShape{1};
-            } else if (std::string(op->name).find("cache_k") == 0) {
-                input_shape = ov::PartialShape{1, 1, 1};
-            } else {
-                input_shape = ov::PartialShape{1, 1, m_num_heads_kv * m_head_size};
-            }
-        }
+        input_shape = ov::PartialShape{1, 1, m_is_static ? 1 : -1};
     } else if (src->op == GGML_OP_VIEW) {
         // This case is added to make test-backend-ops work
         input_shape = ov::PartialShape{get_shape(src->view_src)};
@@ -610,26 +602,28 @@ void GgmlOvDecoder::visit_subgraph(std::function ops = {
-        {GGML_OP_NONE,      "GGML_OP_NONE"     },
-        {GGML_OP_ACC,       "GGML_OP_ACC"      },
-        {GGML_OP_ADD,       "GGML_OP_ADD"      },
-        {GGML_OP_ADD1,      "GGML_OP_ADD1"     },
-        {GGML_OP_CONT,      "GGML_OP_CONT"     },
-        {GGML_OP_DIV,       "GGML_OP_DIV"      },
-        {GGML_OP_DUP,       "GGML_OP_DUP"      },
-        {GGML_OP_GET_ROWS,  "GGML_OP_GET_ROWS" },
-        {GGML_OP_MUL,       "GGML_OP_MUL"      },
-        {GGML_OP_MUL_MAT,   "GGML_OP_MUL_MAT"  },
-        {GGML_OP_PERMUTE,   "GGML_OP_PERMUTE"  },
-        {GGML_OP_RESHAPE,   "GGML_OP_RESHAPE"  },
-        {GGML_OP_RMS_NORM,  "GGML_OP_RMS_NORM" },
-        {GGML_OP_ROPE,      "GGML_OP_ROPE"     },
-        {GGML_OP_SCALE,     "GGML_OP_SCALE"    },
-        {GGML_OP_SOFT_MAX,  "GGML_OP_SOFT_MAX" },
-        {GGML_OP_SUB,       "GGML_OP_SUB"      },
-        {GGML_OP_TRANSPOSE, "GGML_OP_TRANSPOSE"},
-        {GGML_OP_VIEW,      "GGML_OP_VIEW"     },
-        {GGML_OP_SET_ROWS,  "GGML_OP_SET_ROWS" },
+        {GGML_OP_NONE,           "GGML_OP_NONE"          },
+        {GGML_OP_ACC,            "GGML_OP_ACC"           },
+        {GGML_OP_ADD,            "GGML_OP_ADD"           },
+        {GGML_OP_ADD1,           "GGML_OP_ADD1"          },
+        {GGML_OP_CONT,           "GGML_OP_CONT"          },
+        {GGML_OP_DIV,            "GGML_OP_DIV"           },
+        {GGML_OP_DUP,            "GGML_OP_DUP"           },
+        {GGML_OP_GET_ROWS,       "GGML_OP_GET_ROWS"      },
+        {GGML_OP_MUL,            "GGML_OP_MUL"           },
+        {GGML_OP_MUL_MAT,        "GGML_OP_MUL_MAT"       },
+        {GGML_OP_PERMUTE,        "GGML_OP_PERMUTE"       },
+        {GGML_OP_RESHAPE,        "GGML_OP_RESHAPE"       },
+        {GGML_OP_RMS_NORM,       "GGML_OP_RMS_NORM"      },
+        {GGML_OP_ROPE,           "GGML_OP_ROPE"          },
+        {GGML_OP_SCALE,          "GGML_OP_SCALE"         },
+        {GGML_OP_SOFT_MAX,       "GGML_OP_SOFT_MAX"      },
+        {GGML_OP_SUB,            "GGML_OP_SUB"           },
+        {GGML_OP_TRANSPOSE,      "GGML_OP_TRANSPOSE"     },
+        {GGML_OP_VIEW,           "GGML_OP_VIEW"          },
+        {GGML_OP_SET_ROWS,       "GGML_OP_SET_ROWS"      },
+        {GGML_OP_CPY,            "GGML_OP_CPY"           },
+        {GGML_OP_FLASH_ATTN_EXT, "GGML_OP_FLASH_ATTN_EXT"},
     };
     static const std::map unary_ops = {
         {GGML_UNARY_OP_ABS, "GGML_UNARY_OP_ABS" },
diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp
index 13c2ef7462..e3eaf40254 100644
--- a/ggml/src/ggml-openvino/ggml-openvino.cpp
+++ b/ggml/src/ggml-openvino/ggml-openvino.cpp
@@ -270,12 +272,14 @@ static bool is_op_unsupported_case(const ggml_tensor* op) {
         }
     }
-    if (op->op == GGML_OP_MUL_MAT) {
-        if ((op->src[0]->view_src && op->src[0]->op != GGML_OP_PERMUTE) ||
-            (op->src[1]->view_src && op->src[1]->op != GGML_OP_PERMUTE)) {
-            GGML_LOG_WARN("OpenVINO backend does not support MUL_MAT with view_src tensors that are not PERMUTE\n");
+    if (op->op == GGML_OP_CPY) {
+        if (op->src[1] != op) {
+            GGML_LOG_WARN("OpenVINO backend only supports CPY that is a cast\n");
             return true;
         }
+    }
+
+    if (op->op == GGML_OP_MUL_MAT) {
         if (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F16) {
             // Has accuracy issue, try enabling this and see `test-backend-ops -o "MUL_MAT"`
             GGML_LOG_WARN("OpenVINO backend does not support MUL_MAT with two F16 tensors\n");
@@ -346,7 +348,9 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con
                                                 GGML_OP_RMS_NORM,
                                                 GGML_OP_SCALE,
                                                 GGML_OP_SOFT_MAX,
-                                                GGML_OP_SET_ROWS};
+                                                GGML_OP_SET_ROWS,
+                                                GGML_OP_FLASH_ATTN_EXT,
+                                                GGML_OP_CPY};
     static const std::set supported_unary_ops{
         GGML_UNARY_OP_SILU,
     };
diff --git a/ggml/src/ggml-openvino/openvino/op/cont.cpp b/ggml/src/ggml-openvino/openvino/op/cont.cpp
index f83c0e62df..9ae0f420cc 100644
--- a/ggml/src/ggml-openvino/openvino/op/cont.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/cont.cpp
@@ -19,7 +19,7 @@ OutputVector translate_cont(const NodeContext& context) {
     num_inputs_check(context, 1, 1);
     int op_case = context.get_op_case();
-    FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported CONT case");
+    FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3, "Unsupported CONT case");
     auto src_shape = context.get_input_shape(0).to_shape();
     auto dst_shape = context.get_output_shape(0).to_shape();
@@ -32,6 +32,9 @@ OutputVector translate_cont(const NodeContext& context) {
             context.get_input(0),
             ov::op::v0::Constant::create(ov::element::i64, {dst_shape.size()}, dst_shape),
             false);
+    } else if (op_case == 2) {
+        // The input comes from a TRANSPOSE
+        return {context.get_input(0)};
     } else {
         // The input comes from a VIEW
         res = process_view_input(context, 0);
diff --git a/ggml/src/ggml-openvino/openvino/op/cpy.cpp b/ggml/src/ggml-openvino/openvino/op/cpy.cpp
new file mode 100644
index 0000000000..54b49018a9
--- /dev/null
+++ b/ggml/src/ggml-openvino/openvino/op/cpy.cpp
@@ -0,0 +1,20 @@
+#include
+#include
+#include "../node_context.hpp"
+#include "../op_table.hpp"
+#include "../utils.hpp"
+
+namespace ov {
+namespace frontend {
+namespace ggml {
+namespace op {
+
+OutputVector translate_cpy(const NodeContext& context) {
+    auto res = std::make_shared(context.get_input(0), context.get_output_type(0));
+    return rename_outputs_with_suffix({res}, context.get_name());
+}
+
+}  // namespace op
+}  // namespace ggml
+}  // namespace frontend
+}  // namespace ov
diff --git a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp
new file mode 100644
index 0000000000..5c0ad4c20e
--- /dev/null
+++ b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp
@@ -0,0 +1,35 @@
+#include
+#include
+#include
+#include "../node_context.hpp"
+#include "../op_table.hpp"
+#include "../utils.hpp"
+
+namespace ov {
+namespace frontend {
+namespace ggml {
+namespace op {
+
+OutputVector translate_flash_attn_ext(const NodeContext& context) {
+    num_inputs_check(context, 4, 4);
+    auto q_f32 = context.get_input(0);
+    auto k = context.get_input(1);
+    auto v = context.get_input(2);
+    auto mask = context.get_input(3);
+
+    float* params = reinterpret_cast(context.get_output_op_params(0));
+    float scale = params[0];
+    // float max_bias = params[1];
+    // float logit_softcap = params[2];
+
+    auto q = std::make_shared(q_f32, ov::element::f16);
+    auto scale_node = std::make_shared(ov::element::f16, ov::Shape{}, std::vector{scale});
+    auto res = std::make_shared(q, k, v , mask, scale_node, false);
+    auto res_f32 = std::make_shared(res, ov::element::f32);
+    return rename_outputs_with_suffix({res_f32}, context.get_name());
+}
+
+}  // namespace op
+}  // namespace ggml
+}  // namespace frontend
+}  // namespace ov
diff --git a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp
index c97bbbf5a3..36795fd43e 100644
--- a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp
@@ -21,7 +21,6 @@ OutputVector translate_get_rows(const NodeContext& context) {
     num_inputs_check(context, 2, 2);
     int op_case = context.get_op_case();
-    FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported CONT case");
     Output res;
     auto data = context.get_input(0);
diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp
index 9148a27517..150fbcbb88 100644
--- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp
@@ -27,15 +27,26 @@ namespace op {
 OutputVector translate_mulmat(const NodeContext& context) {
     num_inputs_check(context, 2, 2);
+    int op_case = context.get_op_case();
+
     ov::Output res;
     ov::Output B = context.get_input(0);
     ov::Output A = context.get_input(1);
+    bool transpose_b = true;
+    if (op_case == 2) {
+        B = B.get_node_shared_ptr()->input_value(0);
+        transpose_b = false;
+    } else if (op_case == 3) {
+        B = process_view_input(context, 0);
+        A = process_view_input(context, 1);
+    }
+
     bool convert_out_type = false;
     if (ov::op::util::is_constant(B.get_node()) && context.get_input_type(0) != context.get_input_type(1)) {
-        B = std::make_shared(context.get_input(0), context.get_input_type(1));
+        B = std::make_shared(B, context.get_input_type(1));
     } else if (context.get_input_type(0) != context.get_input_type(1)) {
-        A = std::make_shared(context.get_input(1), context.get_input_type(0));
+        A = std::make_shared(A, context.get_input_type(0));
         convert_out_type = true;
     }
@@ -72,10 +83,10 @@ OutputVector translate_mulmat(const NodeContext& context) {
     }
     if (convert_out_type) {
-        auto result_lp = std::make_shared(A, B, false, true);
+        auto result_lp = std::make_shared(A, B, false, transpose_b);
         res = std::make_shared(result_lp, context.get_output_type(0));
     } else {
-        res = std::make_shared(A, B, false, true);
+        res = std::make_shared(A, B, false, transpose_b);
     }
     return rename_outputs_with_suffix({res}, context.get_name());
diff --git a/ggml/src/ggml-openvino/openvino/op/permute.cpp b/ggml/src/ggml-openvino/openvino/op/permute.cpp
index 978b5377fb..fcb091016a 100644
--- a/ggml/src/ggml-openvino/openvino/op/permute.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/permute.cpp
@@ -21,13 +21,12 @@ OutputVector translate_permute(const NodeContext& context) {
     num_inputs_check(context, 1, 1);
     int op_case = context.get_op_case();
-    FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3, "Unsupported CONT case");
+    FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3, "Unsupported PERMUTE case");
     ov::Output res;
     if (op_case == 1) {
-        auto perm = argsort_descend(context.get_output_stride(0));
         res = std::make_shared(context.get_input(0),
-                               ov::op::v0::Constant::create(ov::element::i64, {3}, perm));
+                               ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2}));
     } else {
         auto src = context.get_input(0);
         auto attention_size = context.get_input("attention_size");
diff --git a/ggml/src/ggml-openvino/openvino/op/rope.cpp b/ggml/src/ggml-openvino/openvino/op/rope.cpp
index 7951a1e012..4b1e3b500c 100644
--- a/ggml/src/ggml-openvino/openvino/op/rope.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/rope.cpp
@@ -27,7 +27,6 @@ OutputVector translate_rope(const NodeContext& context) {
     num_inputs_check(context, 2, 3);
     int op_case = context.get_op_case();
-    FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported CONT case");
     ov::Output res;
diff --git a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp
index 758454cd9d..0d94a95e44 100644
--- a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp
@@ -32,21 +32,7 @@ OutputVector translate_set_rows(const NodeContext& context) {
     FRONT_END_OP_CONVERSION_CHECK(dst_shape[0] == 1, "Unsupported shape in SET_ROWS");
     if (context.is_static() && context.is_first_token()) {
-        Output res;
-        if (context.get_op_case() == 2) {
-            res = std::make_shared(
-                data,
-                ov::op::v0::Constant::create(
-                    ov::element::i64,
-                    {3},
-                    {context.get_context_size(), context.get_num_heads_kv(), context.get_head_size()}),
-                false);
-            res = std::make_shared(
-                res, ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 2, 0}));
-        } else {
-            res = data;
-        }
-        return rename_outputs_with_suffix({res}, context.get_name());
+        return rename_outputs_with_suffix({data}, context.get_name());
     }
     auto indices = context.get_input(1);
diff --git a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp b/ggml/src/ggml-openvino/openvino/op/softmax.cpp
similarity index 100%
rename from ggml/src/ggml-openvino/openvino/op/soft_max.cpp
rename to ggml/src/ggml-openvino/openvino/op/softmax.cpp
diff --git a/ggml/src/ggml-openvino/openvino/op/transpose.cpp b/ggml/src/ggml-openvino/openvino/op/transpose.cpp
index b35f1fb861..c585dffa6e 100644
--- a/ggml/src/ggml-openvino/openvino/op/transpose.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/transpose.cpp
@@ -12,9 +12,8 @@ namespace op {
 OutputVector translate_transpose(const NodeContext& context) {
     num_inputs_check(context, 1, 1);
-    auto perm = argsort_descend(context.get_output_stride(0));
     auto res = std::make_shared(context.get_input(0),
-                                ov::op::v0::Constant::create(ov::element::i64, {3}, perm));
+                                ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 2, 1}));
     return rename_outputs_with_suffix({res}, context.get_name());
 }
diff --git a/ggml/src/ggml-openvino/openvino/op_table.cpp b/ggml/src/ggml-openvino/openvino/op_table.cpp
index ce4b01c3b5..ee55f84b96 100644
--- a/ggml/src/ggml-openvino/openvino/op_table.cpp
+++ b/ggml/src/ggml-openvino/openvino/op_table.cpp
@@ -16,25 +16,27 @@ namespace ggml {
 std::unordered_map get_supported_ops() {
     using namespace ov::op;
     return {
-        {"GGML_OP_ADD",        op::translate_1to1_match_2_inputs },
-        {"GGML_OP_ADD1",       op::translate_1to1_match_2_inputs },
-        {"GGML_OP_CONT",       op::translate_cont                },
-        {"GGML_OP_DIV",        op::translate_1to1_match_2_inputs },
-        {"GGML_OP_GET_ROWS",   op::translate_get_rows            },
-        {"GGML_OP_MUL",        op::translate_1to1_match_2_inputs},
-        {"GGML_OP_MUL_MAT",    op::translate_mulmat              },
-        {"GGML_OP_PERMUTE",    op::translate_permute             },
-        {"GGML_OP_RESHAPE",    op::translate_reshape             },
-        {"GGML_OP_RMS_NORM",   op::translate_rms_norm            },
-        {"GGML_OP_ROPE",       op::translate_rope                },
-        {"GGML_OP_SCALE",      op::translate_scale               },
-        {"GGML_OP_SOFT_MAX",   op::translate_soft_max            },
-        {"GGML_OP_SUB",        op::translate_1to1_match_2_inputs},
-        {"GGML_OP_TRANSPOSE",  op::translate_transpose           },
-        {"GGML_UNARY_OP_SILU", op::translate_unary_silu          },
-        {"GGML_OP_VIEW",       op::translate_view                },
-        {"GGML_GLU_OP_SWIGLU", op::translate_glu_swiglu          },
-        {"GGML_OP_SET_ROWS",   op::translate_set_rows            },
+        {"GGML_OP_ADD",            op::translate_1to1_match_2_inputs },
+        {"GGML_OP_ADD1",           op::translate_1to1_match_2_inputs },
+        {"GGML_OP_CONT",           op::translate_cont                },
+        {"GGML_OP_DIV",            op::translate_1to1_match_2_inputs },
+        {"GGML_OP_GET_ROWS",       op::translate_get_rows            },
+        {"GGML_OP_MUL",            op::translate_1to1_match_2_inputs},
+        {"GGML_OP_MUL_MAT",        op::translate_mulmat              },
+        {"GGML_OP_PERMUTE",        op::translate_permute             },
+        {"GGML_OP_RESHAPE",        op::translate_reshape             },
+        {"GGML_OP_RMS_NORM",       op::translate_rms_norm            },
+        {"GGML_OP_ROPE",           op::translate_rope                },
+        {"GGML_OP_SCALE",          op::translate_scale               },
+        {"GGML_OP_SOFT_MAX",       op::translate_soft_max            },
+        {"GGML_OP_SUB",            op::translate_1to1_match_2_inputs},
+        {"GGML_OP_TRANSPOSE",      op::translate_transpose           },
+        {"GGML_UNARY_OP_SILU",     op::translate_unary_silu          },
+        {"GGML_OP_VIEW",           op::translate_view                },
+        {"GGML_GLU_OP_SWIGLU",     op::translate_glu_swiglu          },
+        {"GGML_OP_SET_ROWS",       op::translate_set_rows            },
+        {"GGML_OP_CPY",            op::translate_cpy                 },
{"GGML_OP_FLASH_ATTN_EXT", op::translate_flash_attn_ext }, }; } diff --git a/ggml/src/ggml-openvino/openvino/op_table.hpp b/ggml/src/ggml-openvino/openvino/op_table.hpp index 332930c3ac..faa61f5f6c 100644 --- a/ggml/src/ggml-openvino/openvino/op_table.hpp +++ b/ggml/src/ggml-openvino/openvino/op_table.hpp @@ -26,6 +26,8 @@ GGML_OP_CONVERTER(translate_transpose); GGML_OP_CONVERTER(translate_view); GGML_OP_CONVERTER(translate_glu_swiglu); GGML_OP_CONVERTER(translate_set_rows); +GGML_OP_CONVERTER(translate_cpy); +GGML_OP_CONVERTER(translate_flash_attn_ext); } // namespace op diff --git a/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp b/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp index 1b7ac60271..c36579910d 100644 --- a/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +++ b/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp @@ -40,11 +40,9 @@ FuseToSDPA::FuseToSDPA() { auto mask = pattern_to_output[m_mask]; auto scale = pattern_to_output[m_scale]; - auto v_trans = - register_new_node(v, ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 2, 1})); auto mask_f16 = register_new_node(mask, ov::element::f16); auto scale_f16 = register_new_node(scale, ov::element::f16); - auto sdpa = std::make_shared(q, k, v_trans, mask_f16, scale_f16, false); + auto sdpa = std::make_shared(q, k, v, mask_f16, scale_f16, false); ov::replace_node(m.get_match_root(), sdpa); ov::copy_runtime_info(m.get_matched_nodes(), sdpa); diff --git a/ggml/src/ggml-openvino/openvino/utils.cpp b/ggml/src/ggml-openvino/openvino/utils.cpp index 9634900753..c4197ccc3a 100644 --- a/ggml/src/ggml-openvino/openvino/utils.cpp +++ b/ggml/src/ggml-openvino/openvino/utils.cpp @@ -65,6 +65,7 @@ OutputVector rename_outputs_with_suffix(const OutputVector& outputs, const std:: name += "_"; name += suffix; node->set_friendly_name(name); + // std::cout << name << " " << output.get_partial_shape() << std::endl; } return outputs; } diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 522e922db8..473fa72f99 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -77,8 +78,10 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c bool is_static = device == "NPU" ? 
     bool is_static = device == "NPU" ? true : false;
     ov::AnyMap config;
-    if (device == "NPU") {
-        config = get_npu_config();
+    if (device == "GPU") {
+        config = {
+            {"GPU_ENABLE_SDPA_OPTIMIZATION", "0"}
+        };
     }
     if (is_naive(cgraph)) {
@@ -92,6 +95,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
         core.set_property(ov::cache_dir(cache_dir));
     }
+    static std::mutex cache_mutex;
     static std::unordered_map> infer_request_cache;
     static std::unordered_map> ov_input_names_cache;
     static std::unordered_map> ov_output_names_cache;
@@ -105,89 +109,93 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
     int64_t conversion_end_time;
     int64_t compile_end_time;
-    auto it = infer_request_cache.find(cgraph);
-    if (it != infer_request_cache.end()) {
-        std::map> model_weights;
-        ggml_decoder = std::make_shared(cgraph, model_weights, is_static, false);
-        decoder_end_time = ggml_time_us();
+    {
+        std::lock_guard lock(cache_mutex);
-        // For NPU for the first time we call kvcache modle, pop the compiled kvcache model from cache
-        if (is_static && compiled_model_cache.find(cgraph) != compiled_model_cache.end()) {
-            infer_request_cache[cgraph] =
-                std::make_shared(compiled_model_cache[cgraph].create_infer_request());
-            compiled_model_cache.erase(cgraph);
-        }
-        infer_request = *infer_request_cache[cgraph];
-
-        conversion_end_time = ggml_time_us();
-        compile_end_time = conversion_end_time;
-    } else {
-        std::shared_ptr model;
-        auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);
-
-        if (is_static) {
-            ggml_decoder = std::make_shared(cgraph, model_weights, is_static, true);
-            auto ggml_decoder_kvcache = std::make_shared(cgraph, model_weights, is_static, false);
+        auto it = infer_request_cache.find(cgraph);
+        if (it != infer_request_cache.end()) {
+            std::map> model_weights;
+            ggml_decoder = std::make_shared(cgraph, model_weights, is_static, false);
             decoder_end_time = ggml_time_us();
-            auto input_model = std::make_shared(ggml_decoder);
-            auto input_model_kvcache = std::make_shared(ggml_decoder_kvcache);
-
-            model = ov::frontend::ggml::FrontEnd::convert(input_model);
-            ggml_decoder->clear_model_weights();
-            auto model_kvcache = ov::frontend::ggml::FrontEnd::convert(input_model_kvcache);
-            ggml_decoder_kvcache->clear_model_weights();
-            conversion_end_time = ggml_time_us();
-
-            auto compiled_model = core.compile_model(model, device, config);
-            auto compiled_model_kvcache = core.compile_model(model_kvcache, device, config);
-            compiled_model_cache[cgraph] = compiled_model_kvcache;
-            compile_end_time = ggml_time_us();
-
-            infer_request_cache[cgraph] = std::make_shared(compiled_model.create_infer_request());
-            infer_request = *infer_request_cache[cgraph];
-            compiled_model_cache[cgraph] = compiled_model_kvcache;
-
-            if (getenv("GGML_OPENVINO_DUMP_IR")) {
-                char timestamped_filename[64];
-                auto timestamp = (long long) ggml_time_us();
-                snprintf(timestamped_filename, sizeof(timestamped_filename), "model_prefill_%lld.xml", timestamp);
-                ov::serialize(model, timestamped_filename);
-                snprintf(timestamped_filename, sizeof(timestamped_filename), "model_kvcache_%lld.xml", timestamp);
-                ov::serialize(model_kvcache, timestamped_filename);
+            // For NPU, the first time we call the kvcache model, pop the compiled kvcache model from the cache
+            if (is_static && compiled_model_cache.find(cgraph) != compiled_model_cache.end()) {
+                infer_request_cache[cgraph] =
+                    std::make_shared(compiled_model_cache[cgraph].create_infer_request());
+                compiled_model_cache.erase(cgraph);
             }
+            infer_request = *infer_request_cache[cgraph];
+
+            conversion_end_time = ggml_time_us();
+            compile_end_time = conversion_end_time;
         } else {
+            std::shared_ptr model;
+            auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);
-            ggml_decoder = std::make_shared(cgraph, model_weights, is_static, true);
-            decoder_end_time = ggml_time_us();
+            if (is_static) {
+                ggml_decoder = std::make_shared(cgraph, model_weights, is_static, true);
+                auto ggml_decoder_kvcache = std::make_shared(cgraph, model_weights, is_static, false);
+                decoder_end_time = ggml_time_us();
-            auto input_model = std::make_shared(ggml_decoder);
-            model = ov::frontend::ggml::FrontEnd::convert(input_model);
-            ggml_decoder->clear_model_weights();
-            conversion_end_time = ggml_time_us();
+                auto input_model = std::make_shared(ggml_decoder);
+                auto input_model_kvcache = std::make_shared(ggml_decoder_kvcache);
-            auto compiled_model = core.compile_model(model, device, config);
-            compile_end_time = ggml_time_us();
-            infer_request_cache[cgraph] = std::make_shared(compiled_model.create_infer_request());
-            infer_request = *infer_request_cache[cgraph];
+                model = ov::frontend::ggml::FrontEnd::convert(input_model);
+                ggml_decoder->clear_model_weights();
+                auto model_kvcache = ov::frontend::ggml::FrontEnd::convert(input_model_kvcache);
+                ggml_decoder_kvcache->clear_model_weights();
+                conversion_end_time = ggml_time_us();
-            if (getenv("GGML_OPENVINO_DUMP_IR")) {
-                char timestamped_filename[64];
-                auto timestamp = (long long) ggml_time_us();
-                snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp);
-                ov::serialize(model, timestamped_filename);
+
+                if (getenv("GGML_OPENVINO_DUMP_IR")) {
+                    char timestamped_filename[64];
+                    auto timestamp = (long long) ggml_time_us();
+                    snprintf(timestamped_filename, sizeof(timestamped_filename), "model_prefill_%lld.xml", timestamp);
+                    ov::serialize(model, timestamped_filename);
+                    snprintf(timestamped_filename, sizeof(timestamped_filename), "model_kvcache_%lld.xml", timestamp);
+                    ov::serialize(model_kvcache, timestamped_filename);
+                }
+
+                auto compiled_model = core.compile_model(model, device, get_npu_prefill_config());
+                auto compiled_model_kvcache = core.compile_model(model_kvcache, device, get_npu_generate_config());
+                compiled_model_cache[cgraph] = compiled_model_kvcache;
+                compile_end_time = ggml_time_us();
+
+                infer_request_cache[cgraph] = std::make_shared(compiled_model.create_infer_request());
+                infer_request = *infer_request_cache[cgraph];
+                compiled_model_cache[cgraph] = compiled_model_kvcache;
+            } else {
+                ggml_decoder = std::make_shared(cgraph, model_weights, is_static, true);
+                decoder_end_time = ggml_time_us();
+
+                auto input_model = std::make_shared(ggml_decoder);
+                model = ov::frontend::ggml::FrontEnd::convert(input_model);
+                ggml_decoder->clear_model_weights();
+                conversion_end_time = ggml_time_us();
+
+                if (getenv("GGML_OPENVINO_DUMP_IR")) {
+                    char timestamped_filename[64];
+                    auto timestamp = (long long) ggml_time_us();
+                    snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp);
+                    ov::serialize(model, timestamped_filename);
+                }
+
+                auto compiled_model = core.compile_model(model, device, config);
+                compile_end_time = ggml_time_us();
+                infer_request_cache[cgraph] = std::make_shared(compiled_model.create_infer_request());
+                infer_request = *infer_request_cache[cgraph];
             }
-        std::vector ov_input_names;
-        std::vector ov_output_names;
-        for (const auto& ov_param : model->get_parameters()) {
-            ov_input_names.push_back(ov_param->get_friendly_name());
+            std::vector ov_input_names;
+            std::vector ov_output_names;
+            for (const auto& ov_param : model->get_parameters()) {
+                ov_input_names.push_back(ov_param->get_friendly_name());
+            }
+            for (const auto& ov_output : model->get_results()) {
+                ov_output_names.push_back(ov_output->get_friendly_name());
+            }
+            ov_input_names_cache[cgraph] = ov_input_names;
+            ov_output_names_cache[cgraph] = ov_output_names;
         }
-        for (const auto& ov_output : model->get_results()) {
-            ov_output_names.push_back(ov_output->get_friendly_name());
-        }
-        ov_input_names_cache[cgraph] = ov_input_names;
-        ov_output_names_cache[cgraph] = ov_output_names;
     }
     auto ov_input_names = ov_input_names_cache[cgraph];
@@ -233,21 +241,30 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
     GGML_UNUSED(backend);
 }
-ov::AnyMap get_npu_config() {
+ov::AnyMap get_npu_prefill_config() {
     ov::AnyMap config = {
-        {"NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=ReduceMean"                    },
-        {"NPU_USE_NPUW",                "YES"                                                                },
-        {"NPUW_DEVICES",                "NPU"                                                                },
-        {"NPUW_FOLD",                   "YES"                                                                },
-        {"NPUW_HOST_GATHER",            "YES"                                                                },
-        {"NPUW_DQ",                     "YES"                                                                },
-        {"NPUW_FUNCALL_ASYNC",          "YES"                                                                },
-        {"NPUW_WEIGHTS_BANK",           "shared"                                                             },
-        {"NPUW_CACHE_DIR",              getenv("GGML_OPENVINO_CACHE_DIR") ? getenv("GGML_OPENVINO_CACHE_DIR") : ""},
+        {"NPU_COMPILATION_MODE_PARAMS",        "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add_RMSNorm"},
+        {"NPU_COMPILER_DYNAMIC_QUANTIZATION",  "YES"                                                                   },
+        {"NPU_USE_NPUW",                       "YES"                                                                   },
+        {"NPUW_DEVICES",                       "NPU"                                                                   },
+        {"NPUW_FOLD",                          "YES"                                                                   },
+        {"NPUW_WEIGHTS_BANK",                  "shared"                                                                },
+        {"NPUW_SLICE_OUT",                     "YES"                                                                   },
+        {"NPUW_FUNCALL_ASYNC",                 "YES"                                                                   },
+        {"NPUW_FUNCALL_FOR_ALL",               "YES"                                                                   },
+        {"NPUW_DQ",                            "YES"                                                                   },
+        {"NPUW_DQ_FULL",                       "NO"                                                                    },
+        {"NPUW_CACHE_DIR",                     getenv("GGML_OPENVINO_CACHE_DIR") ? getenv("GGML_OPENVINO_CACHE_DIR") : ""},
     };
     return config;
 }
+ov::AnyMap get_npu_generate_config() {
+    ov::AnyMap config = get_npu_prefill_config();
+    config.emplace("NPUW_UNFOLD_IREQS", "YES");
+    return config;
+}
+
 bool is_naive(struct ggml_cgraph* cgraph) {
     constexpr int naive_graph_size_threshold = 20;
     return cgraph->n_nodes < naive_graph_size_threshold;
@@ -257,9 +274,12 @@ enum ggml_status naive_compute(struct ggml_cgraph* cgraph,
                                ov::Core& core,
                                const std::string& device,
                                const ov::AnyMap& config) {
-    if (cgraph->n_nodes == 1 && cgraph->nodes[0]->op == GGML_OP_NONE) {
+    if (cgraph->n_nodes == 1 && (cgraph->nodes[0]->op == GGML_OP_NONE || cgraph->nodes[0]->op == GGML_OP_VIEW)) {
         return GGML_STATUS_SUCCESS;
     }
+    if (cgraph->nodes[0]->op == GGML_OP_FLASH_ATTN_EXT) {
+        return GGML_STATUS_FAILED;
+    }
     auto decoder = std::make_shared(cgraph);
     auto input_model = std::make_shared(decoder);
diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h
index 0d71963f53..f377fe9d27 100644
--- a/ggml/src/ggml-openvino/utils.h
+++ b/ggml/src/ggml-openvino/utils.h
@@ -40,7 +40,8 @@ void set_zero_diagonal(std::vector& matrix, size_t dim);
 bool is_prefill(struct ggml_cgraph * cgraph);
-ov::AnyMap get_npu_config();
+ov::AnyMap get_npu_prefill_config();
+ov::AnyMap get_npu_generate_config();
 ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, const std::string& param_name);
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index e556a7773b..efb51d23c5 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -228,7 +228,9 @@ if (NOT LLAMA_SANITIZE_ADDRESS AND NOT GGML_SCHED_NO_REALLOC)
     llama_build_and_test(test-opt.cpp)
 endif()
 llama_build_and_test(test-gguf.cpp)
-llama_build_and_test(test-backend-ops.cpp)
+if (NOT GGML_OPENVINO)
+    llama_build_and_test(test-backend-ops.cpp)
+endif()
 llama_build_and_test(test-model-load-cancel.cpp LABEL "model")
 llama_build_and_test(test-autorelease.cpp LABEL "model")