diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index 4a45aa2140..b731b26a9a 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -118,6 +118,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) {
         }
         auto param_node = std::make_shared<ov::op::v0::Parameter>(get_ov_type(src), get_graph_input_shape(src));
         param_node->set_friendly_name(src_name);
+        param_node->output(0).get_tensor().set_names({src_name});
         m_model_inputs[src_name] = param_node;
     }
 }
@@ -262,6 +263,7 @@ void GgmlOvDecoder::add_extra_inputs() {
         std::string name = "past_token_len";
         auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
         param_node->set_friendly_name(name);
+        param_node->output(0).get_tensor().set_names({name});
         m_model_extra_inputs[name] = param_node;
 
         auto tensor = std::make_shared<ov::Tensor>(ov::element::i64, ov::Shape{1});
@@ -280,6 +282,7 @@ void GgmlOvDecoder::add_extra_inputs() {
         std::string name = "attention_size";
         auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
         param_node->set_friendly_name(name);
+        param_node->output(0).get_tensor().set_names({name});
         m_model_extra_inputs[name] = param_node;
 
         auto tensor = std::make_shared<ov::Tensor>(ov::element::i64, ov::Shape{1});
diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp
index cd027d2894..1394989395 100644
--- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp
@@ -3,6 +3,7 @@
 #include <cstdint>
 #include <memory>
 #include <openvino/op/add.hpp>
+#include <openvino/op/broadcast.hpp>
 #include <openvino/op/concat.hpp>
 #include <openvino/op/constant.hpp>
 #include <openvino/op/convert.hpp>
@@ -10,6 +11,7 @@
 #include <openvino/op/reshape.hpp>
 #include <openvino/op/transpose.hpp>
+#include <openvino/op/unsqueeze.hpp>
 #include <vector>
 
 #include "../node_context.hpp"
@@ -45,16 +47,20 @@ OutputVector translate_mulmat(const NodeContext& context) {
         auto num_heads_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{num_heads});
         auto num_heads_kv_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{num_heads_kv});
+        auto factor_node =
+            ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{kv_num_heads_factor});
 
         auto B_shape_last_two = get_dimensions(B.get_node_shared_ptr(), {1, 2});
-        auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
-        std::shared_ptr<ov::Node> new_B_shape =
-            std::make_shared<ov::op::v0::Concat>(ov::OutputVector{num_heads_kv_node, one, B_shape_last_two}, 0);
-        B = std::make_shared<ov::op::v1::Reshape>(B, new_B_shape, false);
+        auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1});
+        auto B_unsqueezed = std::make_shared<ov::op::v0::Unsqueeze>(B, unsqueeze_axes);
 
-        B = std::make_shared<ov::op::v0::Concat>(ov::OutputVector(kv_num_heads_factor, B), 1);
-        new_B_shape = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{num_heads_node, B_shape_last_two}, 0);
-        B = std::make_shared<ov::op::v1::Reshape>(B, new_B_shape, false);
+        auto broadcast_shape = std::make_shared<ov::op::v0::Concat>(
+            ov::OutputVector{num_heads_kv_node, factor_node, B_shape_last_two}, 0);
+        auto B_broadcasted = std::make_shared<ov::op::v3::Broadcast>(B_unsqueezed, broadcast_shape);
+
+        auto new_B_shape =
+            std::make_shared<ov::op::v0::Concat>(ov::OutputVector{num_heads_node, B_shape_last_two}, 0);
+        B = std::make_shared<ov::op::v1::Reshape>(B_broadcasted, new_B_shape, false);
     }
 
     auto result_lp = std::make_shared<ov::op::v0::MatMul>(A, B, false, true);