From 8b82d1153bdc81905ec40f0bf09db090bb897358 Mon Sep 17 00:00:00 2001
From: "Yu, Zijun"
Date: Fri, 10 Oct 2025 13:17:12 +0800
Subject: [PATCH] Fix add_sliced_mask; Revert mulmat, softmax; Remove input
 attention_size; iSWA models not working

---
 ggml/src/ggml-openvino/ggml-decoder.cpp        |  7 ++--
 .../openvino/op/flash_attn_ext.cpp             |  1 -
 ggml/src/ggml-openvino/openvino/op/mulmat.cpp  | 20 +++--------
 .../src/ggml-openvino/openvino/op/permute.cpp  | 14 ++------
 .../src/ggml-openvino/openvino/op/softmax.cpp  | 19 ++---------
 .../openvino/translate_session.cpp             | 34 +++++++++++++------
 6 files changed, 38 insertions(+), 57 deletions(-)

diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index 0000319f63..7c6bfe7ee7 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -73,7 +73,7 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph,
         set_input_output(cur_node);
     }
 
-    add_extra_inputs();
+    // add_extra_inputs();
 }
 
 GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph,
@@ -336,9 +336,10 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co
 
 void GgmlOvDecoder::add_extra_inputs() {
     // Extra inputs:
-    // 1. `attention_size`, used in matmul's in the attention block. The shape of those matmul's are 32 aligned,
+    // 1. `attention_size`, used in FLASH_ATTN, where the shapes of the matmuls are 256-aligned,
     //    see llama_kv_cache_unified::get_n_kv and llama_kv_cache_unified::get_padding.
-    //    Not used for NPU
+    //    Not used for NPU.
+    //    Update: no longer used after the optimization that makes the KV cache dynamic (which, however, breaks iSWA models).
     int64_t attention_size = -1;
     int64_t attention_size_swa = -1;
     for (const auto& node : m_nodes) {
diff --git a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp
index c07a7ccb16..9845fe0a02 100644
--- a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp
@@ -2,7 +2,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp
index 3a1ca34166..b4103378eb 100644
--- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp
@@ -59,23 +59,13 @@ OutputVector translate_mulmat(const NodeContext& context) {
 
         auto Z_last_two_dims = get_dimensions(Z.get_node_shared_ptr(), {1, 2});
 
+        auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1});
+        auto Z_unsqueezed = std::make_shared(Z, unsqueeze_axes);
+
         Output batch_small = A_batch_larger ? B_batch_node : A_batch_node;
         Output batch_large = A_batch_larger ? A_batch_node : B_batch_node;
-
-        ov::Output broadcast_shape;
-        ov::Output Z_unsqueezed;
-        if (context.is_static()) {
-            auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1});
-            Z_unsqueezed = std::make_shared(Z, unsqueeze_axes);
-            broadcast_shape =
-                std::make_shared(ov::OutputVector{batch_small, factor_node, Z_last_two_dims}, 0);
-        } else {
-            auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {2});
-            Z_unsqueezed = std::make_shared(Z, unsqueeze_axes);
-            auto one_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
-            broadcast_shape =
-                std::make_shared(ov::OutputVector{one_1d, batch_small, factor_node, Z_last_two_dims}, 0);
-        }
+        auto broadcast_shape =
+            std::make_shared(ov::OutputVector{batch_small, factor_node, Z_last_two_dims}, 0);
 
         auto Z_broadcasted = std::make_shared(Z_unsqueezed, broadcast_shape);
         auto new_Z_shape = std::make_shared(ov::OutputVector{batch_large, Z_last_two_dims}, 0);
diff --git a/ggml/src/ggml-openvino/openvino/op/permute.cpp b/ggml/src/ggml-openvino/openvino/op/permute.cpp
index ea5e417965..5f86f47c1c 100644
--- a/ggml/src/ggml-openvino/openvino/op/permute.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/permute.cpp
@@ -40,15 +40,6 @@ OutputVector translate_permute(const NodeContext& context) {
         }
     } else {
         auto src = context.get_input(0);
-        Output attention_size;
-        if (context.is_static()) {
-            attention_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {INT_MAX});
-        } else if (op_case == 2) {
-            attention_size = context.get_input("attention_size");
-        } else {
-            attention_size = context.get_input("attention_size_swa");
-        }
-
         auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
 
         if (context.is_static()) {
@@ -58,9 +49,8 @@ OutputVector translate_permute(const NodeContext& context) {
                 src,
                 ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{-1, src_shape[1], src_shape[2]}),
                 false);
-            auto src_slice = std::make_shared(src_reshaped, zero, attention_size, one, zero);
-            res = std::make_shared(src_slice,
-                                   ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2}));
+            res = std::make_shared(
+                src_reshaped, ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2}));
         } else {
             if (src.get_partial_shape().rank() == 3) {
                 src = std::make_shared(src, zero);
diff --git a/ggml/src/ggml-openvino/openvino/op/softmax.cpp b/ggml/src/ggml-openvino/openvino/op/softmax.cpp
index 12db9e82a0..1aa3bf76a0 100644
--- a/ggml/src/ggml-openvino/openvino/op/softmax.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/softmax.cpp
@@ -7,10 +7,8 @@
 #include
 #include
 #include
-#include
 #include
 #include
-#include
 #include
 #include
 #include
@@ -59,20 +57,9 @@ OutputVector translate_soft_max(const NodeContext& context) {
     } else {
         auto token_len = get_dimensions(input_node, {1});
         auto mask_node = context.get_input(1);
-        auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0,0});
-        auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,1});
-        auto zero_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
-        auto two_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
-        auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,2});
-        auto inp_pos = context.get_input("inp_pos");
-        auto shape_of_inp_pos = std::make_shared(inp_pos);
-        auto gather_inp_pos = std::make_shared(shape_of_inp_pos, two_1d, zero_1d);
-        auto stop = std::make_shared(ov::OutputVector{token_len, gather_inp_pos}, 0);
-        mask_node_sliced =
-            std::make_shared(mask_node, zero_2d, stop, one_2d, axes);
-        if (!(context.is_static())) {
-            mask_node_sliced = std::make_shared(mask_node_sliced, zero_1d);
-        }
+        auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
+        auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
+        mask_node_sliced = std::make_shared(mask_node, zero, token_len, one, one);
     }
 
     if (mask_node_sliced.get_element_type() != context.get_output_type(0)) {
diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp
index 0b16c06fd0..e35599084e 100644
--- a/ggml/src/ggml-openvino/openvino/translate_session.cpp
+++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp
@@ -11,14 +11,15 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
 #include
 #include
 #include
+#include
 #include
+#include
 #include
 #include
 #include
@@ -88,15 +89,27 @@ void add_sliced_mask(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
         if (is_static) {
             mask_sliced = mask;
         } else {
-            auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0,0});
-            auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,1});
+            auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 0});
+            auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 1});
+            auto one_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
             auto zero_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
             auto two_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
-            auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,2});
-            auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr();
-            auto shape_of_inp_pos = std::make_shared(inp_pos);
-            auto gather_inp_pos = std::make_shared(shape_of_inp_pos, two_1d, zero_1d);
-            auto stop = std::make_shared(ov::OutputVector{token_len, gather_inp_pos}, 0);
+            auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 2});
+
+            std::shared_ptr kv_len;
+            {
+                auto start = ov::op::v0::Constant::create(element::i64, Shape{3}, {0, 0, -1});
+                auto stride = ov::op::v0::Constant::create(element::i64, Shape{3}, {1, 1, 1});
+                auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr();
+                kv_len = std::make_shared(
+                    inp_pos, start, start, stride, std::vector{0, 0, 0}, std::vector{1, 1, 1});
+            }
+            kv_len = std::make_shared(
+                kv_len, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1}));
+            kv_len = std::make_shared(kv_len, ov::element::i64);
+            kv_len = std::make_shared(kv_len, one_1d);
+            auto stop = std::make_shared(ov::OutputVector{token_len, kv_len}, 0);
+
             mask_sliced = std::make_shared(mask, zero_2d, stop, one_2d, axes);
             mask_sliced = std::make_shared(mask_sliced, zero_1d);
 
@@ -108,7 +121,8 @@ void add_sliced_mask(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
     };
 
     create_sliced_mask("KQ_mask", "KQ_mask_sliced", ggml_model_decoder.is_static());
-    create_sliced_mask("KQ_mask_swa", "KQ_mask_swa_sliced", ggml_model_decoder.is_static());
+    // SWA is not working because the computed `kv_len` is not correct
+    // create_sliced_mask("KQ_mask_swa", "KQ_mask_swa_sliced", ggml_model_decoder.is_static());
 }
 
 void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
@@ -132,7 +146,7 @@ void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
 
 // Create common patterns
 void preprocess(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
     add_token_len(tensor_map);
-    // add_sliced_mask(tensor_map, ggml_model_decoder);
+    add_sliced_mask(tensor_map, ggml_model_decoder);
     add_rope_sin_cos(tensor_map, ggml_model_decoder);
 }