Fix add_sliced_mask; revert mulmat and softmax; remove the attention_size input (iSWA models not working)

Yu, Zijun 2025-10-10 13:17:12 +08:00 committed by Mustafa Cavus
parent a9371ea646
commit 8b82d1153b
6 changed files with 38 additions and 57 deletions

View File

@@ -73,7 +73,7 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph,
set_input_output(cur_node);
}
add_extra_inputs();
// add_extra_inputs();
}
GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph,
@@ -336,9 +336,10 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co
void GgmlOvDecoder::add_extra_inputs() {
// Extra inputs:
// 1. `attention_size`, used in matmul's in the attention block. The shape of those matmul's are 32 aligned,
// 1. `attention_size`, used in FLASH_ATTN, where the matmul shapes are 256-aligned,
// see llama_kv_cache_unified::get_n_kv and llama_kv_cache_unified::get_padding.
// Not used for NPU
// Not used for NPU.
// Update: no longer used after the optimization that makes the KV cache dynamic (but that optimization breaks iSWA models)
int64_t attention_size = -1;
int64_t attention_size_swa = -1;
for (const auto& node : m_nodes) {
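For context, a minimal sketch of the padding the updated comment refers to; this is illustrative only, the actual logic lives in llama_kv_cache_unified::get_n_kv / get_padding in llama.cpp:

#include <cstdint>

// Hypothetical round-up helper: the KV length reported to the graph is padded to an
// alignment unit (256 when flash attention is used), which is why the attention
// matmul shapes come out 256-aligned.
static int64_t padded_attention_size(int64_t n_kv_used, int64_t padding = 256) {
    return ((n_kv_used + padding - 1) / padding) * padding;
}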

View File

@@ -2,7 +2,6 @@
#include <openvino/op/broadcast.hpp>
#include <openvino/op/concat.hpp>
#include <openvino/op/convert.hpp>
#include <openvino/op/gather.hpp>
#include <openvino/op/reshape.hpp>
#include <openvino/op/scaled_dot_product_attention.hpp>
#include <openvino/op/transpose.hpp>

View File

@@ -59,23 +59,13 @@ OutputVector translate_mulmat(const NodeContext& context) {
auto Z_last_two_dims = get_dimensions(Z.get_node_shared_ptr(), {1, 2});
auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1});
auto Z_unsqueezed = std::make_shared<ov::op::v0::Unsqueeze>(Z, unsqueeze_axes);
Output<Node> batch_small = A_batch_larger ? B_batch_node : A_batch_node;
Output<Node> batch_large = A_batch_larger ? A_batch_node : B_batch_node;
ov::Output<Node> broadcast_shape;
ov::Output<Node> Z_unsqueezed;
if (context.is_static()) {
auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1});
Z_unsqueezed = std::make_shared<ov::op::v0::Unsqueeze>(Z, unsqueeze_axes);
broadcast_shape =
std::make_shared<ov::op::v0::Concat>(ov::OutputVector{batch_small, factor_node, Z_last_two_dims}, 0);
} else {
auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {2});
Z_unsqueezed = std::make_shared<ov::op::v0::Unsqueeze>(Z, unsqueeze_axes);
auto one_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
broadcast_shape =
std::make_shared<ov::op::v0::Concat>(ov::OutputVector{one_1d, batch_small, factor_node, Z_last_two_dims}, 0);
}
auto broadcast_shape =
std::make_shared<ov::op::v0::Concat>(ov::OutputVector{batch_small, factor_node, Z_last_two_dims}, 0);
auto Z_broadcasted = std::make_shared<ov::op::v3::Broadcast>(Z_unsqueezed, broadcast_shape);
auto new_Z_shape = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{batch_large, Z_last_two_dims}, 0);
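Shape-wise, the restored single-path broadcast repeats the smaller-batch operand across the head-group factor before the batched MatMul; a hypothetical CPU reference of what the Unsqueeze/Broadcast/Reshape chain achieves (names and layout are illustrative):

#include <cstdint>
#include <vector>

// Z is [batch_small, k, n] flattened; every group of `factor` heads on the larger
// side reuses one head of Z, giving [batch_small * factor, k, n] == [batch_large, k, n].
std::vector<float> repeat_kv_heads(const std::vector<float>& z, int64_t batch_small,
                                   int64_t factor, int64_t k, int64_t n) {
    std::vector<float> out;
    out.reserve(static_cast<size_t>(z.size()) * factor);
    for (int64_t b = 0; b < batch_small; ++b) {
        for (int64_t f = 0; f < factor; ++f) {  // broadcast along the unsqueezed axis 1
            out.insert(out.end(), z.begin() + b * k * n, z.begin() + (b + 1) * k * n);
        }
    }
    return out;  // the final Reshape to [batch_large, k, n] is implicit in this layout
}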

View File

@@ -40,15 +40,6 @@ OutputVector translate_permute(const NodeContext& context) {
}
} else {
auto src = context.get_input(0);
Output<Node> attention_size;
if (context.is_static()) {
attention_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {INT_MAX});
} else if (op_case == 2) {
attention_size = context.get_input("attention_size");
} else {
attention_size = context.get_input("attention_size_swa");
}
auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
if (context.is_static()) {
@@ -58,9 +49,8 @@ OutputVector translate_permute(const NodeContext& context) {
src,
ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{-1, src_shape[1], src_shape[2]}),
false);
auto src_slice = std::make_shared<ov::op::v8::Slice>(src_reshaped, zero, attention_size, one, zero);
res = std::make_shared<ov::op::v1::Transpose>(src_slice,
ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2}));
res = std::make_shared<ov::op::v1::Transpose>(
src_reshaped, ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2}));
} else {
if (src.get_partial_shape().rank() == 3) {
src = std::make_shared<ov::op::v0::Unsqueeze>(src, zero);

View File

@@ -7,10 +7,8 @@
#include <openvino/op/concat.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/convert.hpp>
#include <openvino/op/gather.hpp>
#include <openvino/op/matmul.hpp>
#include <openvino/op/multiply.hpp>
#include <openvino/op/unsqueeze.hpp>
#include <openvino/op/slice.hpp>
#include <openvino/op/softmax.hpp>
#include <vector>
@@ -59,20 +57,9 @@ OutputVector translate_soft_max(const NodeContext& context) {
} else {
auto token_len = get_dimensions(input_node, {1});
auto mask_node = context.get_input(1);
auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0,0});
auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,1});
auto zero_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
auto two_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,2});
auto inp_pos = context.get_input("inp_pos");
auto shape_of_inp_pos = std::make_shared<ov::op::v3::ShapeOf>(inp_pos);
auto gather_inp_pos = std::make_shared<ov::op::v8::Gather>(shape_of_inp_pos, two_1d, zero_1d);
auto stop = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{token_len, gather_inp_pos}, 0);
mask_node_sliced =
std::make_shared<ov::op::v8::Slice>(mask_node, zero_2d, stop, one_2d, axes);
if (!(context.is_static())) {
mask_node_sliced = std::make_shared<ov::op::v0::Unsqueeze>(mask_node_sliced, zero_1d);
}
auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
mask_node_sliced = std::make_shared<ov::op::v8::Slice>(mask_node, zero, token_len, one, one);
}
if (mask_node_sliced.get_element_type() != context.get_output_type(0)) {

View File

@@ -11,14 +11,15 @@
#include <openvino/op/convert.hpp>
#include <openvino/op/cos.hpp>
#include <openvino/op/divide.hpp>
#include <openvino/op/gather.hpp>
#include <openvino/op/multiply.hpp>
#include <openvino/op/parameter.hpp>
#include <openvino/op/range.hpp>
#include <openvino/op/reshape.hpp>
#include <openvino/op/result.hpp>
#include <openvino/op/sin.hpp>
#include <openvino/op/slice.hpp>
#include <openvino/op/squeeze.hpp>
#include <openvino/op/strided_slice.hpp>
#include <openvino/op/transpose.hpp>
#include <openvino/op/unsqueeze.hpp>
#include <openvino/pass/constant_folding.hpp>
@@ -88,15 +89,27 @@ void add_sliced_mask(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
if (is_static) {
mask_sliced = mask;
} else {
auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0,0});
auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,1});
auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 0});
auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 1});
auto one_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
auto zero_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
auto two_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,2});
auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr();
auto shape_of_inp_pos = std::make_shared<ov::op::v3::ShapeOf>(inp_pos);
auto gather_inp_pos = std::make_shared<ov::op::v8::Gather>(shape_of_inp_pos, two_1d, zero_1d);
auto stop = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{token_len, gather_inp_pos}, 0);
auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 2});
std::shared_ptr<ov::Node> kv_len;
{
auto start = ov::op::v0::Constant::create(element::i64, Shape{3}, {0, 0, -1});
auto stride = ov::op::v0::Constant::create(element::i64, Shape{3}, {1, 1, 1});
auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr();
kv_len = std::make_shared<ov::op::v1::StridedSlice>(
inp_pos, start, start, stride, std::vector<int64_t>{0, 0, 0}, std::vector<int64_t>{1, 1, 1});
}
kv_len = std::make_shared<ov::op::v0::Squeeze>(
kv_len, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1}));
kv_len = std::make_shared<ov::op::v0::Convert>(kv_len, ov::element::i64);
kv_len = std::make_shared<ov::op::v1::Add>(kv_len, one_1d);
auto stop = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{token_len, kv_len}, 0);
mask_sliced =
std::make_shared<ov::op::v8::Slice>(mask, zero_2d, stop, one_2d, axes);
mask_sliced = std::make_shared<ov::op::v0::Unsqueeze>(mask_sliced, zero_1d);
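In scalar terms, the StridedSlice/Squeeze/Convert/Add chain above reads the last entry of inp_pos and adds one; a minimal standalone sketch of the same arithmetic, assuming inp_pos holds the absolute positions of the tokens in the current batch:

#include <cstdint>
#include <vector>

// Hypothetical reference: the number of valid KV cells is the last token position plus one.
int64_t compute_kv_len(const std::vector<int32_t>& inp_pos) {
    return static_cast<int64_t>(inp_pos.back()) + 1;  // matches the Add(kv_len, one_1d)
}
// The mask is then sliced to [token_len, kv_len] along axes {1, 2} and the leading
// dim is restored with Unsqueeze.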
@@ -108,7 +121,8 @@ void add_sliced_mask(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
};
create_sliced_mask("KQ_mask", "KQ_mask_sliced", ggml_model_decoder.is_static());
create_sliced_mask("KQ_mask_swa", "KQ_mask_swa_sliced", ggml_model_decoder.is_static());
// SWA is not working because the `kv_len` computed above is not correct for it
// create_sliced_mask("KQ_mask_swa", "KQ_mask_swa_sliced", ggml_model_decoder.is_static());
}
void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
@@ -132,7 +146,7 @@ void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
// Create common patterns
void preprocess(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
add_token_len(tensor_map);
// add_sliced_mask(tensor_map, ggml_model_decoder);
add_sliced_mask(tensor_map, ggml_model_decoder);
add_rope_sin_cos(tensor_map, ggml_model_decoder);
}