Fix add_sliced_mask; Revert mulmat, softmax; Remove input attention_size, iSWA model not working
parent a9371ea646
commit 8b82d1153b
@@ -73,7 +73,7 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph,
         set_input_output(cur_node);
     }

-    add_extra_inputs();
+    // add_extra_inputs();
 }

 GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph,
@@ -336,9 +336,10 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co

 void GgmlOvDecoder::add_extra_inputs() {
     // Extra inputs:
-    // 1. `attention_size`, used in matmul's in the attention block. The shape of those matmul's are 32 aligned,
+    // 1. `attention_size`, used in FLASH_ATTN where the shape of the matmul's are 256 aligned,
     //    see llama_kv_cache_unified::get_n_kv and llama_kv_cache_unified::get_padding.
-    //    Not used for NPU
+    //    Not used for NPU.
+    //    Update: not used anymore after the optimization of making kvcache dynamic (but breaks iSWA models)
     int64_t attention_size = -1;
     int64_t attention_size_swa = -1;
     for (const auto& node : m_nodes) {
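The padding this comment refers to can be pictured with a small, illustrative helper (not part of the decoder): llama.cpp rounds the reported KV length up to a fixed unit (256 when FLASH_ATTN is used, per the comment above), and that rounded value is what `attention_size` used to carry.

    #include <cstdint>

    // Illustrative only: round the current KV length up to the padding unit,
    // as llama_kv_cache_unified::get_padding implies for the FLASH_ATTN case.
    static int64_t padded_attention_size(int64_t n_kv, int64_t padding = 256) {
        return ((n_kv + padding - 1) / padding) * padding;  // e.g. n_kv = 300 -> 512
    }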
@@ -2,7 +2,6 @@
 #include <openvino/op/broadcast.hpp>
 #include <openvino/op/concat.hpp>
 #include <openvino/op/convert.hpp>
 #include <openvino/op/gather.hpp>
 #include <openvino/op/reshape.hpp>
 #include <openvino/op/scaled_dot_product_attention.hpp>
 #include <openvino/op/transpose.hpp>
@@ -59,23 +59,13 @@ OutputVector translate_mulmat(const NodeContext& context) {
     auto Z_last_two_dims = get_dimensions(Z.get_node_shared_ptr(), {1, 2});

+    auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1});
+    auto Z_unsqueezed = std::make_shared<ov::op::v0::Unsqueeze>(Z, unsqueeze_axes);

     Output<Node> batch_small = A_batch_larger ? B_batch_node : A_batch_node;
     Output<Node> batch_large = A_batch_larger ? A_batch_node : B_batch_node;

-    ov::Output<Node> broadcast_shape;
-    ov::Output<Node> Z_unsqueezed;
-    if (context.is_static()) {
-        auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1});
-        Z_unsqueezed = std::make_shared<ov::op::v0::Unsqueeze>(Z, unsqueeze_axes);
-        broadcast_shape =
-            std::make_shared<ov::op::v0::Concat>(ov::OutputVector{batch_small, factor_node, Z_last_two_dims}, 0);
-    } else {
-        auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {2});
-        Z_unsqueezed = std::make_shared<ov::op::v0::Unsqueeze>(Z, unsqueeze_axes);
-        auto one_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
-        broadcast_shape =
-            std::make_shared<ov::op::v0::Concat>(ov::OutputVector{one_1d, batch_small, factor_node, Z_last_two_dims}, 0);
-    }
+    auto broadcast_shape =
+        std::make_shared<ov::op::v0::Concat>(ov::OutputVector{batch_small, factor_node, Z_last_two_dims}, 0);
     auto Z_broadcasted = std::make_shared<ov::op::v3::Broadcast>(Z_unsqueezed, broadcast_shape);

     auto new_Z_shape = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{batch_large, Z_last_two_dims}, 0);
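For context, the static-path broadcast restored here is the usual grouped-batch trick: the operand with the smaller batch dimension is repeated `factor` times so both matmul inputs share the larger batch. A minimal shape sketch (illustrative names, not the translator's API):

    #include <cstdint>
    #include <vector>

    // Shape walk-through of the Unsqueeze -> Broadcast -> Reshape sequence above,
    // with z_shape = {batch_small, rows, cols} and factor = batch_large / batch_small.
    std::vector<int64_t> broadcast_batch(const std::vector<int64_t>& z_shape, int64_t factor) {
        // Unsqueeze(axis = 1)  : {batch_small, 1, rows, cols}
        // Broadcast            : {batch_small, factor, rows, cols}
        // Reshape (new_Z_shape): {batch_small * factor, rows, cols}
        return {z_shape[0] * factor, z_shape[1], z_shape[2]};
    }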
@@ -40,15 +40,6 @@ OutputVector translate_permute(const NodeContext& context) {
         }
     } else {
         auto src = context.get_input(0);
-        Output<Node> attention_size;
-        if (context.is_static()) {
-            attention_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {INT_MAX});
-        } else if (op_case == 2) {
-            attention_size = context.get_input("attention_size");
-        } else {
-            attention_size = context.get_input("attention_size_swa");
-        }
-
         auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});

         if (context.is_static()) {
@@ -58,9 +49,8 @@ OutputVector translate_permute(const NodeContext& context) {
             src,
             ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{-1, src_shape[1], src_shape[2]}),
             false);
-        auto src_slice = std::make_shared<ov::op::v8::Slice>(src_reshaped, zero, attention_size, one, zero);
-        res = std::make_shared<ov::op::v1::Transpose>(src_slice,
-                                                      ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2}));
+        res = std::make_shared<ov::op::v1::Transpose>(
+            src_reshaped, ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2}));
     } else {
         if (src.get_partial_shape().rank() == 3) {
             src = std::make_shared<ov::op::v0::Unsqueeze>(src, zero);
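The dropped Slice only mattered while the cache view was statically padded; with the dynamic KV cache the first dimension of `src_reshaped` is presumably already the true KV length, so the transpose can act directly. A hypothetical shape helper to illustrate what remains:

    #include <array>
    #include <cstdint>

    // Illustrative: the surviving Transpose({1, 0, 2}) just swaps the first two dims
    // of the reshaped cache view {n_kv, d1, d2}; no trimming to attention_size is needed.
    std::array<int64_t, 3> permuted_shape(const std::array<int64_t, 3>& src_reshaped) {
        return {src_reshaped[1], src_reshaped[0], src_reshaped[2]};
    }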
@@ -7,10 +7,8 @@
 #include <openvino/op/concat.hpp>
 #include <openvino/op/constant.hpp>
 #include <openvino/op/convert.hpp>
 #include <openvino/op/gather.hpp>
 #include <openvino/op/matmul.hpp>
 #include <openvino/op/multiply.hpp>
 #include <openvino/op/unsqueeze.hpp>
 #include <openvino/op/slice.hpp>
 #include <openvino/op/softmax.hpp>
 #include <vector>
@@ -59,20 +57,9 @@ OutputVector translate_soft_max(const NodeContext& context) {
     } else {
         auto token_len = get_dimensions(input_node, {1});
         auto mask_node = context.get_input(1);
-        auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0,0});
-        auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,1});
-        auto zero_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
-        auto two_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
-        auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,2});
-        auto inp_pos = context.get_input("inp_pos");
-        auto shape_of_inp_pos = std::make_shared<ov::op::v3::ShapeOf>(inp_pos);
-        auto gather_inp_pos = std::make_shared<ov::op::v8::Gather>(shape_of_inp_pos, two_1d, zero_1d);
-        auto stop = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{token_len, gather_inp_pos}, 0);
-        mask_node_sliced =
-            std::make_shared<ov::op::v8::Slice>(mask_node, zero_2d, stop, one_2d, axes);
-        if (!(context.is_static())) {
-            mask_node_sliced = std::make_shared<ov::op::v0::Unsqueeze>(mask_node_sliced, zero_1d);
-        }
+        auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
+        auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
+        mask_node_sliced = std::make_shared<ov::op::v8::Slice>(mask_node, zero, token_len, one, one);
     }

     if (mask_node_sliced.get_element_type() != context.get_output_type(0)) {
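The reverted branch slices the mask with a single Slice along axis 1, keeping only the first `token_len` entries of that axis. A rough host-side analogue of `Slice(mask_node, zero, token_len, one, one)`, assuming a {d0, d1, d2} layout:

    #include <cstdint>
    #include <vector>

    // Keep the first `token_len` entries along axis 1 of a {d0, d1, d2} tensor.
    std::vector<float> slice_axis1(const std::vector<float>& t, int64_t d0, int64_t d1, int64_t d2,
                                   int64_t token_len) {
        std::vector<float> out;
        out.reserve(static_cast<size_t>(d0 * token_len * d2));
        for (int64_t i = 0; i < d0; ++i)
            for (int64_t j = 0; j < token_len && j < d1; ++j)
                for (int64_t k = 0; k < d2; ++k)
                    out.push_back(t[(i * d1 + j) * d2 + k]);
        return out;
    }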
@@ -11,14 +11,15 @@
 #include <openvino/op/convert.hpp>
 #include <openvino/op/cos.hpp>
 #include <openvino/op/divide.hpp>
 #include <openvino/op/gather.hpp>
 #include <openvino/op/multiply.hpp>
 #include <openvino/op/parameter.hpp>
 #include <openvino/op/range.hpp>
 #include <openvino/op/reshape.hpp>
 #include <openvino/op/result.hpp>
 #include <openvino/op/sin.hpp>
 #include <openvino/op/slice.hpp>
 #include <openvino/op/squeeze.hpp>
 #include <openvino/op/strided_slice.hpp>
 #include <openvino/op/transpose.hpp>
 #include <openvino/op/unsqueeze.hpp>
 #include <openvino/pass/constant_folding.hpp>
@@ -88,15 +89,27 @@ void add_sliced_mask(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
         if (is_static) {
             mask_sliced = mask;
         } else {
-            auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0,0});
-            auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,1});
+            auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 0});
+            auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 1});
+            auto one_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
             auto zero_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
             auto two_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
-            auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,2});
-            auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr();
-            auto shape_of_inp_pos = std::make_shared<ov::op::v3::ShapeOf>(inp_pos);
-            auto gather_inp_pos = std::make_shared<ov::op::v8::Gather>(shape_of_inp_pos, two_1d, zero_1d);
-            auto stop = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{token_len, gather_inp_pos}, 0);
+            auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 2});
+
+            std::shared_ptr<ov::Node> kv_len;
+            {
+                auto start = ov::op::v0::Constant::create(element::i64, Shape{3}, {0, 0, -1});
+                auto stride = ov::op::v0::Constant::create(element::i64, Shape{3}, {1, 1, 1});
+                auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr();
+                kv_len = std::make_shared<ov::op::v1::StridedSlice>(
+                    inp_pos, start, start, stride, std::vector<int64_t>{0, 0, 0}, std::vector<int64_t>{1, 1, 1});
+            }
+            kv_len = std::make_shared<ov::op::v0::Squeeze>(
+                kv_len, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1}));
+            kv_len = std::make_shared<ov::op::v0::Convert>(kv_len, ov::element::i64);
+            kv_len = std::make_shared<ov::op::v1::Add>(kv_len, one_1d);
+            auto stop = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{token_len, kv_len}, 0);
+
             mask_sliced =
                 std::make_shared<ov::op::v8::Slice>(mask, zero_2d, stop, one_2d, axes);
             mask_sliced = std::make_shared<ov::op::v0::Unsqueeze>(mask_sliced, zero_1d);
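The fix derives the valid KV length from the values in `inp_pos` rather than from its shape: for a decode step `inp_pos` holds the absolute positions of the tokens in the batch, so the last element plus one is the number of cache slots the mask must cover. The StridedSlice/Squeeze/Convert/Add chain above computes exactly that; a plain-C++ illustration (not the translator code):

    #include <cstdint>
    #include <vector>

    // inp_pos for n_past cached tokens and n_tokens new tokens is
    // {n_past, n_past + 1, ..., n_past + n_tokens - 1}; the valid KV length is last + 1.
    int64_t kv_len_from_positions(const std::vector<int64_t>& inp_pos) {
        return inp_pos.empty() ? 0 : inp_pos.back() + 1;
    }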
@@ -108,7 +121,8 @@ void add_sliced_mask(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
     };

     create_sliced_mask("KQ_mask", "KQ_mask_sliced", ggml_model_decoder.is_static());
-    create_sliced_mask("KQ_mask_swa", "KQ_mask_swa_sliced", ggml_model_decoder.is_static());
+    // swa is not working for the `kv_len` is not correct
+    // create_sliced_mask("KQ_mask_swa", "KQ_mask_swa_sliced", ggml_model_decoder.is_static());
 }

 void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
@@ -132,7 +146,7 @@ void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
 // Create common patterns
 void preprocess(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
     add_token_len(tensor_map);
-    // add_sliced_mask(tensor_map, ggml_model_decoder);
+    add_sliced_mask(tensor_map, ggml_model_decoder);
     add_rope_sin_cos(tensor_map, ggml_model_decoder);
 }