From d7cc80229259279b1474f9b66a85489232360728 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 16 May 2025 10:14:05 +0800 Subject: [PATCH] PERF: use Slice+Concat in writing cache_v --- ggml/src/ggml-openvino/openvino/op/cpy.cpp | 62 +++++++++++----------- 1 file changed, 32 insertions(+), 30 deletions(-) diff --git a/ggml/src/ggml-openvino/openvino/op/cpy.cpp b/ggml/src/ggml-openvino/openvino/op/cpy.cpp index 4ab1502f81..0c4a3d1558 100644 --- a/ggml/src/ggml-openvino/openvino/op/cpy.cpp +++ b/ggml/src/ggml-openvino/openvino/op/cpy.cpp @@ -1,13 +1,17 @@ +#include #include #include #include #include +#include #include +#include #include #include #include #include #include +#include #include #include #include @@ -64,42 +68,40 @@ OutputVector translate_cpy(const NodeContext& context) { } else { // Write V to cache_v int64_t total_head_size = src0_shape[1]; + auto total_head_size_node = ov::op::v0::Constant::create(ov::element::i64, {1}, {total_head_size}); - auto reshaped_src0 = std::make_shared( - src0, - ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector{total_head_size, -1}), - false); - auto transposed_src0 = - std::make_shared(reshaped_src0, - ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 0})); + auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + + auto token_len = get_dimensions(src0.get_node_shared_ptr(), {2}); + past_token_len = std::make_shared(past_token_len, zero); + auto total_token_len = std::make_shared(past_token_len, token_len); auto reshaped_src1 = std::make_shared( src1, - ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector{total_head_size, -1}), - false); - auto transposed_src1 = - std::make_shared(reshaped_src1, - ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 0})); - - auto token_len = get_dimensions(src0.get_node_shared_ptr(), {2}); - token_len = std::make_shared(token_len, - ov::op::v0::Constant::create(ov::element::i64, {0}, {}), - false); - auto total_token_len = std::make_shared(past_token_len, token_len); - std::shared_ptr indices = - std::make_shared(past_token_len, total_token_len, one, ov::element::i64); - indices = std::make_shared( - indices, - ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{1})); - - auto res = std::make_shared(transposed_src1, indices, transposed_src0); - auto transposed_res = - std::make_shared(res, ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 0})); - auto reshaped_res = std::make_shared( - transposed_res, ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{1, total_head_size, -1}), false); - return {reshaped_res}; + + auto src1_left = std::make_shared( + reshaped_src1, + ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 0, 0}), + std::make_shared(ov::OutputVector{one, total_head_size_node, past_token_len}, 0), + ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 1, 1})); + + auto src1_right = std::make_shared( + reshaped_src1, + std::make_shared(ov::OutputVector{zero, zero, total_token_len}, 0), + ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{1, total_head_size, INT_MAX}), + ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 1, 1})); + + auto reshaped_src0 = std::make_shared( + src0, + ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{1, total_head_size, -1}), + false); + + auto res = std::make_shared(ov::OutputVector{src1_left, reshaped_src0, src1_right}, 2); + + return {res}; } }