From 592d7f8bbb537060ef8861328ebf8e10f804f60c Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Mon, 16 Jun 2025 11:46:40 +0800 Subject: [PATCH] Change due to ggml cgraph changes, llama-3.2 CPU work --- ggml/src/ggml-openvino/ggml-decoder.cpp | 9 ++- ggml/src/ggml-openvino/openvino/op/mulmat.cpp | 69 +------------------ .../src/ggml-openvino/openvino/op/permute.cpp | 53 +++++++++++++- ggml/src/ggml-openvino/utils.cpp | 1 + 4 files changed, 60 insertions(+), 72 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 2a95c894f4..7b4456c8d0 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -195,10 +195,15 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) { break; } case GGML_OP_PERMUTE: { - if (ggml_is_contiguous(node->src[0])) { + if (node->src[0]->view_src == nullptr) { + // Permute Qcur m_op_case = 1; - } else { + } else if (ggml_is_contiguous(node->src[0])) { + // Permute cache_k (view) m_op_case = 2; + } else { + // Permute cache_v (view) + m_op_case = 3; } break; } diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp index 728ee5cb5f..b94f327a1f 100644 --- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp @@ -34,75 +34,10 @@ OutputVector translate_mulmat(const NodeContext& context) { auto result_lp = std::make_shared(src1, src0, false, true); res = std::make_shared(result_lp, context.get_output_type(0)); } else { - /* - Two cases here: - - 21: [ 96, 32, 32, 1] VIEW k-0 [ 2, 6144, 192, 6144] - [ 196608, 1, 1, 1] 0: NONE cache_k_l0 [ 2, 393216, 393216, 393216] - - 22: [ 96, 7, 32, 1] PERMUTE q-0 [ 4, 12288, 384, 86016] - [ 96, 32, 7, 1] 0: SCALE Qcur-0 [ 4, 384, 12288, 86016] - - 23: [ 32, 7, 32, 1] MUL_MAT kq-0 [ 4, 128, 896, 28672] - [ 96, 32, 32, 1] 0: VIEW k-0 [ 2, 6144, 192, 6144] - [ 96, 7, 32, 1] 1: PERMUTE q-0 [ 4, 12288, 384, 86016] + ov::Output B = context.get_input(0); + ov::Output A = std::make_shared(context.get_input(1), context.get_input_type(0)); - - 20: [ 32, 96, 32, 1] VIEW v-0 [ 2, 128, 12288, 393216] - [ 196608, 1, 1, 1] 0: NONE cache_v_l0 [ 2, 393216, 393216, 393216] - - 25: [ 96, 7, 32, 1] MUL_MAT kqv-0 [ 4, 384, 2688, 86016] - [ 32, 96, 32, 1] 0: VIEW v-0 [ 2, 128, 12288, 393216] - [ 32, 7, 32, 1] 1: SOFT_MAX kq_soft_max_ext-0 [ 4, 128, 896, 28672] - - For case 1, for src0, Reshape + Slice + Transpose - For case 2, for src0, Reshape + Slice - */ - ov::Output A; - ov::Output B; - - auto src0 = context.get_input(0); auto src0_shape = context.get_input_shape(0).to_shape(); - auto src0_stride = context.get_input_stride(0); - auto token_dim = op_case == 2 ? 0 : 2; - - auto attention_size = context.get_input("attention_size"); - - auto src0_perm = argsort_descend(src0_stride); - auto src0_original_shape_ = permute(src0_shape, src0_perm); - std::vector src0_original_shape(src0_original_shape_.begin(), src0_original_shape_.end()); - - if (context.is_static()) { - attention_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {INT_MAX}); - } - src0_original_shape[token_dim] = -1; - - auto src0_slice_shape = src0_original_shape; - src0_slice_shape.erase(src0_slice_shape.begin() + token_dim); - - auto src0_reshape_shape = - ov::op::v0::Constant::create(ov::element::i64, {src0_original_shape.size()}, src0_original_shape); - auto src0_reshape = std::make_shared(src0, src0_reshape_shape, false); - - std::shared_ptr slice_end; - if (op_case == 2) { - slice_end = std::make_shared( - ov::OutputVector{attention_size, ov::op::v0::Constant::create(ov::element::i64, {2}, src0_slice_shape)}, - 0); - } else { - slice_end = std::make_shared( - ov::OutputVector{ov::op::v0::Constant::create(ov::element::i64, {2}, src0_slice_shape), attention_size}, - 0); - } - auto slice_start = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector(3, 0)); - auto slice_step = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector(3, 1)); - auto src0_slice = std::make_shared(src0_reshape, slice_start, slice_end, slice_step); - - if (op_case == 2) { - B = std::make_shared( - src0_slice, - ov::op::v0::Constant::create(ov::element::i64, {src0_perm.size()}, src0_perm)); - } else { - B = src0_slice; - } - - A = std::make_shared(context.get_input(1), context.get_input_type(0)); - int64_t num_heads = context.get_input_shape(1).to_shape()[0]; int64_t num_heads_kv = src0_shape[0]; int64_t kv_num_heads_factor = num_heads / num_heads_kv; diff --git a/ggml/src/ggml-openvino/openvino/op/permute.cpp b/ggml/src/ggml-openvino/openvino/op/permute.cpp index 8e91b61201..8b246f75cd 100644 --- a/ggml/src/ggml-openvino/openvino/op/permute.cpp +++ b/ggml/src/ggml-openvino/openvino/op/permute.cpp @@ -1,4 +1,11 @@ +#include +#include +#include +#include +#include #include +#include +#include #include #include "../node_context.hpp" @@ -13,7 +20,7 @@ OutputVector translate_permute(const NodeContext& context) { num_inputs_check(context, 1, 1); int op_case = context.get_op_case(); - FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported CONT case"); + FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3, "Unsupported CONT case"); ov::Output res; if (op_case == 1) { @@ -22,8 +29,48 @@ OutputVector translate_permute(const NodeContext& context) { ov::op::v0::Constant::create(ov::element::i64, {3}, perm)); return rename_outputs_with_suffix({res}, context.get_name()); } else { - auto res = context.get_input(0); - return {res}; + auto src = context.get_input(0); + auto attention_size = context.get_input("attention_size"); + if (context.is_static()) { + attention_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {INT_MAX}); + } + + auto src_shape_ = context.get_input_shape(0).to_shape(); + std::vector src_shape(src_shape_.begin(), src_shape_.end()); + + std::shared_ptr src_reshaped; + if (op_case == 2) { + src_reshaped = std::make_shared( + src, + ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{-1, src_shape[1], src_shape[2]}), + false); + } else { + src_reshaped = std::make_shared( + src, + ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{src_shape[1], src_shape[0], -1}), + false); + } + + auto slice_start = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector(3, 0)); + auto slice_step = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector(3, 1)); + std::shared_ptr slice_end; + if (op_case == 2) { + slice_end = std::make_shared( + ov::OutputVector{attention_size, ov::op::v0::Constant::create(ov::element::i64, {2}, {src_shape[1], src_shape[2]})}, + 0); + } else { + slice_end = std::make_shared( + ov::OutputVector{ov::op::v0::Constant::create(ov::element::i64, {2}, {src_shape[1], src_shape[0]}), attention_size}, + 0); + } + auto src_slice = std::make_shared(src_reshaped, slice_start, slice_end, slice_step); + + if (op_case == 2) { + res = std::make_shared(src_slice, ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); + } else { + res = src_slice; + } + return rename_outputs_with_suffix({res}, context.get_name()); } } diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index fe46b8a794..44356209ce 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -262,6 +262,7 @@ size_t checksum(const void* data, size_t size) { const uint8_t* bytes = static_cast(data); size_t sum = 0; for (size_t i = 0; i < size; ++i) { + sum += (uint8_t)i; sum += bytes[i]; } return sum;