From f37fa21a5cf7b6196a0384b052cee22fb28d2a22 Mon Sep 17 00:00:00 2001
From: zhanmyz
Date: Thu, 6 Mar 2025 01:38:01 +0800
Subject: [PATCH] Change the input and output node shapes of the MUL_MAT operator

---
 ggml/src/ggml-openvino.cpp | 201 ++++++++++++++++++++-----------------
 1 file changed, 111 insertions(+), 90 deletions(-)

diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp
index 034bd698c3..afd616a338 100644
--- a/ggml/src/ggml-openvino.cpp
+++ b/ggml/src/ggml-openvino.cpp
@@ -458,68 +458,72 @@ void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) {
     const ggml_tensor * src1 = dst->src[1]; // src1 type F32
 
     if(!ggml_is_contiguous(src1) || dst->src[1]->ne[0] * dst->src[1]->nb[0] != dst->src[1]->nb[1]) {
-        int valid_cols_src0 = dst->src[0]->ne[0];
-        int num_rows_src0 = dst->src[0]->ne[1];
-        int batch_src0 = dst->src[0]->ne[2];
-        int valid_cols_src1 = dst->src[1]->ne[0];
-        int num_rows_src1 = dst->src[1]->ne[1];
-        int batch_src1 = dst->src[1]->ne[2];
-        int row_stride_src0 = dst->src[0]->nb[1] / dst->src[0]->nb[0];
-        int batch_stride_src0 = dst->src[0]->nb[2] / dst->src[0]->nb[0];
+        int valid_cols_src0 = src0->ne[0]; // 96
+        int num_rows_src0 = src0->ne[1];   // 32
+        int batch_src0 = src0->ne[2];      // 32
 
-        int row_stride_src1 = dst->src[1]->nb[1] / dst->src[1]->nb[0];
-        int batch_stride_src1 = dst->src[1]->nb[2] / dst->src[1]->nb[0];
+        int valid_cols_src1 = src1->ne[0]; // 96
+        int num_rows_src1 = src1->ne[1];   // 7
+        int batch_src1 = src1->ne[2];      // 32
+
+        // For src0: row_stride = nb[1] / nb[0]
+        int row_stride_src0 = src0->nb[1] / src0->nb[0];   // 6144 / 2 = 3072
+        int batch_stride_src0 = src0->nb[2] / src0->nb[0]; // 192 / 2 = 96
+
+        // For src1: row_stride = nb[1] / nb[0]
+        int row_stride_src1 = src1->nb[1] / src1->nb[0];   // 12288 / 4 = 3072
+        int batch_stride_src1 = src1->nb[2] / src1->nb[0]; // 384 / 4 = 96
 
         std::vector<int64_t> indices_src0 = build_indices(valid_cols_src0, num_rows_src0, batch_src0, row_stride_src0, batch_stride_src0);
         std::vector<int64_t> indices_src1 = build_indices(valid_cols_src1, num_rows_src1, batch_src1, row_stride_src1, batch_stride_src1);
 
-        // Total number of elements
         size_t total_src0 = indices_src0.size(); // = 96 * 32 * 32
         size_t total_src1 = indices_src1.size(); // = 96 * 7 * 32
 
-        // Treat src0->data and src1->data as 1D tensors
-        // Note: The total length of physical data should be enough to cover the last valid element index + 1.
-        // flat shapes:
+        ov::Shape orig_shape_src0 = { static_cast<size_t>(src0->ne[0]),
+                                      static_cast<size_t>(src0->ne[1]),
+                                      static_cast<size_t>(src0->ne[2]),
+                                      static_cast<size_t>(src0->ne[3]) };
+        ov::Shape orig_shape_src1 = { static_cast<size_t>(src1->ne[0]),
+                                      static_cast<size_t>(src1->ne[1]),
+                                      static_cast<size_t>(src1->ne[2]),
+                                      static_cast<size_t>(src1->ne[3]) };
+
+        auto param_src0 = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, orig_shape_src0);
+        auto param_src1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, orig_shape_src1);
+
         ov::Shape flat_shape_src0 = { total_src0 };
         ov::Shape flat_shape_src1 = { total_src1 };
-        // Same as above
-        // ov::Shape flat_shape_src0 = { ggml_nelements(src0) };
-        // ov::Shape flat_shape_src1 = { ggml_nelements(src1) };
-        // Create a Parameter node for collecting non-continuous data
-        auto param_src0 = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, flat_shape_src0);
-        auto param_src1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, flat_shape_src1);
+        auto flatten_src0 = std::make_shared<ov::op::v1::Reshape>(
+            param_src0,
+            ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{ static_cast<int64_t>(total_src0) }),
+            false);
+        auto flatten_src1 = std::make_shared<ov::op::v1::Reshape>(
+            param_src1,
+            ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{ static_cast<int64_t>(total_src1) }),
+            false);
 
-        // Create an index Constant node
         auto indices_const_src0 = ov::op::v0::Constant::create(ov::element::i64, flat_shape_src0, indices_src0);
         auto indices_const_src1 = ov::op::v0::Constant::create(ov::element::i64, flat_shape_src1, indices_src1);
-
-        // Use the Gather operator to collect valid data
-        // axis = 0
         auto axis_const = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
-        auto gathered_src0 = std::make_shared<ov::op::v8::Gather>(param_src0, indices_const_src0, axis_const);
-        auto gathered_src1 = std::make_shared<ov::op::v8::Gather>(param_src1, indices_const_src1, axis_const);
-        // Reshape to batched form:
-        // For src0: valid matrix size for each batch [num_rows_src0, valid_cols_src0] = [32,96], total batches = 32,
-        // Therefore, reshape to 3D Tensor: shape = [32, 32, 96] where first dimension is batch.
+        auto gathered_src0 = std::make_shared<ov::op::v8::Gather>(flatten_src0, indices_const_src0, axis_const);
+        auto gathered_src1 = std::make_shared<ov::op::v8::Gather>(flatten_src1, indices_const_src1, axis_const);
+
         std::vector<int64_t> shape_src0_cont = { batch_src0, num_rows_src0, valid_cols_src0 };
         auto reshape_src0 = std::make_shared<ov::op::v1::Reshape>(
             gathered_src0,
             ov::op::v0::Constant::create(ov::element::i64, { shape_src0_cont.size() }, shape_src0_cont),
             false);
-        // For src1: valid matrix size for each batch [num_rows_src1, valid_cols_src1] = [7,96], batch = 32,
-        // Reshape to 3D Tensor: shape = [32, 7, 96].
+
         std::vector<int64_t> shape_src1_cont = { batch_src1, num_rows_src1, valid_cols_src1 };
         auto reshape_src1 = std::make_shared<ov::op::v1::Reshape>(
             gathered_src1,
             ov::op::v0::Constant::create(ov::element::i64, { shape_src1_cont.size() }, shape_src1_cont),
             false);
-        // For src0, first Convert from F16 to F32
         auto src0_f32 = std::make_shared<ov::op::v0::Convert>(reshape_src0, ov::element::f32);
-
-        // Use Batched Transpose: swap the last two dimensions, dimension order [0, 2, 1]
         auto transpose_order = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{0, 2, 1});
         auto src0_transposed = std::make_shared<ov::op::v1::Transpose>(src0_f32, transpose_order);
@@ -527,89 +531,105 @@ void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) {
         auto B = reshape_src1;
         auto batched_matmul = std::make_shared<ov::op::v0::MatMul>(B, A, false, false);
-        // batched_matmul output: shape = [32,7,32]
+        auto model = std::make_shared<ov::Model>(ov::NodeVector{ batched_matmul },
+                                                 ov::ParameterVector{ param_src0, param_src1 });
 
-        auto model = std::make_shared<ov::Model>(ov::NodeVector{ batched_matmul }, ov::ParameterVector{param_src0, param_src1});
+        ov::Tensor tensor_src0{ ov::element::f16, orig_shape_src0, src0->data };
+        ov::Tensor tensor_src1{ ov::element::f32, orig_shape_src1, src1->data };
+        ov::Shape output_shape = { static_cast<size_t>(dst->ne[0]),
+                                   static_cast<size_t>(dst->ne[1]),
+                                   static_cast<size_t>(dst->ne[2]) };
+        ov::Tensor tensor_dst(ov::element::f32, output_shape, dst->data);
 
         ov::Core core;
         auto compiled_model = core.compile_model(model, "CPU");
         auto infer_request = compiled_model.create_infer_request();
-
-        // Construct input Tensors: treat src0->data and src1->data as 1D flat data respectively
-        ov::Tensor tensor_src0(ov::element::f16, flat_shape_src0, src0->data);
-        ov::Tensor tensor_src1(ov::element::f32, flat_shape_src1, src1->data);
 
         infer_request.set_input_tensor(0, tensor_src0);
         infer_request.set_input_tensor(1, tensor_src1);
-
-        ov::Tensor tensor_dst(ov::element::f32, { dst->ne[0], dst->ne[1], dst->ne[2]}, dst->data);
 
         infer_request.set_output_tensor(0, tensor_dst);
-
         infer_request.infer();
 
         return ;
     }
 
-    // Valid shape
+    int rank = 0;
+    if (dst->ne[2] == 1 && dst->ne[3] == 1) {
+        rank = 2;
+    } else if (dst->ne[3] == 1) {
+        rank = 3;
+    } else {
+        throw std::runtime_error("Only rank 2 or rank 3 are supported in this implementation.");
+    }
+
     std::vector<int64_t> eff_shape_src0 = get_effective_shape(src0);
     std::vector<int64_t> eff_shape_src1 = get_effective_shape(src1);
     std::vector<int64_t> eff_shape_dst = get_effective_shape(dst);
 
-    // Determine whether it is batched (effective rank==3) or two-dimensional (rank==2) or one-dimensional (rank==1)
-    int rank = static_cast<int>(eff_shape_dst.size());
-    if (rank != 1 && rank != 2 && rank != 3)
-        throw std::runtime_error("Only rank 1, 2 or 3 supported");
+    ov::Shape orig_shape_src0 = { static_cast<size_t>(src0->ne[0]),
+                                  static_cast<size_t>(src0->ne[1]),
+                                  static_cast<size_t>(src0->ne[2]),
+                                  static_cast<size_t>(src0->ne[3]) };
+    ov::Shape orig_shape_src1 = { static_cast<size_t>(src1->ne[0]),
+                                  static_cast<size_t>(src1->ne[1]),
+                                  static_cast<size_t>(src1->ne[2]),
+                                  static_cast<size_t>(src1->ne[3]) };
 
-    // Total number of flattened elements
-    size_t total_src0 = 1; for (auto d : eff_shape_src0) total_src0 *= d;
-    size_t total_src1 = 1; for (auto d : eff_shape_src1) total_src1 *= d;
-
-    ov::Shape flat_shape_src0 = { total_src0 };
-    ov::Shape flat_shape_src1 = { total_src1 };
-    // Same as above
-    // ov::Shape flat_shape_src0 = { ggml_nelements(src0) };
-    // ov::Shape flat_shape_src1 = { ggml_nelements(src1) };
-
-    auto param_flat_src0 = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, flat_shape_src0);
-    auto param_flat_src1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, flat_shape_src1);
+    auto param_src0 = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, orig_shape_src0);
+    auto param_src1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, orig_shape_src1);
 
     auto reshape_src0 = std::make_shared<ov::op::v1::Reshape>(
-        param_flat_src0,
+        param_src0,
        ov::op::v0::Constant::create(ov::element::i64, { eff_shape_src0.size() }, eff_shape_src0),
        false);
     auto reshape_src1 = std::make_shared<ov::op::v1::Reshape>(
-        param_flat_src1,
+        param_src1,
        ov::op::v0::Constant::create(ov::element::i64, { eff_shape_src1.size() }, eff_shape_src1),
        false);
-    // Convert src0: F16 -> F32
     auto src0_f32 = std::make_shared<ov::op::v0::Convert>(reshape_src0, ov::element::f32);
 
-    // Transpose src0_f32:
-    // For the 2D case, the shape of reshape_src0 is [3072,9216], and after transposition, it is [9216,3072].
-    // For the batched case, assuming the shape is [M, K, Batch], batch-wise transposition is required: use order [0, 2, 1].
     ov::Output<ov::Node> A_for_mul;
-    if (rank == 1) {
-        auto trans_order = ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector<int64_t>{1, 0});
+    if (rank == 2) {
+        auto trans_order = ov::op::v0::Constant::create(ov::element::i64, { 2 }, std::vector<int64_t>{1, 0});
         A_for_mul = std::make_shared<ov::op::v1::Transpose>(src0_f32, trans_order);
-    } else if (rank == 2) {
-        auto trans_order = ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector<int64_t>{1, 0});
-        A_for_mul = std::make_shared<ov::op::v1::Transpose>(src0_f32, trans_order);
-    } else { // rank == 3
-        auto trans_order = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{0, 2, 1});
+    } else if (rank == 3) {
+        auto trans_order = ov::op::v0::Constant::create(ov::element::i64, { 3 }, std::vector<int64_t>{0, 2, 1});
         A_for_mul = std::make_shared<ov::op::v1::Transpose>(src0_f32, trans_order);
+    } else {
+        A_for_mul = src0_f32;
     }
+    auto matmul = std::make_shared<ov::op::v0::MatMul>(reshape_src1, A_for_mul, false, false);
+
+    auto matmul_output_shape = matmul->get_output_shape(0);
+    std::vector<int64_t> final_output_shape;
+    if (matmul_output_shape.size() == 1) {
+        final_output_shape = { 1, 1, static_cast<int64_t>(matmul_output_shape[0]) };
+    } else if (matmul_output_shape.size() == 2) {
+        final_output_shape = { 1, static_cast<int64_t>(matmul_output_shape[0]), static_cast<int64_t>(matmul_output_shape[1]) };
+    } else {
+        final_output_shape = { static_cast<int64_t>(matmul_output_shape[0]), static_cast<int64_t>(matmul_output_shape[1]), static_cast<int64_t>(matmul_output_shape[2]) };
+    }
+
+    auto reshape_output = std::make_shared<ov::op::v1::Reshape>(
+        matmul,
+        ov::op::v0::Constant::create(ov::element::i64, {3}, final_output_shape),
+        false
+    );
+
+    auto model = std::make_shared<ov::Model>(ov::NodeVector{ reshape_output },
+                                             ov::ParameterVector{ param_src0, param_src1 });
+
+    ov::Tensor tensor_src0{ ov::element::f16, orig_shape_src0, (void *)src0->data };
+    ov::Tensor tensor_src1{ ov::element::f32, orig_shape_src1, (void *)src1->data };
+
+    ov::Shape output_shape = { static_cast<size_t>(dst->ne[2]),
+                               static_cast<size_t>(dst->ne[1]),
+                               static_cast<size_t>(dst->ne[0]) };
+    ov::Tensor tensor_dst(ov::element::f32, output_shape, dst->data);
+
     ov::Core core;
-    ov::Tensor tensor_src0{ov::element::f16, flat_shape_src0, (void *)src0->data};
-    ov::Tensor tensor_src1{ov::element::f32, flat_shape_src1, (void *)src1->data};
-    ov::Tensor tensor_dst(ov::element::f32, ov::Shape(eff_shape_dst.begin(), eff_shape_dst.end()), dst->data);
-
-    std::shared_ptr<ov::Node> matmul = std::make_shared<ov::op::v0::MatMul>(reshape_src1, A_for_mul, false, false);
-    auto model = std::make_shared<ov::Model>(ov::NodeVector{matmul}, ov::ParameterVector{param_flat_src0, param_flat_src1});
-    // ov::save_model(model, "/home/user/zhan/merge_git_commits/llama.cpp-ov/002_backend_mulmat_model.xml");
-
     auto compiled_model = core.compile_model(model, "CPU");
     auto infer_request = compiled_model.create_infer_request();
-
     infer_request.set_input_tensor(0, tensor_src0);
     infer_request.set_input_tensor(1, tensor_src1);
     infer_request.set_output_tensor(0, tensor_dst);
@@ -980,22 +1000,22 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe
             ggml_backend_openvino_dup_bytes(cgraph->nodes[i]);
         } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) {
             ggml_backend_openvino_view(cgraph->nodes[i]);
-        // } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) {
-        //     ggml_backend_openvino_cpy(cgraph->nodes[i]);
-        // } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) {
-        //     ggml_backend_openvino_transpose(cgraph->nodes[i]);
+        } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) {
+            ggml_backend_openvino_cpy(cgraph->nodes[i]);
+        } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) {
+            ggml_backend_openvino_transpose(cgraph->nodes[i]);
         } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) {
             ggml_backend_openvino_reshape(cgraph->nodes[i]);
-        // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) {
-        //     ggml_backend_openvino_mul_mat(cgraph->nodes[i]);
+        } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) {
+            ggml_backend_openvino_mul_mat(cgraph->nodes[i]);
         } else {
             // Process a range of nodes with openvino_frontend_compute
             int start_index = i;
             while (i < cgraph->n_nodes && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end()
-                // && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end()
+                && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end()
                 && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end()
-                // && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end()
+                && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end()
                 ) {
                 i++;
             }
@@ -1228,6 +1248,7 @@ static const std::set& openvino_ops = []() -> const std::set