From 57582fda39208b9d3e5324102f5ddaf513f722ef Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Wed, 19 Feb 2025 17:51:07 +0800 Subject: [PATCH] add implementation of CPY when the output tensor is non-contiguous --- ggml/src/ggml-openvino.cpp | 147 ++++++++++++------------------------- 1 file changed, 48 insertions(+), 99 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 99a32b1dfd..dc45f0fe6d 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -529,7 +529,7 @@ void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) { std::vector full_dst_shape = { dst->ne[2], dst->ne[1], dst->ne[0]}; auto final_shape_const = ov::op::v0::Constant::create(ov::element::i64, { full_dst_shape.size() }, full_dst_shape); - auto model = std::make_shared(ov::NodeVector{ batched_matmul }, ov::ParameterVector{param_src1, param_src0}); + auto model = std::make_shared(ov::NodeVector{ batched_matmul }, ov::ParameterVector{param_src0, param_src1}); ov::Core core; auto compiled_model = core.compile_model(model, "CPU"); @@ -538,8 +538,8 @@ void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) { // Construct input Tensors: treat src0->data and src1->data as 1D flat data respectively ov::Tensor tensor_src0(ov::element::f16, flat_shape_src0, src0->data); ov::Tensor tensor_src1(ov::element::f32, flat_shape_src1, src1->data); - infer_request.set_input_tensor(0, tensor_src1); - infer_request.set_input_tensor(1, tensor_src0); + infer_request.set_input_tensor(0, tensor_src0); + infer_request.set_input_tensor(1, tensor_src1); ov::Tensor tensor_dst(ov::element::f32, ov::Shape(full_dst_shape.begin(), full_dst_shape.end()), dst->data); infer_request.set_output_tensor(0, tensor_dst); @@ -548,9 +548,6 @@ void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) { return ; } - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - // Valid shape std::vector eff_shape_src0 = get_effective_shape(src0); std::vector eff_shape_src1 = get_effective_shape(src1); @@ -604,13 +601,13 @@ void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) { ov::Tensor tensor_dst(ov::element::f32, ov::Shape(eff_shape_dst.begin(), eff_shape_dst.end()), dst->data); std::shared_ptr matmul = std::make_shared(reshape_src1, A_for_mul, false, false); - auto model = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{param_flat_src1, param_flat_src0}); + auto model = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{param_flat_src0, param_flat_src1}); auto compiled_model = core.compile_model(model, "CPU"); auto infer_request = compiled_model.create_infer_request(); - infer_request.set_input_tensor(0, tensor_src1); - infer_request.set_input_tensor(1, tensor_src0); + infer_request.set_input_tensor(0, tensor_src0); + infer_request.set_input_tensor(1, tensor_src1); infer_request.set_output_tensor(0, tensor_dst); infer_request.infer(); } @@ -922,111 +919,63 @@ void ggml_backend_openvino_cpy(struct ggml_tensor *dst) { infer_request.set_output_tensor(0, dst_tensor); infer_request.infer(); } else { - // In this example, the logical shape is [7,3072,1,1]. - // Here we assume that the number of "rows" is 3072 and the number of "columns" is 7. - const size_t num_cols = static_cast(dst->ne[0]); // 7 - const size_t num_rows = static_cast(dst->ne[1]); // 3072 - const size_t total_elems = num_cols * num_rows; // 7 * 3072 = 21504 - - // For src0: - // src0->nb[0] = 12288, so the stride along logical dimension 0 = 12288/4 = 3072 (f32) - // const size_t src_stride0 = 12288 / ggml_type_size(src0->type); // 3072 - const size_t src_stride0 = src0->nb[0] / ggml_type_size(src0->type); // 3072 - - // Construct index array (length 21504), in flat output order (row-first, row length = 7): - // For output flat index n, set: - // r = n / 7, c = n % 7. - // Valid data index corresponding to src0 = c * src_stride0 + r. - std::vector indices; - indices.reserve(total_elems); - for (size_t n = 0; n < total_elems; n++) { - size_t r = n / num_cols; // r in [0,3072) - size_t c = n % num_cols; // c in [0,7) - int64_t idx = static_cast(c * src_stride0 + r); - indices.push_back(idx); + std::vector gather_idx; + for (int row = 0; row < dst->src[0]->ne[1]; row++) { + for (int col = 0; col < dst->src[0]->ne[0]; col++) { + gather_idx.push_back((row*dst->src[0]->nb[1]+col*dst->src[0]->nb[0])/4); + } } + size_t N = gather_idx.size(); + ov::Shape gather_idx_shape = {N, 1}; + std::vector scatter_idx; + for (int row = 0; row < dst->ne[1]; row++) { + for (int col = 0; col < dst->ne[0]; col++) { + scatter_idx.push_back(row * dst->nb[1] / 2 + col); + } + } + ov::Shape scatter_idx_shape = {N, 1}; - // --- Construct OpenVINO calculation graph --- - // 1. Encapsulate src0->data into 1D input Tensor with shape [21504] - ov::Shape flat_shape = { total_elems }; - auto input_param = std::make_shared(ov::element::f32, flat_shape); + // param_src0 shape => 1D, rank=1, size is large enough. For example, row*col= 21504 + some padding, e.g. 80000 + // ov::Shape flat_src0_shape = {80000}; + ov::Shape flat_src0_shape = {dst->src[0]->nb[2]}; + auto param_src0 = std::make_shared(ov::element::f32, flat_src0_shape); - // 2. Constructs an index constant with a shape of [21504] - auto indices_const = ov::op::v0::Constant::create(ov::element::i64, flat_shape, indices); + auto gather_indices_const = ov::op::v0::Constant::create(ov::element::i64, gather_idx_shape, gather_idx); + auto gather_axis_const = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto gathered = std::make_shared( + param_src0, gather_indices_const, gather_axis_const); - // 3. Construct axis constant, axis = 0 - auto axis_const = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); - - // 4. Use the Gather operator to collect valid data. The result shape is [21504], type f32 - auto gathered = std::make_shared(input_param, indices_const, axis_const); - - // 5. Convert data types: f32 to f16 auto converted = std::make_shared(gathered, ov::element::f16); - // 6. Reshape into a 2D tensor with shape [num_rows, num_cols] = [3072,7]. - // Note: row-first arrangement is used here, that is, the 0th dimension represents rows (3072 rows) and the 1st dimension represents columns (7 consecutive elements) - std::vector new_shape = { static_cast(num_rows), static_cast(num_cols) }; - auto reshape_const = ov::op::v0::Constant::create(ov::element::i64, {2}, new_shape); - auto reshaped = std::make_shared(converted, reshape_const, false); + // param_dst_base shape => 1D, rank=1, size够大, e.g. row=3072 => i up to 3071 => offset i*64=196544 + j*2, e.g.200000 + // ov::Shape flat_dst_shape = {200000, 1}; + ov::Shape flat_dst_shape = {dst->nb[2], 1}; + auto param_dst_base = std::make_shared(ov::element::f16, flat_dst_shape); - // 7. To keep consistent with the logical shape of dst [7,3072,1,1] (note: the order of ne arrays in ggml may be different from the intuitive), - // Here we finally need to get a flat continuous result with row-first arrangement of [3072,7] (i.e., 7 consecutive elements per row). - // If you need to expand to 4D, you can further reshape, but here we only focus on two-dimensional valid data. - // Let output_shape = [num_rows, num_cols] = [3072,7] + auto scatter_indices_const = ov::op::v0::Constant::create(ov::element::i64, scatter_idx_shape, scatter_idx); - // 8. Construct model: input is input_param, output is reshaped - auto model = std::make_shared(ov::OutputVector{ reshaped }, ov::ParameterVector{ input_param }); + // ScatterNDUpdate( base, scatter_indices, updates ) + // scatter_indices last dimension = 1 => each index is 1D coordinate + auto scatter = std::make_shared( + param_dst_base, scatter_indices_const, converted + ); + + ov::ParameterVector params = { param_src0, param_dst_base }; + auto model = std::make_shared(ov::OutputVector{ scatter }, params); - ov::Core core; auto compiled_model = core.compile_model(model, "CPU"); auto infer_request = compiled_model.create_infer_request(); - // 9. Construct input Tensor: directly wrap src0->data, shape is flat_shape, type f32 - ov::Tensor input_tensor(ov::element::f32, flat_shape, src0->data); - infer_request.set_input_tensor(0, input_tensor); + ov::Tensor tensor_src0(ov::element::f32, flat_src0_shape, src0->data); + ov::Tensor tensor_dst_base(ov::element::f16, flat_dst_shape, dst->data); - // 10. Since dst is non-contiguous (row spacing is dst->nb[1] = 64 bytes), - // We let the model output to a temporary continuous buffer and then copy it row by row to dst->data. - ov::Shape contig_output_shape = { num_rows, num_cols }; // [3072,7] - // Allocate a temporary buffer (to store f16 data, number of elements = 3072*7) - std::vector temp_output(total_elems); - ov::Tensor output_tensor_contig(ov::element::f16, contig_output_shape, temp_output.data()); - infer_request.set_output_tensor(0, output_tensor_contig); + infer_request.set_input_tensor(0, tensor_src0); + infer_request.set_input_tensor(1, tensor_dst_base); + + ov::Tensor out_tensor(ov::element::f16, flat_dst_shape, dst->data); + infer_request.set_output_tensor(0, out_tensor); - // 11. Execute inference, the computation graph will collect, convert, and reshape to obtain a continuous f16 result infer_request.infer(); - - // 12. Copy temporary output to dst->data by line, considering the non-continuous storage of dst (each line is separated by dst->nb[1] bytes) - // Each line of valid data is num_cols * sizeof(f16) = 7 * 2 = 14 bytes. - uint8_t *dst_ptr = static_cast(dst->data); - size_t dst_row_stride = static_cast(dst->nb[1]); // 64 bytes per row - size_t row_bytes = num_cols * ggml_type_size(dst->type); // 7 * 2 = 14 bytes - for (size_t r = 0; r < num_rows; r++) { - // Temporary output is a continuous two-dimensional array, offset = r * num_cols - uint8_t *src_row_ptr = reinterpret_cast(temp_output.data()) + r * row_bytes; - // Copy row_bytes to the starting address of the dst row - std::memcpy(dst_ptr + r * dst_row_stride, src_row_ptr, row_bytes); - } - - /** - // Non-contiguous case: element-wise copy - for (int64_t i03 = 0; i03 < dst->ne[3]; ++i03) { - for (int64_t i02 = 0; i02 < dst->ne[2]; ++i02) { - for (int64_t i01 = 0; i01 < dst->ne[1]; ++i01) { - for (int64_t i00 = 0; i00 < dst->ne[0]; ++i00) { - const char *src_ptr = static_cast(src0->data) + - i00 * src0->nb[0] + i01 * src0->nb[1] + - i02 * src0->nb[2] + i03 * src0->nb[3]; - - char *dst_ptr = static_cast(dst->data) + - i00 * dst->nb[0] + i01 * dst->nb[1] + - i02 * dst->nb[2] + i03 * dst->nb[3]; - - *(ggml_fp16_t *)dst_ptr = GGML_FP32_TO_FP16(*(const float *)src_ptr); - } - } - } - }*/ } }