From b02265a5072119cdbdb7ded26a7bb2e8dc26f273 Mon Sep 17 00:00:00 2001
From: zhanmyz
Date: Sat, 15 Mar 2025 19:32:40 +0800
Subject: [PATCH] 1. Integrate the PERMUTE node into the OV Frontend for the
 prompt-processing (first-token) stage. 2. Integrate the VIEW, CONT, and
 RESHAPE nodes into the OV Frontend for the last-token (decode) stage.

---
 ggml/src/ggml-openvino.cpp              | 242 ++++--------------------
 ggml/src/ggml-openvino/ggml-decoder.cpp |  40 ++--
 ggml/src/ggml-openvino/utils.cpp        |  25 ++-
 3 files changed, 83 insertions(+), 224 deletions(-)

diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp
index d2a21511dd..fd24356412 100644
--- a/ggml/src/ggml-openvino.cpp
+++ b/ggml/src/ggml-openvino.cpp
@@ -647,168 +647,6 @@ void ggml_backend_openvino_reshape(ggml_tensor *dst) {
 }
 
 void ggml_backend_openvino_view(ggml_tensor *dst) {
-
-    /*
-    // Case 1: Set the output tensor shape as the same shape of the input tensor [1, 7, 9216], for next CONT node operator
-    if (dst->ne[0] > dst->ne[1] && (dst->ne[0] * dst->nb[0] != dst->nb[1]) && dst->ne[2] == 1) {
-        // if (dst->view_offs == 0) {
-        //     return;
-        // }
-        ov::Core core;
-        ov::Shape input_shape{ static_cast<size_t>(dst->src[0]->ne[2]), static_cast<size_t>(dst->src[0]->ne[1]), static_cast<size_t>(dst->src[0]->ne[0])};
-        ov::Shape out_shape{ static_cast<size_t>(dst->ne[2]), static_cast<size_t>(dst->ne[1]), static_cast<size_t>(dst->ne[0])};
-
-        auto input_param = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, input_shape);
-
-        // auto new_shape_node = ov::op::v0::Constant::create(ov::element::i64,
-        //                                                    ov::Shape{input_shape.size()},
-        //                                                    std::vector<int64_t>(input_shape.begin(), input_shape.end()));
-        // auto res = std::make_shared<ov::op::v1::Reshape>(input_param, new_shape_node, false);
-
-        int64_t split_addr = dst->view_offs / dst->nb[0];
-        std::vector<int64_t> begin = { 0, 0, split_addr };
-        std::vector<int64_t> end = { static_cast<int64_t>(dst->src[0]->ne[2]),
-                                     static_cast<int64_t>(dst->src[0]->ne[1]),
-                                     split_addr + static_cast<int64_t>(dst->ne[0]) };
-        std::vector<int64_t> strides = { 1, 1, 1 };
-
-        auto begin_const = ov::op::v0::Constant::create(ov::element::i64, { begin.size() }, begin);
-        auto end_const = ov::op::v0::Constant::create(ov::element::i64, { end.size() }, end);
-        auto strides_const = ov::op::v0::Constant::create(ov::element::i64, { strides.size() }, strides);
-
-        std::vector<int64_t> begin_mask = {0, 0, 0};
-        std::vector<int64_t> end_mask = {0, 0, 0};
-        auto slice = std::make_shared<ov::op::v1::StridedSlice>(
-            input_param,
-            begin_const,
-            end_const,
-            strides_const,
-            begin_mask,
-            end_mask
-        );
-
-        auto model = std::make_shared<ov::Model>(ov::OutputVector{ slice },
-                                                 ov::ParameterVector{ input_param });
-
-        auto compiled_model = core.compile_model(model, "CPU");
-
-        ov::InferRequest infer_request = compiled_model.create_infer_request();
-
-        ov::Tensor input_tensor(ov::element::f32, input_shape, dst->src[0]->data);
-        infer_request.set_input_tensor(0, input_tensor);
-
-        ov::Tensor output_tensor(ov::element::f32, out_shape, dst->data);
-        infer_request.set_output_tensor(0, output_tensor);
-
-        infer_request.infer();
-    }
-    */
-
-
-    /*
-    // Case 2: Slice contiguous input tensor [98304, 1, 1] to contiguout output tensor [ 21504, 1, 1]
-    if (ggml_is_contiguous(dst) && dst->ne[1] == 1 && (dst->ne[0] * dst->nb[0] == dst->nb[1])) {
-        ov::Core core;
-        ov::Shape input_shape = { static_cast<size_t>(dst->src[0]->ne[2]),
-                                  static_cast<size_t>(dst->src[0]->ne[1]),
-                                  static_cast<size_t>(dst->src[0]->ne[0])};
-        ov::Shape output_shape = { static_cast<size_t>(dst->ne[2]),
-                                   static_cast<size_t>(dst->ne[1]),
-                                   static_cast<size_t>(dst->ne[0])};
-        auto input_param = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, input_shape);
-
-
-        std::vector<int64_t> begin = { 0, 0, 0 };
-        std::vector<int64_t> end = { static_cast<int64_t>(dst->ne[2]),
-                                     static_cast<int64_t>(dst->ne[1]),
-                                     static_cast<int64_t>(dst->ne[0]) };
-        std::vector<int64_t> strides = { 1, 1, 1 };
-
-        auto begin_const = ov::op::v0::Constant::create(ov::element::i64, { begin.size() }, begin);
-        auto end_const = ov::op::v0::Constant::create(ov::element::i64, { end.size() }, end);
-        auto strides_const = ov::op::v0::Constant::create(ov::element::i64, { strides.size() }, strides);
-
-        std::vector<int64_t> begin_mask = {0, 0, 0};
-        std::vector<int64_t> end_mask = {0, 0, 0};
-        auto slice = std::make_shared<ov::op::v1::StridedSlice>(
-            input_param,
-            begin_const,
-            end_const,
-            strides_const,
-            begin_mask,
-            end_mask
-        );
-
-        std::shared_ptr<ov::Model> model = std::make_shared<ov::Model>(ov::OutputVector{ slice },
-                                                                       ov::ParameterVector{ input_param });
-
-        auto compiled_model = core.compile_model(model, "CPU");
-        ov::InferRequest infer_request = compiled_model.create_infer_request();
-
-        ov::Tensor input_tensor(ov::element::f16, input_shape, dst->src[0]->data);
-        ov::Tensor output_tensor(ov::element::f16, output_shape, dst->data);
-        infer_request.set_input_tensor(0, input_tensor);
-        infer_request.set_output_tensor(0, output_tensor);
-
-        infer_request.infer();
-    }
-    */
-
-    /*
-    // Case 3: Reshape the input tensor [1, 1, 98304] to output tensor [1, 3072, 32](Physical shape)
-    if (dst->ne[0] < dst->ne[1] && dst->ne[2] == 1) {
-        ov::Core core;
-        ov::Shape input_shape = { static_cast<size_t>(dst->src[0]->ne[2]),
-                                  static_cast<size_t>(dst->src[0]->ne[1]),
-                                  static_cast<size_t>(dst->src[0]->ne[0])};
-        ov::Shape output_shape = { static_cast<size_t>(dst->nb[2]),
-                                   static_cast<size_t>(dst->ne[1]),
-                                   static_cast<size_t>(dst->nb[1] / dst->nb[0])};
-        auto input_param = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, input_shape);
-
-        auto new_shape_node = ov::op::v0::Constant::create(ov::element::i64,
-                                                           ov::Shape{output_shape.size()},
-                                                           std::vector<int64_t>(output_shape.begin(), output_shape.end()));
-        auto res = std::make_shared<ov::op::v1::Reshape>(input_param, new_shape_node, false);
-
-        std::shared_ptr<ov::Model> model = std::make_shared<ov::Model>(ov::OutputVector{res},
-                                                                       ov::ParameterVector{input_param});
-        auto compiled_model = core.compile_model(model, "CPU");
-        ov::InferRequest infer_request = compiled_model.create_infer_request();
-
-        ov::Tensor input_tensor(ov::element::f16, input_shape, dst->src[0]->data);
-        ov::Tensor output_tensor(ov::element::f16, output_shape, dst->data);
-        infer_request.set_input_tensor(0, input_tensor);
-        infer_request.set_output_tensor(0, output_tensor);
-
-        infer_request.infer();
-    }
-    */
-
-    /*
-    // Case 4:
-    if (dst->ne[0] != 1 && dst->ne[1] != 1 && dst->ne[2] !=1) {
-
-    }
-    */
-
-    ov::Core core;
-    ov::Shape input_shape{static_cast<size_t>(dst->src[0]->ne[2]), static_cast<size_t>(dst->src[0]->ne[1]), static_cast<size_t>(dst->src[0]->ne[0])};
-    // ov::Shape output_shape{static_cast<size_t>(dst->ne[2]), static_cast<size_t>(dst->ne[1]), static_cast<size_t>(dst->ne[0])};
-    auto input_param = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, input_shape);
-
-    std::shared_ptr<ov::Model> model = std::make_shared<ov::Model>(ov::OutputVector{input_param},
-                                                                   ov::ParameterVector{input_param});
-    auto compiled_model = core.compile_model(model, "CPU");
-    ov::InferRequest infer_request = compiled_model.create_infer_request();
-
-    ov::Tensor input_tensor(ov::element::f32, input_shape, dst->src[0]->data);
-    // ov::Tensor output_tensor(ov::element::f32, input_shape, dst->data);
-    infer_request.set_input_tensor(0, input_tensor);
-    // infer_request.set_output_tensor(0, output_tensor);
-
-    infer_request.infer();
-
     GGML_UNUSED(dst);
 }
@@ -823,7 +661,7 @@ void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) {
     const size_t element_size = ggml_type_size(src0->type);
 
     // Case 1: Both tensors are contiguous
-    if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) {
+    if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && (src0->ne[0] * element_size == src0->nb[1])) {
         ov::Shape input_shape = { static_cast<size_t>(src0->ne[2]),
                                   static_cast<size_t>(src0->ne[1]),
@@ -1152,6 +990,7 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     std::vector<int> permute_indices;
     std::vector<int> mul_mat_indices;
+    std::vector<int> add_indices;
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
         if (cgraph->nodes[i]->op == GGML_OP_CONT) {
@@ -1168,6 +1007,8 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
             permute_indices.push_back(i);
         } else if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT) {
             mul_mat_indices.push_back(i);
+        } else if (cgraph->nodes[i]->op == GGML_OP_ADD) {
+            add_indices.push_back(i);
         }
     }
 
@@ -1177,48 +1018,49 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     bool prompt_process_flag = true;
     if (cgraph->nodes[0]->ne[1] == 1) {
         prompt_process_flag = false;
-    }
-    // int end_node = cgraph->n_nodes - 1;
-    // openvino_frontend_compute(backend, cgraph, 0, end_node, prompt_process_flag);
-    // } else {
-
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) {
-            ggml_backend_openvino_permute(cgraph->nodes[i]);
-        // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) {
-        //     ggml_backend_openvino_mul_mat(cgraph->nodes[i]);
-        // } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) {
-        //     ggml_backend_openvino_view(cgraph->nodes[i]);
-        // } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) {
-        //     ggml_backend_openvino_dup_bytes(cgraph->nodes[i]);
-        } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) {
-            ggml_backend_openvino_transpose(cgraph->nodes[i]);
-        // } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) {
-        //     ggml_backend_openvino_reshape(cgraph->nodes[i]);
-        } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) {
-            ggml_backend_openvino_cpy(cgraph->nodes[i]);
-        } else {
-            // Process a range of nodes with openvino_frontend_compute
-            int start_index = i;
-            while (i < cgraph->n_nodes
-                   && std::find(permute_indices.begin(), permute_indices.end(), i) == permute_indices.end()
-                   // && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end()
-                   // && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end()
-                   // && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end()
-                   // && std::find(reshape_indices.begin(), reshape_indices.end(), i) == reshape_indices.end()
-                   && std::find(transpose_indices.begin(), transpose_indices.end(), i) == transpose_indices.end()
-                   && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end()
-            ) {
-                i++;
+        // int end_node = cgraph->n_nodes - 1;
+        // openvino_frontend_compute(backend, cgraph, 0, end_node, prompt_process_flag);
+        for (int i = 0; i < cgraph->n_nodes; i++) {
+            if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) {
+                ggml_backend_openvino_view(cgraph->nodes[i]);
+            } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) {
+                ggml_backend_openvino_dup_bytes(cgraph->nodes[i]);
+            } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) {
+                ggml_backend_openvino_reshape(cgraph->nodes[i]);
+            } else {
+                // Process a range of nodes with openvino_frontend_compute
+                int start_index = i;
+                while (i < cgraph->n_nodes
+                       && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end()
+                       && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end()
+                       && std::find(reshape_indices.begin(), reshape_indices.end(), i) == reshape_indices.end()
+                ) {
+                    i++;
+                }
+                if (start_index < i) {
+                    openvino_frontend_compute(backend, cgraph, start_index, --i, prompt_process_flag);
+                }
             }
-            if (start_index < i) {
-                openvino_frontend_compute(backend, cgraph, start_index, --i, prompt_process_flag);
+        }
+    } else {
+        for (int i = 0; i < cgraph->n_nodes; i++) {
+            if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) {
+                ggml_backend_openvino_permute(cgraph->nodes[i]);
+            } else {
+                // Process a range of nodes with openvino_frontend_compute
+                int start_index = i;
+                while (i < cgraph->n_nodes
+                       && std::find(permute_indices.begin(), permute_indices.end(), i) == permute_indices.end()
+                ) {
+                    i++;
+                }
+                if (start_index < i) {
+                    openvino_frontend_compute(backend, cgraph, start_index, --i, prompt_process_flag);
+                }
             }
         }
     }
-    // }
-
     return GGML_STATUS_SUCCESS;
 
     GGML_UNUSED(backend);
diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index 4483241481..d91338127a 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -26,7 +26,9 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map<std::string, ggml_tensor*>& inputs, std::map<std::string, ggml_tensor*>& outputs) {
-    if (ggml_is_contiguous(node->src[0]) && ggml_is_contiguous(node)) {
+    if (ggml_is_contiguous(node->src[0])
+        && ggml_is_contiguous(node)
+        && (node->src[0]->ne[0] * node->src[0]->nb[0] == node->src[0]->nb[1])) {
         inputs[src0_name] = node->src[0];
         outputs[node_name] = node;
         m_input_names.push_back(src0_name);
@@ -112,22 +114,31 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map<std::string, ggml_tensor*>& inputs, std::map<std::string, ggml_tensor*>& outputs) {
         m_op_node_name = std::string(ggml_op_name(node->op));
         m_output_names.push_back(node_name);
 
-        int src0_elem_size = ggml_type_size(node->src[0]->type);
-        int src1_elem_size = ggml_type_size(node->src[1]->type);
+        // int src0_elem_size = ggml_type_size(node->src[0]->type);
+        // int src1_elem_size = ggml_type_size(node->src[1]->type);
 
-        int src0_logical_rows = node->src[0]->ne[1];
-        int src1_logical_rows = node->src[1]->ne[1];
+        // int src0_logical_rows = node->src[0]->ne[1];
+        // int src1_logical_rows = node->src[1]->ne[1];
 
-        int src0_phys_cols = node->src[0]->nb[0] / src0_elem_size;
-        int src0_phys_rows = src0_logical_rows;
+        // int src0_phys_cols = node->src[0]->nb[0] / src0_elem_size;
+        // int src0_phys_rows = src0_logical_rows;
 
-        int src1_phys_cols = node->src[1]->nb[1] / src1_elem_size;
-        int src1_phys_rows = src1_logical_rows;
-        ov::Shape src0_phys_shape = {1, static_cast<size_t>(src0_phys_rows), static_cast<size_t>(src0_phys_cols) };
-        ov::Shape src1_phys_shape = {1, static_cast<size_t>(src1_phys_rows), static_cast<size_t>(src1_phys_cols) };
-        auto input0_param = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, src0_phys_shape);
-        auto input1_param = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, src1_phys_shape);
+        // int src1_phys_cols = node->src[1]->nb[1] / src1_elem_size;
+        // int src1_phys_rows = src1_logical_rows;
+        // ov::Shape src0_phys_shape = {1, static_cast<size_t>(src0_phys_rows), static_cast<size_t>(src0_phys_cols) };
+        // ov::Shape src1_phys_shape = {1, static_cast<size_t>(src1_phys_rows), static_cast<size_t>(src1_phys_cols) };
+        // auto input0_param = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, src0_phys_shape);
+        // auto input1_param = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, src1_phys_shape);
+        // m_params.push_back(input0_param);
+        // m_params.push_back(input1_param);
+
+        ov::Shape input0_shape = { static_cast<size_t>(node->src[0]->ne[2]),
+                                   static_cast<size_t>(node->src[0]->ne[1]),
+                                   static_cast<size_t>(node->src[0]->ne[0])};
+        auto input0_param = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, input0_shape);
         m_params.push_back(input0_param);
+        ov::Shape input1_shape = { 1, 1, static_cast<size_t>(node->src[1]->nb[2] / node->src[1]->nb[0])};
+        auto input1_param = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, input1_shape);
         m_params.push_back(input1_param);
 
         m_continuous = false;
@@ -147,7 +158,8 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map<std::string, ggml_tensor*>& inputs, std::map<std::string, ggml_tensor*>& outputs) {
         // ov::Shape input_shape = { static_cast<size_t>(node->src[0]->ne[2]),
         //                           static_cast<size_t>(node->src[0]->ne[1]),
         //                           static_cast<size_t>(node->src[0]->ne[0])};
-        // auto input_param = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, input_shape);
+        // auto type = get_input_type(src0_name);
+        // auto input_param = std::make_shared<ov::op::v0::Parameter>(type, input_shape);
         // m_params.push_back(input_param);
 
         // if (node->ne[0] > node->ne[1] && (node->ne[0] * node->nb[0] != node->nb[1]) && node->ne[2] == 1) {
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index a0adc917e7..b8315a0013 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -27,12 +27,12 @@ std::vector<std::pair<std::string, ov::Tensor>> get_ggml_graph_input_tensors(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
         printf("Subgraph input %d: %g\n", inp, *(double*)(input_data));
 #endif
         ov::Tensor input_tensor;
-        auto input_shape = ggml_decoder->get_input_shape(name).to_shape();
+        ov::Shape input_shape = ggml_decoder->get_input_shape(name).to_shape();
 
-        if (flag & op_node_name == "CONT" && input_shape[0] == 1 && input_shape[1] != 1) {
-            std::vector<size_t> input_stride = ggml_decoder->get_input_stride(name);
-            ov::element::Type input_type = ggml_decoder->get_input_type(name);
-            size_t element_size = input_type.size();
+        ov::element::Type input_type = ggml_decoder->get_input_type(name);
+        size_t element_size = input_type.size();
+        std::vector<size_t> input_stride = ggml_decoder->get_input_stride(name);
+        if (op_node_name == "CONT" && input_shape[0] == 1 && ((input_shape[1] != 1 && flag) || input_shape[2] * element_size != input_stride[1])) {
             const size_t num_rows = static_cast<size_t>(ggml_decoder->get_input_shape(name).to_shape()[1]);
             const size_t dim2 = static_cast<size_t>(ggml_decoder->get_input_shape(name).to_shape()[0]);
             size_t phys_stride = static_cast<size_t>(input_stride[1]) / element_size;
@@ -42,14 +42,14 @@ std::vector<std::pair<std::string, ov::Tensor>> get_ggml_graph_input_tensors(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
             std::vector<size_t> input_stride = ggml_decoder->get_input_stride(name);
             ov::element::Type input_type = ggml_decoder->get_input_type(name);
             size_t element_size = input_type.size();
-            ov::Shape phys_shape;
+            // ov::Shape phys_shape;
             static int iter = 0;
             if (iter++ % 2 == 0) {
-                phys_shape = {1, input_shape[1], input_stride[2] / element_size};
-                input_tensor = ov::Tensor(ov::element::f32, phys_shape, input_data);
+                // phys_shape = {1, input_shape[1], input_stride[2] / element_size};
+                input_tensor = ov::Tensor(ov::element::f32, input_shape, input_data);
             } else {
-                phys_shape = {1, input_shape[1], input_stride[1] / element_size};
-                input_tensor = ov::Tensor(ov::element::f16, phys_shape, input_data);
+                ov::Shape flat_shape = {1, 1, input_stride[0] / element_size};
+                input_tensor = ov::Tensor(ov::element::f16, flat_shape, input_data);
             }
         } else {
             input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), ggml_decoder->get_input_shape(name).to_shape(), input_data);
@@ -161,6 +161,11 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph,
         auto output_tensor = infer_request.get_output_tensor(i);
         // output_tensor.get_shape();
         std::memcpy(output_tensors[output_names[i]], output_tensor.data(), output_tensor.get_byte_size());
+        // std::cout << std::left << "[ " << std::setw(2) << i << " ]: "
+        //           << "output_names: " << std::setw(20) << output_names[i]
+        //           << " output data: " << std::setw(15) << ((float*)output_tensor.data())[0]
+        //           << std::setw(15) << ((float*)output_tensor.data())[1] << std::right
+        //           << std::endl;
 #ifdef GGML_OPENVINO_DEBUG
         printf("Output %s after: %g\n", output_names[i].c_str(), *(double*)(output_tensor.data()));
 #endif
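
Note on the contiguity guards above: both ggml_backend_openvino_dup_bytes and the decoder's first branch now spell out the row-packing requirement explicitly, taking the fast path only when the byte stride between rows equals the byte size of one row. As a standalone predicate this reads as follows (a minimal sketch; the helper name rows_are_packed is ours, not part of the patch):

    #include "ggml.h"

    // True when dimension 0 is densely packed: the stride between two
    // consecutive rows (nb[1]) equals the byte size of one row of ne[0]
    // elements, so rows carry no trailing padding.
    static bool rows_are_packed(const struct ggml_tensor * t) {
        return (size_t) t->ne[0] * t->nb[0] == t->nb[1];
    }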
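
Both branches of the reworked ggml_backend_openvino_graph_compute follow the same partitioning pattern: nodes whose op needs a dedicated handler are executed one by one, and every maximal run of the remaining nodes is handed to openvino_frontend_compute as a single inclusive range. A condensed sketch of that pattern, with handle_node and compute_range standing in for the real calls (both names are ours):

    #include <algorithm>
    #include <vector>

    // Walk the node list; `special` holds the indices that need a dedicated
    // handler. Every maximal run of non-special nodes becomes one subgraph.
    template <typename HandleNode, typename ComputeRange>
    void partition_and_dispatch(int n_nodes, const std::vector<int> & special,
                                HandleNode handle_node, ComputeRange compute_range) {
        for (int i = 0; i < n_nodes; i++) {
            if (std::find(special.begin(), special.end(), i) != special.end()) {
                handle_node(i);
            } else {
                int start = i;
                while (i < n_nodes &&
                       std::find(special.begin(), special.end(), i) == special.end()) {
                    i++;
                }
                compute_range(start, --i);  // inclusive end index, as in the patch
            }
        }
    }

In the last-token stage `special` corresponds to the VIEW, CONT, and RESHAPE indices; in the prompt-processing stage it corresponds to the PERMUTE indices.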
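
The utils.cpp changes keep the zero-copy staging of ggml buffers but switch the f16 operand to a flattened physical shape: {1, 1, nb[2] / nb[0]} in the decoder and {1, 1, input_stride[0] / element_size} at tensor-creation time. A minimal sketch of that wrapping, assuming the buffer outlives the tensor (the function name is illustrative):

    #include <openvino/openvino.hpp>

    // Wrap an existing buffer in an ov::Tensor without copying. The shape
    // describes the physical element count of the flattened view rather than
    // the logical ggml dimensions; OpenVINO reads the memory in place.
    ov::Tensor wrap_f16_flat(void * data, size_t n_elements) {
        return ov::Tensor(ov::element::f16, ov::Shape{1, 1, n_elements}, data);
    }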