diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index d2a21511dd..fd24356412 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -647,168 +647,6 @@ void ggml_backend_openvino_reshape(ggml_tensor *dst) { } void ggml_backend_openvino_view(ggml_tensor *dst) { - - /* - // Case 1: Set the output tensor shape as the same shape of the input tensor [1, 7, 9216], for next CONT node operator - if (dst->ne[0] > dst->ne[1] && (dst->ne[0] * dst->nb[0] != dst->nb[1]) && dst->ne[2] == 1) { - // if (dst->view_offs == 0) { - // return; - // } - ov::Core core; - ov::Shape input_shape{ static_cast(dst->src[0]->ne[2]), static_cast(dst->src[0]->ne[1]), static_cast(dst->src[0]->ne[0])}; - ov::Shape out_shape{ static_cast(dst->ne[2]), static_cast(dst->ne[1]), static_cast(dst->ne[0])}; - - auto input_param = std::make_shared(ov::element::f32, input_shape); - - // auto new_shape_node = ov::op::v0::Constant::create(ov::element::i64, - // ov::Shape{input_shape.size()}, - // std::vector(input_shape.begin(), input_shape.end())); - // auto res = std::make_shared(input_param, new_shape_node, false); - - int64_t split_addr = dst->view_offs / dst->nb[0]; - std::vector begin = { 0, 0, split_addr }; - std::vector end = { static_cast(dst->src[0]->ne[2]), - static_cast(dst->src[0]->ne[1]), - split_addr + static_cast(dst->ne[0]) }; - std::vector strides = { 1, 1, 1 }; - - auto begin_const = ov::op::v0::Constant::create(ov::element::i64, { begin.size() }, begin); - auto end_const = ov::op::v0::Constant::create(ov::element::i64, { end.size() }, end); - auto strides_const = ov::op::v0::Constant::create(ov::element::i64, { strides.size() }, strides); - - std::vector begin_mask = {0, 0, 0}; - std::vector end_mask = {0, 0, 0}; - auto slice = std::make_shared( - input_param, - begin_const, - end_const, - strides_const, - begin_mask, - end_mask - ); - - auto model = std::make_shared(ov::OutputVector{ slice }, - ov::ParameterVector{ input_param }); - - auto compiled_model = core.compile_model(model, "CPU"); - - ov::InferRequest infer_request = compiled_model.create_infer_request(); - - ov::Tensor input_tensor(ov::element::f32, input_shape, dst->src[0]->data); - infer_request.set_input_tensor(0, input_tensor); - - ov::Tensor output_tensor(ov::element::f32, out_shape, dst->data); - infer_request.set_output_tensor(0, output_tensor); - - infer_request.infer(); - } - */ - - - /* - // Case 2: Slice contiguous input tensor [98304, 1, 1] to contiguout output tensor [ 21504, 1, 1] - if (ggml_is_contiguous(dst) && dst->ne[1] == 1 && (dst->ne[0] * dst->nb[0] == dst->nb[1])) { - ov::Core core; - ov::Shape input_shape = { static_cast(dst->src[0]->ne[2]), - static_cast(dst->src[0]->ne[1]), - static_cast(dst->src[0]->ne[0])}; - ov::Shape output_shape = { static_cast(dst->ne[2]), - static_cast(dst->ne[1]), - static_cast(dst->ne[0])}; - auto input_param = std::make_shared(ov::element::f16, input_shape); - - - std::vector begin = { 0, 0, 0 }; - std::vector end = { static_cast(dst->ne[2]), - static_cast(dst->ne[1]), - static_cast(dst->ne[0]) }; - std::vector strides = { 1, 1, 1 }; - - auto begin_const = ov::op::v0::Constant::create(ov::element::i64, { begin.size() }, begin); - auto end_const = ov::op::v0::Constant::create(ov::element::i64, { end.size() }, end); - auto strides_const = ov::op::v0::Constant::create(ov::element::i64, { strides.size() }, strides); - - std::vector begin_mask = {0, 0, 0}; - std::vector end_mask = {0, 0, 0}; - auto slice = std::make_shared( - input_param, - begin_const, - end_const, - strides_const, - begin_mask, - end_mask - ); - - std::shared_ptr model = std::make_shared(ov::OutputVector{ slice }, - ov::ParameterVector{ input_param }); - - auto compiled_model = core.compile_model(model, "CPU"); - ov::InferRequest infer_request = compiled_model.create_infer_request(); - - ov::Tensor input_tensor(ov::element::f16, input_shape, dst->src[0]->data); - ov::Tensor output_tensor(ov::element::f16, output_shape, dst->data); - infer_request.set_input_tensor(0, input_tensor); - infer_request.set_output_tensor(0, output_tensor); - - infer_request.infer(); - } - */ - - /* - // Case 3: Reshape the input tensor [1, 1, 98304] to output tensor [1, 3072, 32](Physical shape) - if (dst->ne[0] < dst->ne[1] && dst->ne[2] == 1) { - ov::Core core; - ov::Shape input_shape = { static_cast(dst->src[0]->ne[2]), - static_cast(dst->src[0]->ne[1]), - static_cast(dst->src[0]->ne[0])}; - ov::Shape output_shape = { static_cast(dst->nb[2]), - static_cast(dst->ne[1]), - static_cast(dst->nb[1] / dst->nb[0])}; - auto input_param = std::make_shared(ov::element::f16, input_shape); - - auto new_shape_node = ov::op::v0::Constant::create(ov::element::i64, - ov::Shape{output_shape.size()}, - std::vector(output_shape.begin(), output_shape.end())); - auto res = std::make_shared(input_param, new_shape_node, false); - - std::shared_ptr model = std::make_shared(ov::OutputVector{res}, - ov::ParameterVector{input_param}); - auto compiled_model = core.compile_model(model, "CPU"); - ov::InferRequest infer_request = compiled_model.create_infer_request(); - - ov::Tensor input_tensor(ov::element::f16, input_shape, dst->src[0]->data); - ov::Tensor output_tensor(ov::element::f16, output_shape, dst->data); - infer_request.set_input_tensor(0, input_tensor); - infer_request.set_output_tensor(0, output_tensor); - - infer_request.infer(); - } - */ - - /* - // Case 4: - if (dst->ne[0] != 1 && dst->ne[1] != 1 && dst->ne[2] !=1) { - - } - */ - - ov::Core core; - ov::Shape input_shape{static_cast(dst->src[0]->ne[2]), static_cast(dst->src[0]->ne[1]), static_cast(dst->src[0]->ne[0])}; - // ov::Shape output_shape{static_cast(dst->ne[2]), static_cast(dst->ne[1]), static_cast(dst->ne[0])}; - auto input_param = std::make_shared(ov::element::f32, input_shape); - - std::shared_ptr model = std::make_shared(ov::OutputVector{input_param}, - ov::ParameterVector{input_param}); - auto compiled_model = core.compile_model(model, "CPU"); - ov::InferRequest infer_request = compiled_model.create_infer_request(); - - ov::Tensor input_tensor(ov::element::f32, input_shape, dst->src[0]->data); - // ov::Tensor output_tensor(ov::element::f32, input_shape, dst->data); - infer_request.set_input_tensor(0, input_tensor); - // infer_request.set_output_tensor(0, output_tensor); - - infer_request.infer(); - GGML_UNUSED(dst); } @@ -823,7 +661,7 @@ void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) { const size_t element_size = ggml_type_size(src0->type); // Case 1: Both tensors are contiguous - if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) { + if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && (src0->ne[0] * element_size == src0->nb[1])) { ov::Shape input_shape = { static_cast(src0->ne[2]), static_cast(src0->ne[1]), @@ -1152,6 +990,7 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe std::vector permute_indices; std::vector mul_mat_indices; + std::vector add_indices; for (int i = 0; i < cgraph->n_nodes; i++) { if (cgraph->nodes[i]->op == GGML_OP_CONT) { @@ -1168,6 +1007,8 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe permute_indices.push_back(i); } else if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT) { mul_mat_indices.push_back(i); + } else if (cgraph->nodes[i]->op == GGML_OP_ADD) { + add_indices.push_back(i); } } @@ -1177,48 +1018,49 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe bool prompt_process_flag = true; if (cgraph->nodes[0]->ne[1] == 1) { prompt_process_flag = false; - } - // int end_node = cgraph->n_nodes - 1; - // openvino_frontend_compute(backend, cgraph, 0, end_node, prompt_process_flag); - // } else { - - for (int i = 0; i < cgraph->n_nodes; i++) { - if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { - ggml_backend_openvino_permute(cgraph->nodes[i]); - // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) { - // ggml_backend_openvino_mul_mat(cgraph->nodes[i]); - // } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { - // ggml_backend_openvino_view(cgraph->nodes[i]); - // } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { - // ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); - } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) { - ggml_backend_openvino_transpose(cgraph->nodes[i]); - // } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { - // ggml_backend_openvino_reshape(cgraph->nodes[i]); - } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { - ggml_backend_openvino_cpy(cgraph->nodes[i]); - } else { - // Process a range of nodes with openvino_frontend_compute - int start_index = i; - while (i < cgraph->n_nodes - && std::find(permute_indices.begin(), permute_indices.end(), i) == permute_indices.end() - // && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end() - // && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() - // && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() - // && std::find(reshape_indices.begin(), reshape_indices.end(), i) == reshape_indices.end() - && std::find(transpose_indices.begin(), transpose_indices.end(), i) == transpose_indices.end() - && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() - ) { - i++; + // int end_node = cgraph->n_nodes - 1; + // openvino_frontend_compute(backend, cgraph, 0, end_node, prompt_process_flag); + for (int i = 0; i < cgraph->n_nodes; i++) { + if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { + ggml_backend_openvino_view(cgraph->nodes[i]); + } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { + ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); + } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { + ggml_backend_openvino_reshape(cgraph->nodes[i]); + } else { + // Process a range of nodes with openvino_frontend_compute + int start_index = i; + while (i < cgraph->n_nodes + && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() + && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() + && std::find(reshape_indices.begin(), reshape_indices.end(), i) == reshape_indices.end() + ) { + i++; + } + if (start_index < i) { + openvino_frontend_compute(backend, cgraph, start_index, --i, prompt_process_flag); + } } - if (start_index < i) { - openvino_frontend_compute(backend, cgraph, start_index, --i, prompt_process_flag); + } + } else { + for (int i = 0; i < cgraph->n_nodes; i++) { + if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { + ggml_backend_openvino_permute(cgraph->nodes[i]); + } else { + // Process a range of nodes with openvino_frontend_compute + int start_index = i; + while (i < cgraph->n_nodes + && std::find(permute_indices.begin(), permute_indices.end(), i) == permute_indices.end() + ) { + i++; + } + if (start_index < i) { + openvino_frontend_compute(backend, cgraph, start_index, --i, prompt_process_flag); + } } } } - // } - return GGML_STATUS_SUCCESS; GGML_UNUSED(backend); diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 4483241481..d91338127a 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -26,7 +26,9 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]) && ggml_is_contiguous(node)) { + if (ggml_is_contiguous(node->src[0]) + && ggml_is_contiguous(node) + && (node->src[0]->ne[0] * node->src[0]->nb[0] == node->src[0]->nb[1])) { inputs[src0_name] = node->src[0]; outputs[node_name] = node; m_input_names.push_back(src0_name); @@ -112,22 +114,31 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapop)); m_output_names.push_back(node_name); - int src0_elem_size = ggml_type_size(node->src[0]->type); - int src1_elem_size = ggml_type_size(node->src[1]->type); + // int src0_elem_size = ggml_type_size(node->src[0]->type); + // int src1_elem_size = ggml_type_size(node->src[1]->type); - int src0_logical_rows = node->src[0]->ne[1]; - int src1_logical_rows = node->src[1]->ne[1]; + // int src0_logical_rows = node->src[0]->ne[1]; + // int src1_logical_rows = node->src[1]->ne[1]; - int src0_phys_cols = node->src[0]->nb[0] / src0_elem_size; - int src0_phys_rows = src0_logical_rows; + // int src0_phys_cols = node->src[0]->nb[0] / src0_elem_size; + // int src0_phys_rows = src0_logical_rows; - int src1_phys_cols = node->src[1]->nb[1] / src1_elem_size; - int src1_phys_rows = src1_logical_rows; - ov::Shape src0_phys_shape = {1, static_cast(src0_phys_rows), static_cast(src0_phys_cols) }; - ov::Shape src1_phys_shape = {1, static_cast(src1_phys_rows), static_cast(src1_phys_cols) }; - auto input0_param = std::make_shared(ov::element::f32, src0_phys_shape); - auto input1_param = std::make_shared(ov::element::f16, src1_phys_shape); + // int src1_phys_cols = node->src[1]->nb[1] / src1_elem_size; + // int src1_phys_rows = src1_logical_rows; + // ov::Shape src0_phys_shape = {1, static_cast(src0_phys_rows), static_cast(src0_phys_cols) }; + // ov::Shape src1_phys_shape = {1, static_cast(src1_phys_rows), static_cast(src1_phys_cols) }; + // auto input0_param = std::make_shared(ov::element::f32, src0_phys_shape); + // auto input1_param = std::make_shared(ov::element::f16, src1_phys_shape); + // m_params.push_back(input0_param); + // m_params.push_back(input1_param); + + ov::Shape input0_shape = { static_cast(node->src[0]->ne[2]), + static_cast(node->src[0]->ne[1]), + static_cast(node->src[0]->ne[0])}; + auto input0_param = std::make_shared(ov::element::f32, input0_shape); m_params.push_back(input0_param); + ov::Shape input1_shape = { 1, 1, static_cast(node->src[1]->nb[2] / node->src[1]->nb[0])}; + auto input1_param = std::make_shared(ov::element::f16, input1_shape); m_params.push_back(input1_param); m_continuous = false; @@ -147,7 +158,8 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map(node->src[0]->ne[2]), // static_cast(node->src[0]->ne[1]), // static_cast(node->src[0]->ne[0])}; - // auto input_param = std::make_shared(ov::element::f32, input_shape); + // auto type = get_input_type(src0_name); + // auto input_param = std::make_shared(type, input_shape); // m_params.push_back(input_param); // if (node->ne[0] > node->ne[1] && (node->ne[0] * node->nb[0] != node->nb[1]) && node->ne[2] == 1) { diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index a0adc917e7..b8315a0013 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -27,12 +27,12 @@ std::vector> get_ggml_graph_input_tensors(std printf("Subgraph input %d: %g\n", inp, *(double*)(input_data)); #endif ov::Tensor input_tensor; - auto input_shape = ggml_decoder->get_input_shape(name).to_shape(); + ov::Shape input_shape = ggml_decoder->get_input_shape(name).to_shape(); - if (flag & op_node_name == "CONT" && input_shape[0] == 1 && input_shape[1] != 1) { - std::vector input_stride = ggml_decoder->get_input_stride(name); - ov::element::Type input_type = ggml_decoder->get_input_type(name); - size_t element_size = input_type.size(); + ov::element::Type input_type = ggml_decoder->get_input_type(name); + size_t element_size = input_type.size(); + std::vector input_stride = ggml_decoder->get_input_stride(name); + if (op_node_name == "CONT" && input_shape[0] == 1 && (input_shape[1] != 1 && flag || input_shape[2]*element_size!=input_stride[1])) { const size_t num_rows = static_cast(ggml_decoder->get_input_shape(name).to_shape()[1]); const size_t dim2 = static_cast(ggml_decoder->get_input_shape(name).to_shape()[0]); size_t phys_stride = static_cast(input_stride[1]) / element_size; @@ -42,14 +42,14 @@ std::vector> get_ggml_graph_input_tensors(std std::vector input_stride = ggml_decoder->get_input_stride(name); ov::element::Type input_type = ggml_decoder->get_input_type(name); size_t element_size = input_type.size(); - ov::Shape phys_shape; + // ov::Shape phys_shape; static int iter = 0; if (iter++ % 2 == 0) { - phys_shape = {1, input_shape[1], input_stride[2] / element_size}; - input_tensor = ov::Tensor(ov::element::f32, phys_shape, input_data); + // phys_shape = {1, input_shape[1], input_stride[2] / element_size}; + input_tensor = ov::Tensor(ov::element::f32, input_shape, input_data); } else { - phys_shape = {1, input_shape[1], input_stride[1] / element_size}; - input_tensor = ov::Tensor(ov::element::f16, phys_shape, input_data); + ov::Shape flat_shape = {1, 1, input_stride[0] / element_size}; + input_tensor = ov::Tensor(ov::element::f16, flat_shape, input_data); } } else { input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), ggml_decoder->get_input_shape(name).to_shape(), input_data); @@ -161,6 +161,11 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c auto output_tensor = infer_request.get_output_tensor(i); // output_tensor.get_shape(); std::memcpy(output_tensors[output_names[i]], output_tensor.data(), output_tensor.get_byte_size()); + // std::cout << std::left << "[ " << std::setw(2) << i << " ]: " + // << "output_names: " << std::setw(20) << output_names[i] + // << " output data: " << std::setw(15) << ((float*)output_tensor.data())[0] + // << std::setw(15) << ((float*)output_tensor.data())[1] << std::right + // << std::endl; #ifdef GGML_OPENVINO_DEBUG printf("Output %s after: %g\n", output_names[i].c_str(), *(double*)(output_tensor.data())); #endif