diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp
index 082ab27458..679b030dfa 100644
--- a/ggml/src/ggml-openvino.cpp
+++ b/ggml/src/ggml-openvino.cpp
@@ -849,6 +849,7 @@ static void ggml_backend_openvino_permute(const struct ggml_tensor * dst) {
 void ggml_backend_openvino_cpy(struct ggml_tensor *dst) {
     const struct ggml_tensor *src0 = dst->src[0];
+    const struct ggml_tensor *src1 = dst->src[1];
     assert(src0 != nullptr);
     assert(ggml_nelements(dst) == ggml_nelements(src0));
@@ -889,64 +890,81 @@ void ggml_backend_openvino_cpy(struct ggml_tensor *dst) {
         infer_request.set_output_tensor(0, dst_tensor);
         infer_request.infer();
     } else {
-        std::vector<int64_t> gather_idx;
-        for (int row = 0; row < dst->src[0]->ne[1]; row++) {
-            for (int col = 0; col < dst->src[0]->ne[0]; col++) {
-                gather_idx.push_back((row*dst->src[0]->nb[1]+col*dst->src[0]->nb[0])/4);
-            }
-        }
-        size_t N = gather_idx.size();
-        ov::Shape gather_idx_shape = {N, 1};
-        std::vector<int64_t> scatter_idx;
-        for (int row = 0; row < dst->ne[1]; row++) {
-            for (int col = 0; col < dst->ne[0]; col++) {
-                scatter_idx.push_back(row * dst->nb[1] / 2 + col);
-            }
-        }
-        ov::Shape scatter_idx_shape = {N, 1};
+        int src0_elem_size = ggml_type_size(src0->type);
+        int src1_elem_size = ggml_type_size(src1->type);
-        // param_src0 shape => 1D, rank=1, size is large enough. For example, row*col= 21504 + some padding, e.g. 80000
-        // ov::Shape flat_src0_shape = {80000};
-        ov::Shape flat_src0_shape = {dst->src[0]->nb[2]};
-        auto param_src0 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, flat_src0_shape);
-        // auto param_src00 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, flat_src0_shape);
+        int src0_logical_cols = src0->ne[0];
+        int src0_logical_rows = src0->ne[1];
+        int src1_logical_cols = src1->ne[0];
+        int src1_logical_rows = src1->ne[1];
+
+        int src0_phys_cols = src0->nb[0] / src0_elem_size;
+        int src0_phys_rows = src0_logical_rows;
+
+        int src1_phys_cols = src1->nb[1] / src1_elem_size;
+        int src1_phys_rows = src1_logical_rows;
+
+        ov::Shape src0_phys_shape = {1, static_cast<size_t>(src0_phys_rows), static_cast<size_t>(src0_phys_cols) };
+        ov::Shape src1_phys_shape = {1, static_cast<size_t>(src1_phys_rows), static_cast<size_t>(src1_phys_cols) };
+
+        size_t logical_elems = static_cast<size_t>(src0_logical_cols * src0_logical_rows);
+        size_t src_flat_size = 1 * src0_phys_cols * src0_phys_rows;
+        size_t dst_flat_size = 1 * src1_phys_rows * src1_phys_cols;
+
+        ov::Core core;
+
+        std::vector<int64_t> gather_idx;
+        gather_idx.reserve(logical_elems);
+        for (int row = 0; row < src0_logical_rows; row++) {
+            for (int col = 0; col < src0_logical_cols; col++) {
+                gather_idx.push_back(static_cast<int64_t>(row + col * src0_phys_rows));
+            }
+        }
+        ov::Shape gather_idx_shape = { logical_elems };
+
+        std::vector<int64_t> scatter_idx;
+        scatter_idx.reserve(logical_elems);
+        for (int row = 0; row < src1_logical_rows; row++) {
+            for (int col = 0; col < src1_logical_cols; col++) {
+                scatter_idx.push_back(static_cast<int64_t>(row * src1_phys_cols + col));
+            }
+        }
+        ov::Shape scatter_idx_shape = { logical_elems, 1 };
+
+        auto param_src0 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, src0_phys_shape);
+        auto param_src1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, src1_phys_shape);
+
+        auto src_flat_shape_const = ov::op::v0::Constant::create(ov::element::i64, {1},
+                                                                 { static_cast<int64_t>(src_flat_size) });
+        auto reshape_src = std::make_shared<ov::op::v1::Reshape>(param_src0, src_flat_shape_const, false);
+        auto dst_flat_shape_const = ov::op::v0::Constant::create(ov::element::i64, {1},
+                                                                 { static_cast<int64_t>(dst_flat_size) });
+        auto reshape_dst = std::make_shared<ov::op::v1::Reshape>(param_src1, dst_flat_shape_const, false);
         auto gather_indices_const = ov::op::v0::Constant::create(ov::element::i64, gather_idx_shape, gather_idx);
-        auto gather_axis_const = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
-        auto gathered = std::make_shared<ov::op::v8::Gather>(
-            param_src0, gather_indices_const, gather_axis_const);
-
+        auto axis_const = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
+        auto gathered = std::make_shared<ov::op::v8::Gather>(reshape_src, gather_indices_const, axis_const);
         auto converted = std::make_shared<ov::op::v0::Convert>(gathered, ov::element::f16);
-        // param_dst_base shape => 1D, rank=1, size is large enough, e.g. row=3072 => i up to 3071 => offset i*64=196544 + j*2, e.g. 200000
-        // ov::Shape flat_dst_shape = {200000, 1};
-        ov::Shape flat_dst_shape = {dst->nb[2], 1};
-        auto param_dst_base = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, flat_dst_shape);
-        // auto param_dst_base11 = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, flat_dst_shape);
-
         auto scatter_indices_const = ov::op::v0::Constant::create(ov::element::i64, scatter_idx_shape, scatter_idx);
+        auto scatter = std::make_shared<ov::op::v3::ScatterNDUpdate>(reshape_dst, scatter_indices_const, converted);
-        // ScatterNDUpdate( base, scatter_indices, updates )
-        // scatter_indices last dimension = 1 => each index is 1D coordinate
-        auto scatter = std::make_shared<ov::op::v3::ScatterNDUpdate>(
-            param_dst_base, scatter_indices_const, converted
-        );
-
-        ov::ParameterVector params = { param_src0, param_dst_base };
-        // ov::ParameterVector params = { param_src0};
-        // ov::ParameterVector params = { param_src00, param_dst_base11};
-        auto model = std::make_shared<ov::Model>(ov::OutputVector{ scatter }, params);
+        std::vector<int64_t> dst_phys_shape_vec = {1, static_cast<int64_t>(src1_phys_rows),
+                                                   static_cast<int64_t>(src1_phys_cols) };
+        auto dst_phys_shape_const = ov::op::v0::Constant::create(ov::element::i64, {3}, dst_phys_shape_vec);
+        auto final_output = std::make_shared<ov::op::v1::Reshape>(scatter, dst_phys_shape_const, false);
+        ov::ParameterVector params = { param_src0, param_src1 };
+        auto model = std::make_shared<ov::Model>(ov::OutputVector{ final_output }, params);
         auto compiled_model = core.compile_model(model, "CPU");
         auto infer_request = compiled_model.create_infer_request();
-        ov::Tensor tensor_src0(ov::element::f32, flat_src0_shape, src0->data);
-        ov::Tensor tensor_dst_base(ov::element::f16, flat_dst_shape, dst->data);
+        ov::Tensor tensor_src(ov::element::f32, src0_phys_shape, src0->data);
+        ov::Tensor tensor_dst(ov::element::f16, src1_phys_shape, src1->data);
+        infer_request.set_input_tensor(0, tensor_src);
+        infer_request.set_input_tensor(1, tensor_dst);
-        infer_request.set_input_tensor(0, tensor_src0);
-        infer_request.set_input_tensor(1, tensor_dst_base);
-
-        ov::Tensor out_tensor(ov::element::f16, flat_dst_shape, dst->data);
+        ov::Tensor out_tensor(ov::element::f16, src1_phys_shape, dst->data);
         infer_request.set_output_tensor(0, out_tensor);
         infer_request.infer();
@@ -986,15 +1004,17 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe
     // Process nodes in order
-    // if (cgraph->nodes[0]->ne[1] == 1) {
-    //     bool prompt_process_flag = false;
+    bool prompt_process_flag = true;
+    if (cgraph->nodes[0]->ne[1] == 1) {
+        prompt_process_flag = false;
+    }
     //     int end_node = cgraph->n_nodes - 1;
     //     openvino_frontend_compute(backend, cgraph, 0, end_node, prompt_process_flag);
     // } else {
     for (int i = 0; i < cgraph->n_nodes; i++) {
         if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) {
-            // ggml_backend_openvino_permute(cgraph->nodes[i]);
+            ggml_backend_openvino_permute(cgraph->nodes[i]);
             // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) {
             //     ggml_backend_openvino_mul_mat(cgraph->nodes[i]);
         } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) {
@@ -1020,7 +1040,7 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe
                 i++;
             }
             if (start_index < i) {
-                openvino_frontend_compute(backend, cgraph, start_index, --i);
+                openvino_frontend_compute(backend, cgraph, start_index, --i, prompt_process_flag);
             }
         }
     }
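Reviewer note (not part of the patch): for readers less familiar with the OpenVINO ops used in the new non-contiguous CPY path, the sketch below shows the same Gather -> Convert -> ScatterNDUpdate pattern reduced to a free function. The helper name, index vectors, and flat sizes are illustrative assumptions, not code from this PR.

```cpp
// Minimal sketch, assuming a flattened f32 source buffer and a flattened f16
// destination buffer; gather_idx / scatter_idx hold element offsets computed
// from the ggml ne/nb metadata, as the patch does for the strided CPY case.
#include <cstdint>
#include <vector>
#include <openvino/openvino.hpp>
#include <openvino/op/ops.hpp>

static ov::InferRequest build_strided_copy_request(ov::Core& core,
                                                   size_t src_flat_size,
                                                   size_t dst_flat_size,
                                                   const std::vector<int64_t>& gather_idx,
                                                   const std::vector<int64_t>& scatter_idx) {
    const size_t n = gather_idx.size();

    // Flat views of the physical source and destination buffers.
    auto src = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{src_flat_size});
    auto dst = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::Shape{dst_flat_size});

    // Gather the logical elements out of the strided source...
    auto g_idx = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{n}, gather_idx);
    auto axis  = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {0});
    auto gathered = std::make_shared<ov::op::v8::Gather>(src, g_idx, axis);

    // ...convert them to the destination precision...
    auto converted = std::make_shared<ov::op::v0::Convert>(gathered, ov::element::f16);

    // ...and scatter them to their physical offsets in the destination.
    auto s_idx = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{n, 1}, scatter_idx);
    auto scattered = std::make_shared<ov::op::v3::ScatterNDUpdate>(dst, s_idx, converted);

    auto model = std::make_shared<ov::Model>(ov::OutputVector{scattered},
                                             ov::ParameterVector{src, dst});
    return core.compile_model(model, "CPU").create_infer_request();
}
```

The caller would then bind the raw src/dst buffers with set_input_tensor/set_output_tensor, as the patch does, so the scatter result is written straight back into the ggml destination buffer.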
diff --git a/ggml/src/ggml-openvino/decoder.h b/ggml/src/ggml-openvino/decoder.h
index 729946ac39..584f16986c 100644
--- a/ggml/src/ggml-openvino/decoder.h
+++ b/ggml/src/ggml-openvino/decoder.h
@@ -36,8 +36,6 @@ public:
     virtual std::vector<std::string> get_input_names() const = 0;
 
-    virtual const std::string& get_node_op_name(const std::string& name) const = 0;
-
     virtual std::string& get_op_node_name(const std::string& name, const int index = -1) = 0;
 
     // virtual const struct tensor_info get_node_op_info(const std::string& name) const = 0;
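Reviewer note (not part of the patch): the decoder drops the name-to-op std::map in favour of the ordered m_op_node_name pair list. Below is a minimal sketch of why an ordered list is useful here (the same input name can be registered more than once, e.g. both CPY sources), under a simplified lookup rule that is an assumption, not the decoder's actual get_op_node_name semantics.

```cpp
// Illustrative only: an ordered (tensor name, op name) registry that preserves
// duplicate names in insertion order, unlike a std::map keyed by name.
#include <string>
#include <utility>
#include <vector>

struct OpNodeNames {
    std::vector<std::pair<std::string, std::string>> entries;  // (tensor name, op name)

    // Return the op name for the index-th occurrence of `name`
    // (index == -1 means "first occurrence"); empty string if absent.
    std::string lookup(const std::string& name, int index = -1) const {
        int seen = 0;
        for (const auto& e : entries) {
            if (e.first != name) continue;
            if (index < 0 || seen == index) return e.second;
            ++seen;
        }
        return {};
    }
};
```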
diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index 2b04cd632a..218c53f09f 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -6,18 +6,6 @@ #include
 void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map<std::string, ggml_tensor*>& inputs, std::map<std::string, ggml_tensor*>& outputs) {
-    // m_node_op_name[node->name] = ggml_op_name(node->op);
-
-    // std::string src0_name = std::string(node->src[0]->name) + "_" + std::to_string(node->src[0]->view_offs) + "_input_" + ggml_op_name(node->src[0]->op);
-    // std::string node_name = std::string(node->name) + "_" + std::to_string(node->view_offs) + "_output_" + ggml_op_name(node->op);
-
-    // Execute singel CONT operator is OK
-    // std::string src0_name = std::string(node->src[0]->name) + "_" + std::to_string(node->src[0]->view_offs) + "_" + ggml_op_name(node->src[0]->op);
-    // std::string node_name = std::string(node->name) + "_" + std::to_string(node->view_offs) + "_" + ggml_op_name(node->op);
-
-    // std::string src0_name = std::string(node->src[0]->name) + "_" + std::to_string(node->src[0]->view_offs);
-    // std::string node_name = std::string(node->name) + "_" + std::to_string(node->view_offs);
-
     std::string src0_name = std::string(node->src[0]->name);
     std::string node_name = std::string(node->name);
@@ -32,7 +20,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map
         inputs[src0_name] = node->src[0];
         outputs[node_name] = node;
         m_input_names.push_back(src0_name);
-        m_node_op_name[src0_name] = ggml_op_name(node->op);
         m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op));
         m_output_names.push_back(node_name);
         break;
@@ -43,7 +30,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map
         inputs[src0_name] = node->src[0];
         outputs[node_name] = node;
         m_input_names.push_back(src0_name);
-        m_node_op_name[src0_name] = ggml_op_name(node->op);
         m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op));
         m_output_names.push_back(node_name);
 
@@ -64,7 +50,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map
         inputs[src0_name] = node->src[0];
         outputs[node_name] = node;
         m_input_names.push_back(src0_name);
-        m_node_op_name[src0_name] = ggml_op_name(node->op);
         m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op));
         m_output_names.push_back(node_name);
 
@@ -87,7 +72,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map
         inputs[src0_name] = node->src[0];
         outputs[node_name] = node;
         m_input_names.push_back(src0_name);
-        m_node_op_name[src0_name] = ggml_op_name(node->op);
         m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op));
         m_output_names.push_back(node_name);
 
@@ -107,32 +91,45 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map
             inputs[src0_name] = node->src[0];
             outputs[node_name] = node;
             m_input_names.push_back(src0_name);
-            m_node_op_name[src0_name] = ggml_op_name(node->op);
             m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op));
             m_output_names.push_back(node_name);
             m_continuous = true;
-            // ov::Shape src_shape(node->src[0]->ne, node->src[0]->ne + 3);
-            // auto input_param = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, src_shape);
-            // m_params.push_back(input_param);
+            ov::Shape input_shape = { static_cast<size_t>(node->src[0]->ne[2]),
+                                      static_cast<size_t>(node->src[0]->ne[1]),
+                                      static_cast<size_t>(node->src[0]->ne[0])};
+            auto input_param = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, input_shape);
+            m_params.push_back(input_param);
             break;
         } else {
-            for (int64_t i1 = 0; i1 < node->ne[1]; ++i1) {        // ne[1] = 3072
-                for (int64_t i0 = 0; i0 < node->ne[0]; ++i0) {    // ne[0] = 7
-                    int64_t src_index = i0 * node->src[0]->nb[0] / sizeof(float) +  // stride in nb[0]
-                                        i1 * node->src[0]->nb[1] / sizeof(float);   // stride in nb[1]
-                    char *dst_ptr = static_cast<char*>(node->data) +
-                                    i0 * node->nb[0] + i1 * node->nb[1];
-                    *(ggml_fp16_t *)dst_ptr = GGML_FP32_TO_FP16(((float*)node->src[0]->data)[src_index]);
-                }
-            }
-            // inputs[node->src[0]->name] = node->src[0];
-            inputs[node_name] = node;
+            std::string src1_name = std::string(node->src[1]->name);
+            inputs[src0_name] = node->src[0];
+            inputs[src1_name] = node->src[1];
             outputs[node_name] = node;
-            m_input_names.push_back(node_name);
-            m_node_op_name[node_name] = ggml_op_name(node->op);
+            m_input_names.push_back(src0_name);
+            m_input_names.push_back(src1_name);
             m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op));
+            m_op_node_name.emplace_back(src1_name, ggml_op_name(node->op));
             m_output_names.push_back(node_name);
+
+            int src0_elem_size = ggml_type_size(node->src[0]->type);
+            int src1_elem_size = ggml_type_size(node->src[1]->type);
+
+            int src0_logical_rows = node->src[0]->ne[1];
+            int src1_logical_rows = node->src[1]->ne[1];
+
+            int src0_phys_cols = node->src[0]->nb[0] / src0_elem_size;
+            int src0_phys_rows = src0_logical_rows;
+
+            int src1_phys_cols = node->src[1]->nb[1] / src1_elem_size;
+            int src1_phys_rows = src1_logical_rows;
+            ov::Shape src0_phys_shape = {1, static_cast<size_t>(src0_phys_rows), static_cast<size_t>(src0_phys_cols) };
+            ov::Shape src1_phys_shape = {1, static_cast<size_t>(src1_phys_rows), static_cast<size_t>(src1_phys_cols) };
+            auto input0_param = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, src0_phys_shape);
+            auto input1_param = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, src1_phys_shape);
+            m_params.push_back(input0_param);
+            m_params.push_back(input1_param);
+
+            m_continuous = false;
             break;
@@ -144,7 +141,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map
-        m_node_op_name[node_name] = ggml_op_name(node->op);
         m_op_node_name.emplace_back(node_name, ggml_op_name(node->op));
         m_output_names.push_back(node_name);
         break;
@@ -155,7 +151,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map
         inputs[src0_name] = node->src[0];
         outputs[node_name] = node;
         m_input_names.push_back(src0_name);
-        m_node_op_name[src0_name] = ggml_op_name(node->op);
         m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op));
         m_output_names.push_back(node_name);
         break;
@@ -167,17 +162,13 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map
-        // std::string src1_name = std::string(node->src[1]->name) + "_" + std::to_string(node->src[1]->view_offs) + "_input_" + ggml_op_name(node->src[1]->op);
-        // std::string src1_name = std::string(node->src[1]->name) + "_" + std::to_string(node->src[1]->view_offs);
         std::string src1_name = std::string(node->src[1]->name);
         inputs[src0_name] = node->src[0];
         inputs[src1_name] = node->src[1];
         outputs[node_name] = node;
         m_input_names.push_back(src0_name);
-        m_node_op_name[src0_name] = ggml_op_name(node->op);
         m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op));
         m_input_names.push_back(src1_name);
-        m_node_op_name[src1_name] = ggml_op_name(node->op);
         m_op_node_name.emplace_back(src1_name, ggml_op_name(node->op));
         m_output_names.push_back(node_name);
         break;
@@ -193,15 +184,11 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map
         inputs[src0_name] = node->src[0];
         outputs[node_name] = node;
         m_input_names.push_back(src0_name);
-        m_node_op_name[src0_name] = ggml_op_name(node->op);
         m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op));
         m_output_names.push_back(node_name);
         if (node->src[1]) {
-            // std::string src1_name = std::string(node->src[1]->name) + "_" + std::to_string(node->src[1]->view_offs) + "_input_" + ggml_op_name(node->src[1]->op);
-            // std::string src1_name = std::string(node->src[1]->name) + "_" + std::to_string(node->src[1]->view_offs);
             std::string src1_name = std::string(node->src[1]->name);
             inputs[src1_name] = node->src[1];
-            m_node_op_name[src1_name] = ggml_op_name(node->op);
             m_op_node_name.emplace_back(src1_name, ggml_op_name(node->op));
             m_input_names.push_back(src1_name);
         }
@@ -210,26 +197,19 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map
-        // std::string src1_name = std::string(node->src[1]->name) + "_" + std::to_string(node->src[1]->view_offs) + "_input_" + ggml_op_name(node->src[1]->op);
-        // std::string src1_name = std::string(node->src[1]->name) + "_" + std::to_string(node->src[1]->view_offs);
         std::string src1_name = std::string(node->src[1]->name);
         inputs[src0_name] = node->src[0];
         inputs[src1_name] = node->src[1];
         m_input_names.push_back(src0_name);
-        m_node_op_name[src0_name] = ggml_op_name(node->op);
         m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op));
         m_input_names.push_back(src1_name);
-        m_node_op_name[src1_name] = ggml_op_name(node->op);
         m_op_node_name.emplace_back(src1_name, ggml_op_name(node->op));
         outputs[node_name] = node;
         m_output_names.push_back(node_name);
         if (node->src[2]) {
-            // std::string src2_name = std::string(node->src[2]->name) + "_" + std::to_string(node->src[2]->view_offs) + "_input_" + ggml_op_name(node->src[2]->op);
-            // std::string src2_name = std::string(node->src[2]->name) + "_" + std::to_string(node->src[2]->view_offs);
             std::string src2_name = std::string(node->src[2]->name);
             inputs[src2_name] = node->src[2];
             m_input_names.push_back(src2_name);
-            m_node_op_name[src2_name] = ggml_op_name(node->op);
             m_op_node_name.emplace_back(src2_name, ggml_op_name(node->op));
         }
         break;
@@ -423,12 +403,6 @@ std::vector<std::string> GgmlOvDecoder::get_input_names() const {
     return m_input_names;
 }
 
-const std::string& GgmlOvDecoder::get_node_op_name(const std::string& name) const {
-    auto it = m_node_op_name.find(name);
-    static const std::string empty_str;
-    return (it != m_node_op_name.end()) ? it->second : empty_str;
-}
-
 std::string& GgmlOvDecoder::get_op_node_name(const std::string& key_name, const int index) {
     if (index == -1) {
         for (size_t i = 0; i < m_op_node_name.size(); ++i) {
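Reviewer note (not part of the patch): the new CPY branch above registers Parameters whose innermost dimension is the physical (stride-padded) row length rather than ne[0]. A minimal helper sketch of that shape derivation follows, assuming the destination-style layout in which nb[1] carries the padding; the transposed source view in the patch divides nb[0] instead.

```cpp
// Illustrative helper only: derive the 3D "physical" shape handed to OpenVINO
// for a row-padded ggml tensor, i.e. the shape whose innermost dimension is the
// padded row length actually present in memory rather than the logical ne[0].
#include <openvino/openvino.hpp>
#include "ggml.h"

static ov::Shape physical_shape_from_row_stride(const ggml_tensor* t) {
    const size_t elem_size = ggml_type_size(t->type);
    // nb[1] is the byte stride between consecutive rows; dividing by the
    // element size gives the padded row length in elements.
    const size_t padded_row = t->nb[1] / elem_size;
    return ov::Shape{1, static_cast<size_t>(t->ne[1]), padded_row};
}
```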
diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h
index 238f1d79b4..fc1d878409 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.h
+++ b/ggml/src/ggml-openvino/ggml-decoder.h
@@ -71,7 +71,6 @@ public:
         return m_continuous;
     }
 
-    virtual const std::string& get_node_op_name(const std::string& name) const override;
     std::string& get_op_node_name(const std::string& key_name, const int index) override;
 
     virtual const std::vector<std::shared_ptr<ov::op::v0::Parameter>>& get_params() const override;
@@ -90,7 +89,6 @@ private:
     std::string m_op_name;
     mutable std::string m_name;
     bool m_continuous;
-    std::map<std::string, std::string> m_node_op_name;
     std::vector<std::shared_ptr<ov::op::v0::Parameter>> m_params;
     std::vector<std::pair<std::string, std::string>> m_op_node_name;
 };
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index c44aa2568b..a0adc917e7 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -22,24 +22,35 @@ std::vector> get_ggml_graph_input_tensors(std
     std::string op_node_name = ggml_decoder->get_op_node_name(name, op_iter++);
     // auto node_op_name = ggml_decoder->get_node_op_name(name);
     auto input_data = ggml_decoder->get_input_ggml_tensor(name)->data;
+    auto check_if_contiguous = ggml_is_contiguous(ggml_decoder->get_input_ggml_tensor(name));
 #ifdef GGML_OPENVINO_DEBUG
     printf("Subgraph input %d: %g\n", inp, *(double*)(input_data));
 #endif
     ov::Tensor input_tensor;
     auto input_shape = ggml_decoder->get_input_shape(name).to_shape();
-    // if (node_op_name == "CPY" && (input_shape[0] != 7)) {
-    //     input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), {80000}, input_data);
     if (flag & op_node_name == "CONT" && input_shape[0] == 1 && input_shape[1] != 1) {
         std::vector<size_t> input_stride = ggml_decoder->get_input_stride(name);
         ov::element::Type input_type = ggml_decoder->get_input_type(name);
         size_t element_size = input_type.size();
-        // const size_t valid_elems = static_cast<size_t>(ggml_decoder->get_input_shape(name).to_shape()[2]);
         const size_t num_rows = static_cast<size_t>(ggml_decoder->get_input_shape(name).to_shape()[1]);
         const size_t dim2 = static_cast<size_t>(ggml_decoder->get_input_shape(name).to_shape()[0]);
         size_t phys_stride = static_cast<size_t>(input_stride[1]) / element_size;
         ov::Shape input_shape = { dim2, num_rows, phys_stride };  // {1, 7, 9216}
         input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data);
+    } else if (op_node_name == "CPY" && (!check_if_contiguous || input_shape[2] == 1)) {  // [TODO]: temporary heuristic to decide whether the CPY input tensor of the Phi-3 model is contiguous
+        std::vector<size_t> input_stride = ggml_decoder->get_input_stride(name);
+        ov::element::Type input_type = ggml_decoder->get_input_type(name);
+        size_t element_size = input_type.size();
+        ov::Shape phys_shape;
+        static int iter = 0;
+        if (iter++ % 2 == 0) {
+            phys_shape = {1, input_shape[1], input_stride[2] / element_size};
+            input_tensor = ov::Tensor(ov::element::f32, phys_shape, input_data);
+        } else {
+            phys_shape = {1, input_shape[1], input_stride[1] / element_size};
+            input_tensor = ov::Tensor(ov::element::f16, phys_shape, input_data);
+        }
     } else {
         input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), ggml_decoder->get_input_shape(name).to_shape(), input_data);
     }
@@ -105,7 +116,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
     // Convert InputModel -> ov::Model
     std::shared_ptr<ov::Model> model = front_end->convert(input_model);
-    ov::save_model(model, "/home/user/zhan/merge_git_commits/llama.cpp-ov/001_model.xml");
+    // ov::save_model(model, "/home/user/zhan/merge_git_commits/llama.cpp-ov/001_model.xml");
 
     if (!model) {
         GGML_LOG_ERROR("Model is not converted \n");
@@ -117,7 +128,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
     // Loading a model to the device
     ov::CompiledModel compiled_model = core.compile_model(model);
-    ov::save_model(compiled_model.get_runtime_model(), "/home/user/zhan/merge_git_commits/llama.cpp-ov/001_compile_model.xml");
+    // ov::save_model(compiled_model.get_runtime_model(), "/home/user/zhan/merge_git_commits/llama.cpp-ov/001_compile_model.xml");
 
     // Create infer request
     ov::InferRequest infer_request = compiled_model.create_infer_request();
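Reviewer note (not part of the patch): the CPY handling in utils.cpp relies on wrapping the raw ggml buffers into ov::Tensor objects without copying, with shapes that describe the physical (stride-padded) layout. A short sketch of that wrapping, with made-up shapes; the pointers are only borrowed, so the ggml tensors must outlive infer().

```cpp
// Illustrative sketch: zero-copy binding of a strided f32 source view and an
// f16 destination buffer to an already-created infer request. The shapes below
// are hypothetical example values, not taken from this PR.
#include <openvino/openvino.hpp>

static void bind_cpy_inputs(ov::InferRequest& req, void* src_f32, void* dst_f16) {
    // Hypothetical physical layouts: 7 logical rows whose row stride spans
    // 9216 f32 elements, copied into 7 rows padded to 3072 f16 elements.
    ov::Tensor src_tensor(ov::element::f32, ov::Shape{1, 7, 9216}, src_f32);
    ov::Tensor dst_tensor(ov::element::f16, ov::Shape{1, 7, 3072}, dst_f16);

    req.set_input_tensor(0, src_tensor);  // Parameter 0: strided f32 source view
    req.set_input_tensor(1, dst_tensor);  // Parameter 1: current f16 destination contents
    // The model output is written back over the same destination buffer.
    req.set_output_tensor(0, ov::Tensor(ov::element::f16, ov::Shape{1, 7, 3072}, dst_f16));
}
```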