1. In the prompt-processing stage (which also predicts the first token), the PERMUTE node still needs to be integrated into the OV Frontend.

2. In the latest-token prediction stage, the VIEW, CONT, and RESHAPE nodes still need to be integrated into the OV Frontend.
2025-03-15 19:32:40 +08:00 · 2025-03-15 19:32:40 +08:00 · b02265a507
parent 19ec9b6bf5
commit b02265a507
3 changed files with 83 additions and 224 deletions
--- a/ggml/src/ggml-openvino.cpp
+++ b/ggml/src/ggml-openvino.cpp
@ -647,168 +647,6 @@ void ggml_backend_openvino_reshape(ggml_tensor *dst) {
 }

 void ggml_backend_openvino_view(ggml_tensor *dst) {
-
-    /*
-    // Case 1: Set the output tensor shape as the same shape of the input tensor [1, 7, 9216], for next CONT node operator
-    if (dst->ne[0] > dst->ne[1] && (dst->ne[0] * dst->nb[0] != dst->nb[1]) && dst->ne[2] == 1) {
-        // if (dst->view_offs == 0) {
-        //     return;
-        // }
-        ov::Core core;
-        ov::Shape input_shape{ static_cast<size_t>(dst->src[0]->ne[2]), static_cast<size_t>(dst->src[0]->ne[1]), static_cast<size_t>(dst->src[0]->ne[0])};
-        ov::Shape out_shape{ static_cast<size_t>(dst->ne[2]), static_cast<size_t>(dst->ne[1]), static_cast<size_t>(dst->ne[0])};
-
-        auto input_param = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, input_shape);
-
-        // auto new_shape_node = ov::op::v0::Constant::create(ov::element::i64,
-        //     ov::Shape{input_shape.size()},
-        //     std::vector<int64_t>(input_shape.begin(), input_shape.end()));
-        // auto res = std::make_shared<ov::op::v1::Reshape>(input_param, new_shape_node, false);
-
-        int64_t split_addr = dst->view_offs / dst->nb[0];
-        std::vector<int64_t> begin = { 0, 0, split_addr };
-        std::vector<int64_t> end   = { static_cast<int64_t>(dst->src[0]->ne[2]),
-                                        static_cast<int64_t>(dst->src[0]->ne[1]),
-                                        split_addr + static_cast<int64_t>(dst->ne[0]) };
-        std::vector<int64_t> strides = { 1, 1, 1 };
-
-        auto begin_const = ov::op::v0::Constant::create(ov::element::i64, { begin.size() }, begin);
-        auto end_const   = ov::op::v0::Constant::create(ov::element::i64, { end.size() }, end);
-        auto strides_const = ov::op::v0::Constant::create(ov::element::i64, { strides.size() }, strides);
-
-        std::vector<int64_t> begin_mask = {0, 0, 0};
-        std::vector<int64_t> end_mask   = {0, 0, 0};
-        auto slice = std::make_shared<ov::op::v1::StridedSlice>(
-            input_param, 
-            begin_const, 
-            end_const, 
-            strides_const, 
-            begin_mask, 
-            end_mask
-        );
-
-        auto model = std::make_shared<ov::Model>(ov::OutputVector{ slice },
-                                                 ov::ParameterVector{ input_param });
-
-        auto compiled_model = core.compile_model(model, "CPU");
-
-        ov::InferRequest infer_request = compiled_model.create_infer_request();
-
-        ov::Tensor input_tensor(ov::element::f32, input_shape, dst->src[0]->data);
-        infer_request.set_input_tensor(0, input_tensor);
-
-        ov::Tensor output_tensor(ov::element::f32, out_shape, dst->data);
-        infer_request.set_output_tensor(0, output_tensor);
-
-        infer_request.infer();
-    }
-    */
-
-
-    /*
-    // Case 2: Slice contiguous input tensor [98304, 1, 1] to contiguout output tensor [ 21504, 1, 1]
-    if (ggml_is_contiguous(dst) && dst->ne[1] == 1 && (dst->ne[0] * dst->nb[0] == dst->nb[1])) {
-        ov::Core core;
-        ov::Shape input_shape = { static_cast<size_t>(dst->src[0]->ne[2]),
-                                    static_cast<size_t>(dst->src[0]->ne[1]),
-                                    static_cast<size_t>(dst->src[0]->ne[0])};
-        ov::Shape output_shape = { static_cast<size_t>(dst->ne[2]),
-                                    static_cast<size_t>(dst->ne[1]),
-                                    static_cast<size_t>(dst->ne[0])};
-        auto input_param = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, input_shape);
-
-
-        std::vector<int64_t> begin = { 0, 0, 0 };
-        std::vector<int64_t> end   = { static_cast<int64_t>(dst->ne[2]),
-                                        static_cast<int64_t>(dst->ne[1]),
-                                        static_cast<int64_t>(dst->ne[0]) };
-        std::vector<int64_t> strides = { 1, 1, 1 };
-
-        auto begin_const = ov::op::v0::Constant::create(ov::element::i64, { begin.size() }, begin);
-        auto end_const   = ov::op::v0::Constant::create(ov::element::i64, { end.size() }, end);
-        auto strides_const = ov::op::v0::Constant::create(ov::element::i64, { strides.size() }, strides);
-
-        std::vector<int64_t> begin_mask = {0, 0, 0};
-        std::vector<int64_t> end_mask   = {0, 0, 0};
-        auto slice = std::make_shared<ov::op::v1::StridedSlice>(
-            input_param, 
-            begin_const, 
-            end_const, 
-            strides_const, 
-            begin_mask, 
-            end_mask
-        );
-
-        std::shared_ptr<ov::Model> model = std::make_shared<ov::Model>(ov::OutputVector{ slice },
-                                                 ov::ParameterVector{ input_param });
-
-        auto compiled_model = core.compile_model(model, "CPU");
-        ov::InferRequest infer_request = compiled_model.create_infer_request();
-
-        ov::Tensor input_tensor(ov::element::f16, input_shape, dst->src[0]->data);
-        ov::Tensor output_tensor(ov::element::f16, output_shape, dst->data);
-        infer_request.set_input_tensor(0, input_tensor);
-        infer_request.set_output_tensor(0, output_tensor);
-
-        infer_request.infer();
-    }
-    */
-
-    /*
-    // Case 3: Reshape the input tensor [1, 1, 98304] to output tensor [1, 3072, 32](Physical shape)
-    if (dst->ne[0] < dst->ne[1] && dst->ne[2] == 1) {
-        ov::Core core;
-        ov::Shape input_shape = { static_cast<size_t>(dst->src[0]->ne[2]),
-                                    static_cast<size_t>(dst->src[0]->ne[1]),
-                                    static_cast<size_t>(dst->src[0]->ne[0])};
-        ov::Shape output_shape = { static_cast<size_t>(dst->nb[2]),
-                                    static_cast<size_t>(dst->ne[1]),
-                                    static_cast<size_t>(dst->nb[1] / dst->nb[0])};
-        auto input_param = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, input_shape);
-    
-        auto new_shape_node = ov::op::v0::Constant::create(ov::element::i64,
-                                                        ov::Shape{output_shape.size()},
-                                                        std::vector<int64_t>(output_shape.begin(), output_shape.end()));
-        auto res = std::make_shared<ov::op::v1::Reshape>(input_param, new_shape_node, false);
-
-        std::shared_ptr<ov::Model> model = std::make_shared<ov::Model>(ov::OutputVector{res},
-                                                                        ov::ParameterVector{input_param});
-        auto compiled_model = core.compile_model(model, "CPU");
-        ov::InferRequest infer_request = compiled_model.create_infer_request();
-    
-        ov::Tensor input_tensor(ov::element::f16, input_shape, dst->src[0]->data);
-        ov::Tensor output_tensor(ov::element::f16, output_shape, dst->data);
-        infer_request.set_input_tensor(0, input_tensor);
-        infer_request.set_output_tensor(0, output_tensor);
-    
-        infer_request.infer();
-    }
-    */
-
-    /*
-    // Case 4:
-    if (dst->ne[0] != 1 && dst->ne[1] != 1 && dst->ne[2] !=1) {
-        
-    }
-    */
-
-    ov::Core core;
-    ov::Shape input_shape{static_cast<size_t>(dst->src[0]->ne[2]), static_cast<size_t>(dst->src[0]->ne[1]), static_cast<size_t>(dst->src[0]->ne[0])};
-    // ov::Shape output_shape{static_cast<size_t>(dst->ne[2]), static_cast<size_t>(dst->ne[1]), static_cast<size_t>(dst->ne[0])};
-    auto input_param = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, input_shape);
-
-    std::shared_ptr<ov::Model> model = std::make_shared<ov::Model>(ov::OutputVector{input_param},
-                                                                    ov::ParameterVector{input_param});
-    auto compiled_model = core.compile_model(model, "CPU");
-    ov::InferRequest infer_request = compiled_model.create_infer_request();
-
-    ov::Tensor input_tensor(ov::element::f32, input_shape, dst->src[0]->data);
-    // ov::Tensor output_tensor(ov::element::f32, input_shape, dst->data);
-    infer_request.set_input_tensor(0, input_tensor);
-    // infer_request.set_output_tensor(0, output_tensor);
-
-    infer_request.infer();
-
    GGML_UNUSED(dst);
 }

@ -823,7 +661,7 @@ void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) {
    const size_t element_size = ggml_type_size(src0->type);

    // Case 1: Both tensors are contiguous
-    if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) {
+    if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && (src0->ne[0] * element_size == src0->nb[1])) {
        ov::Shape input_shape = {
            static_cast<size_t>(src0->ne[2]),
            static_cast<size_t>(src0->ne[1]),
@ -1152,6 +990,7 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe
    std::vector<int> permute_indices;

    std::vector<int> mul_mat_indices;
+    std::vector<int> add_indices;

    for (int i = 0; i < cgraph->n_nodes; i++) {
        if (cgraph->nodes[i]->op == GGML_OP_CONT) {
@ -1168,6 +1007,8 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe
            permute_indices.push_back(i);
        } else if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT) {
            mul_mat_indices.push_back(i);
+        } else if (cgraph->nodes[i]->op == GGML_OP_ADD) {
+            add_indices.push_back(i);
        }
    }

@ -1177,48 +1018,49 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe
    bool prompt_process_flag = true;
    if (cgraph->nodes[0]->ne[1] == 1) {
        prompt_process_flag = false;
-    }
-    //     int end_node = cgraph->n_nodes - 1;
-    //     openvino_frontend_compute(backend, cgraph, 0, end_node, prompt_process_flag);
-    // } else {
-
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) {
-            ggml_backend_openvino_permute(cgraph->nodes[i]);
-        // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) {
-        //     ggml_backend_openvino_mul_mat(cgraph->nodes[i]);
-        // } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) {
-        //     ggml_backend_openvino_view(cgraph->nodes[i]);
-        // } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) {
-        //     ggml_backend_openvino_dup_bytes(cgraph->nodes[i]);
-        } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) {
-            ggml_backend_openvino_transpose(cgraph->nodes[i]);
-        // } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) {
-        //     ggml_backend_openvino_reshape(cgraph->nodes[i]);
-        } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) {
-            ggml_backend_openvino_cpy(cgraph->nodes[i]);
-        } else {
-            // Process a range of nodes with openvino_frontend_compute
-            int start_index = i;
-            while (i < cgraph->n_nodes
-                    && std::find(permute_indices.begin(), permute_indices.end(), i) == permute_indices.end()
-                    // && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end()
-                    // && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end()
-                    // && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end()
-                    // && std::find(reshape_indices.begin(), reshape_indices.end(), i) == reshape_indices.end()
-                    && std::find(transpose_indices.begin(), transpose_indices.end(), i) == transpose_indices.end()
-                    && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end()
-                    ) {
-                i++;
+        // int end_node = cgraph->n_nodes - 1;
+        // openvino_frontend_compute(backend, cgraph, 0, end_node, prompt_process_flag);
+        for (int i = 0; i < cgraph->n_nodes; i++) {
+            if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) {
+                ggml_backend_openvino_view(cgraph->nodes[i]);
+            } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) {
+                ggml_backend_openvino_dup_bytes(cgraph->nodes[i]);
+            } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) {
+                ggml_backend_openvino_reshape(cgraph->nodes[i]);
+            } else {
+                // Process a range of nodes with openvino_frontend_compute
+                int start_index = i;
+                while (i < cgraph->n_nodes
+                        && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end()
+                        && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end()
+                        && std::find(reshape_indices.begin(), reshape_indices.end(), i) == reshape_indices.end()
+                        ) {
+                    i++;
+                }
+                if (start_index < i) {
+                        openvino_frontend_compute(backend, cgraph, start_index, --i, prompt_process_flag);
+                }
            }
-            if (start_index < i) {
-                    openvino_frontend_compute(backend, cgraph, start_index, --i, prompt_process_flag);
+        }
+    } else {
+        for (int i = 0; i < cgraph->n_nodes; i++) {
+            if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) {
+                ggml_backend_openvino_permute(cgraph->nodes[i]);
+            } else {
+                // Process a range of nodes with openvino_frontend_compute
+                int start_index = i;
+                while (i < cgraph->n_nodes
+                        && std::find(permute_indices.begin(), permute_indices.end(), i) == permute_indices.end()
+                        ) {
+                    i++;
+                }
+                if (start_index < i) {
+                        openvino_frontend_compute(backend, cgraph, start_index, --i, prompt_process_flag);
+                }
            }
        }
    }

-    // }
-
    return GGML_STATUS_SUCCESS;

    GGML_UNUSED(backend);
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@ -26,7 +26,9 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map<std::string, gg
        }
        case GGML_OP_CONT:
        {
-            if (ggml_is_contiguous(node->src[0]) && ggml_is_contiguous(node)) {
+            if (ggml_is_contiguous(node->src[0])
+                && ggml_is_contiguous(node)
+                && (node->src[0]->ne[0] * node->src[0]->nb[0] == node->src[0]->nb[1])) {
                inputs[src0_name] = node->src[0];
                outputs[node_name] = node;
                m_input_names.push_back(src0_name);
@ -112,22 +114,31 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map<std::string, gg
                m_op_node_name.emplace_back(src1_name, ggml_op_name(node->op));
                m_output_names.push_back(node_name);

-                int src0_elem_size = ggml_type_size(node->src[0]->type);
-                int src1_elem_size = ggml_type_size(node->src[1]->type);
+                // int src0_elem_size = ggml_type_size(node->src[0]->type);
+                // int src1_elem_size = ggml_type_size(node->src[1]->type);

-                int src0_logical_rows = node->src[0]->ne[1];
-                int src1_logical_rows = node->src[1]->ne[1];
+                // int src0_logical_rows = node->src[0]->ne[1];
+                // int src1_logical_rows = node->src[1]->ne[1];

-                int src0_phys_cols = node->src[0]->nb[0] / src0_elem_size;
-                int src0_phys_rows = src0_logical_rows;
+                // int src0_phys_cols = node->src[0]->nb[0] / src0_elem_size;
+                // int src0_phys_rows = src0_logical_rows;

-                int src1_phys_cols = node->src[1]->nb[1] / src1_elem_size;
-                int src1_phys_rows = src1_logical_rows;
-                ov::Shape src0_phys_shape = {1, static_cast<size_t>(src0_phys_rows), static_cast<size_t>(src0_phys_cols) };
-                ov::Shape src1_phys_shape = {1, static_cast<size_t>(src1_phys_rows), static_cast<size_t>(src1_phys_cols) };
-                auto input0_param = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, src0_phys_shape);
-                auto input1_param = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, src1_phys_shape);
+                // int src1_phys_cols = node->src[1]->nb[1] / src1_elem_size;
+                // int src1_phys_rows = src1_logical_rows;
+                // ov::Shape src0_phys_shape = {1, static_cast<size_t>(src0_phys_rows), static_cast<size_t>(src0_phys_cols) };
+                // ov::Shape src1_phys_shape = {1, static_cast<size_t>(src1_phys_rows), static_cast<size_t>(src1_phys_cols) };
+                // auto input0_param = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, src0_phys_shape);
+                // auto input1_param = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, src1_phys_shape);
+                // m_params.push_back(input0_param);
+                // m_params.push_back(input1_param);
+
+                ov::Shape input0_shape = { static_cast<size_t>(node->src[0]->ne[2]),
+                    static_cast<size_t>(node->src[0]->ne[1]),
+                    static_cast<size_t>(node->src[0]->ne[0])};
+                auto input0_param = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, input0_shape);
                m_params.push_back(input0_param);
+                ov::Shape input1_shape = { 1, 1, static_cast<size_t>(node->src[1]->nb[2] / node->src[1]->nb[0])};
+                auto input1_param = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, input1_shape);
                m_params.push_back(input1_param);

                m_continuous = false;
@ -147,7 +158,8 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map<std::string, gg
            // ov::Shape input_shape = { static_cast<size_t>(node->src[0]->ne[2]),
            //                             static_cast<size_t>(node->src[0]->ne[1]),
            //                             static_cast<size_t>(node->src[0]->ne[0])};
-            // auto input_param = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, input_shape);
+            // auto type = get_input_type(src0_name);
+            // auto input_param = std::make_shared<ov::op::v0::Parameter>(type, input_shape);
            // m_params.push_back(input_param);

            // if (node->ne[0] > node->ne[1] && (node->ne[0] * node->nb[0] != node->nb[1]) && node->ne[2] == 1) {
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@ -27,12 +27,12 @@ std::vector<std::pair<std::string, ov::Tensor>> get_ggml_graph_input_tensors(std
            printf("Subgraph input %d: %g\n", inp, *(double*)(input_data));
        #endif
        ov::Tensor input_tensor;
-        auto input_shape = ggml_decoder->get_input_shape(name).to_shape();
+        ov::Shape input_shape = ggml_decoder->get_input_shape(name).to_shape();

-        if (flag & op_node_name == "CONT" && input_shape[0] == 1 && input_shape[1] != 1) {
-            std::vector<size_t> input_stride = ggml_decoder->get_input_stride(name);
-            ov::element::Type input_type = ggml_decoder->get_input_type(name);
-            size_t element_size = input_type.size();
+        ov::element::Type input_type = ggml_decoder->get_input_type(name);
+        size_t element_size = input_type.size();
+        std::vector<size_t> input_stride = ggml_decoder->get_input_stride(name);
+        if (op_node_name == "CONT" && input_shape[0] == 1 && (input_shape[1] != 1 && flag || input_shape[2]*element_size!=input_stride[1])) {
            const size_t num_rows    = static_cast<size_t>(ggml_decoder->get_input_shape(name).to_shape()[1]);
            const size_t dim2        = static_cast<size_t>(ggml_decoder->get_input_shape(name).to_shape()[0]);
            size_t phys_stride = static_cast<size_t>(input_stride[1]) / element_size;
@ -42,14 +42,14 @@ std::vector<std::pair<std::string, ov::Tensor>> get_ggml_graph_input_tensors(std
            std::vector<size_t> input_stride = ggml_decoder->get_input_stride(name);
            ov::element::Type input_type = ggml_decoder->get_input_type(name);
            size_t element_size = input_type.size();
-            ov::Shape phys_shape;
+            // ov::Shape phys_shape;
            static int iter = 0;
            if (iter++ % 2 == 0) {
-                phys_shape = {1, input_shape[1], input_stride[2] / element_size};
-                input_tensor = ov::Tensor(ov::element::f32, phys_shape, input_data);
+                // phys_shape = {1, input_shape[1], input_stride[2] / element_size};
+                input_tensor = ov::Tensor(ov::element::f32, input_shape, input_data);
            } else {
-                phys_shape = {1, input_shape[1], input_stride[1] / element_size};
-                input_tensor = ov::Tensor(ov::element::f16, phys_shape, input_data);
+                ov::Shape flat_shape = {1, 1, input_stride[0] / element_size};
+                input_tensor = ov::Tensor(ov::element::f16, flat_shape, input_data);
            }
        } else {
            input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), ggml_decoder->get_input_shape(name).to_shape(), input_data);
@ -161,6 +161,11 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
        auto output_tensor = infer_request.get_output_tensor(i);
        // output_tensor.get_shape();
        std::memcpy(output_tensors[output_names[i]], output_tensor.data(), output_tensor.get_byte_size());
+        // std::cout << std::left  << "[ " << std::setw(2) << i << " ]: "
+        //             << "output_names: " << std::setw(20) << output_names[i]
+        //             << " output data: " << std::setw(15) << ((float*)output_tensor.data())[0]
+        //             << std::setw(15) << ((float*)output_tensor.data())[1] << std::right
+        //             << std::endl;
        #ifdef GGML_OPENVINO_DEBUG
            printf("Output %s after: %g\n", output_names[i].c_str(), *(double*)(output_tensor.data()));
        #endif