Change the input and output node shapes of the MUL_MAT operator

This commit is contained in:
zhanmyz 2025-03-06 01:38:01 +08:00 committed by Mustafa Cavus
parent f98d215162
commit f37fa21a5c
1 changed files with 111 additions and 90 deletions

View File

@ -458,68 +458,72 @@ void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) {
const ggml_tensor * src1 = dst->src[1]; // src1 type F32
if(!ggml_is_contiguous(src1) || dst->src[1]->ne[0] * dst->src[1]->nb[0] != dst->src[1]->nb[1]) {
int valid_cols_src0 = dst->src[0]->ne[0];
int num_rows_src0 = dst->src[0]->ne[1];
int batch_src0 = dst->src[0]->ne[2];
int valid_cols_src1 = dst->src[1]->ne[0];
int num_rows_src1 = dst->src[1]->ne[1];
int batch_src1 = dst->src[1]->ne[2];
int row_stride_src0 = dst->src[0]->nb[1] / dst->src[0]->nb[0];
int batch_stride_src0 = dst->src[0]->nb[2] / dst->src[0]->nb[0];
int valid_cols_src0 = src0->ne[0]; // 96
int num_rows_src0 = src0->ne[1]; // 32
int batch_src0 = src0->ne[2]; // 32
int row_stride_src1 = dst->src[1]->nb[1] / dst->src[1]->nb[0];
int batch_stride_src1 = dst->src[1]->nb[2] / dst->src[1]->nb[0];
int valid_cols_src1 = src1->ne[0]; // 96
int num_rows_src1 = src1->ne[1]; // 7
int batch_src1 = src1->ne[2]; // 32
// For src0: row_stride = nb[1] / nb[0]
int row_stride_src0 = src0->nb[1] / src0->nb[0]; // 6144 / 2 = 3072
int batch_stride_src0 = src0->nb[2] / src0->nb[0]; // 192 / 2 = 96
// For src1: row_stride = nb[1] / nb[0]
int row_stride_src1 = src1->nb[1] / src1->nb[0]; // 12288 / 4 = 3072
int batch_stride_src1 = src1->nb[2] / src1->nb[0]; // 384 / 4 = 96
std::vector<int64_t> indices_src0 = build_indices(valid_cols_src0, num_rows_src0, batch_src0, row_stride_src0, batch_stride_src0);
std::vector<int64_t> indices_src1 = build_indices(valid_cols_src1, num_rows_src1, batch_src1, row_stride_src1, batch_stride_src1);
// Total number of elements
size_t total_src0 = indices_src0.size(); // = 96 * 32 * 32
size_t total_src1 = indices_src1.size(); // = 96 * 7 * 32
// Treat src0->data and src1->data as 1D tensors
// Note: The total length of physical data should be enough to cover the last valid element index + 1.
// flat shapes:
ov::Shape orig_shape_src0 = { static_cast<size_t>(src0->ne[0]),
static_cast<size_t>(src0->ne[1]),
static_cast<size_t>(src0->ne[2]),
static_cast<size_t>(src0->ne[3]) };
ov::Shape orig_shape_src1 = { static_cast<size_t>(src1->ne[0]),
static_cast<size_t>(src1->ne[1]),
static_cast<size_t>(src1->ne[2]),
static_cast<size_t>(src1->ne[3]) };
auto param_src0 = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, orig_shape_src0);
auto param_src1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, orig_shape_src1);
ov::Shape flat_shape_src0 = { total_src0 };
ov::Shape flat_shape_src1 = { total_src1 };
// Same as above
// ov::Shape flat_shape_src0 = { ggml_nelements(src0) };
// ov::Shape flat_shape_src1 = { ggml_nelements(src1) };
// Create a Parameter node for collecting non-continuous data
auto param_src0 = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, flat_shape_src0);
auto param_src1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, flat_shape_src1);
auto flatten_src0 = std::make_shared<ov::op::v1::Reshape>(
param_src0,
ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{ static_cast<int64_t>(total_src0) }),
false);
auto flatten_src1 = std::make_shared<ov::op::v1::Reshape>(
param_src1,
ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{ static_cast<int64_t>(total_src1) }),
false);
// Create an index Constant node
auto indices_const_src0 = ov::op::v0::Constant::create(ov::element::i64, flat_shape_src0, indices_src0);
auto indices_const_src1 = ov::op::v0::Constant::create(ov::element::i64, flat_shape_src1, indices_src1);
// Use the Gather operator to collect valid data
// axis = 0
auto axis_const = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
auto gathered_src0 = std::make_shared<ov::op::v8::Gather>(param_src0, indices_const_src0, axis_const);
auto gathered_src1 = std::make_shared<ov::op::v8::Gather>(param_src1, indices_const_src1, axis_const);
// Reshape to batched form:
// For src0: valid matrix size for each batch [num_rows_src0, valid_cols_src0] = [32,96], total batches = 32,
// Therefore, reshape to 3D Tensor: shape = [32, 32, 96] where first dimension is batch.
auto gathered_src0 = std::make_shared<ov::op::v8::Gather>(flatten_src0, indices_const_src0, axis_const);
auto gathered_src1 = std::make_shared<ov::op::v8::Gather>(flatten_src1, indices_const_src1, axis_const);
std::vector<int64_t> shape_src0_cont = { batch_src0, num_rows_src0, valid_cols_src0 };
auto reshape_src0 = std::make_shared<ov::op::v1::Reshape>(
gathered_src0,
ov::op::v0::Constant::create(ov::element::i64, { shape_src0_cont.size() }, shape_src0_cont),
false);
// For src1: valid matrix size for each batch [num_rows_src1, valid_cols_src1] = [7,96], batch = 32,
// Reshape to 3D Tensor: shape = [32, 7, 96].
std::vector<int64_t> shape_src1_cont = { batch_src1, num_rows_src1, valid_cols_src1 };
auto reshape_src1 = std::make_shared<ov::op::v1::Reshape>(
gathered_src1,
ov::op::v0::Constant::create(ov::element::i64, { shape_src1_cont.size() }, shape_src1_cont),
false);
// For src0, first Convert from F16 to F32
auto src0_f32 = std::make_shared<ov::op::v0::Convert>(reshape_src0, ov::element::f32);
// Use Batched Transpose: swap the last two dimensions, dimension order [0, 2, 1]
auto transpose_order = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{0, 2, 1});
auto src0_transposed = std::make_shared<ov::op::v1::Transpose>(src0_f32, transpose_order);
@ -527,89 +531,105 @@ void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) {
auto B = reshape_src1;
auto batched_matmul = std::make_shared<ov::op::v0::MatMul>(B, A, false, false);
// batched_matmul output: shape = [32,7,32]
auto model = std::make_shared<ov::Model>(ov::NodeVector{ batched_matmul },
ov::ParameterVector{ param_src0, param_src1 });
auto model = std::make_shared<ov::Model>(ov::NodeVector{ batched_matmul }, ov::ParameterVector{param_src0, param_src1});
ov::Tensor tensor_src0{ ov::element::f16, orig_shape_src0, src0->data };
ov::Tensor tensor_src1{ ov::element::f32, orig_shape_src1, src1->data };
ov::Shape output_shape = { static_cast<size_t>(dst->ne[0]),
static_cast<size_t>(dst->ne[1]),
static_cast<size_t>(dst->ne[2]) };
ov::Tensor tensor_dst(ov::element::f32, output_shape, dst->data);
ov::Core core;
auto compiled_model = core.compile_model(model, "CPU");
auto infer_request = compiled_model.create_infer_request();
// Construct input Tensors: treat src0->data and src1->data as 1D flat data respectively
ov::Tensor tensor_src0(ov::element::f16, flat_shape_src0, src0->data);
ov::Tensor tensor_src1(ov::element::f32, flat_shape_src1, src1->data);
infer_request.set_input_tensor(0, tensor_src0);
infer_request.set_input_tensor(1, tensor_src1);
ov::Tensor tensor_dst(ov::element::f32, { dst->ne[0], dst->ne[1], dst->ne[2]}, dst->data);
infer_request.set_output_tensor(0, tensor_dst);
infer_request.infer();
return ;
}
// Valid shape
int rank = 0;
if (dst->ne[2] == 1 && dst->ne[3] == 1) {
rank = 2;
} else if (dst->ne[3] == 1) {
rank = 3;
} else {
throw std::runtime_error("Only rank 2 or rank 3 are supported in this implementation.");
}
std::vector<int64_t> eff_shape_src0 = get_effective_shape(src0);
std::vector<int64_t> eff_shape_src1 = get_effective_shape(src1);
std::vector<int64_t> eff_shape_dst = get_effective_shape(dst);
// Determine whether it is batched (effective rank==3) or two-dimensional (rank==2) or one-dimensional (rank==1)
int rank = static_cast<int>(eff_shape_dst.size());
if (rank != 1 && rank != 2 && rank != 3)
throw std::runtime_error("Only rank 1, 2 or 3 supported");
ov::Shape orig_shape_src0 = { static_cast<size_t>(src0->ne[0]),
static_cast<size_t>(src0->ne[1]),
static_cast<size_t>(src0->ne[2]),
static_cast<size_t>(src0->ne[3]) };
ov::Shape orig_shape_src1 = { static_cast<size_t>(src1->ne[0]),
static_cast<size_t>(src1->ne[1]),
static_cast<size_t>(src1->ne[2]),
static_cast<size_t>(src1->ne[3]) };
// Total number of flattened elements
size_t total_src0 = 1; for (auto d : eff_shape_src0) total_src0 *= d;
size_t total_src1 = 1; for (auto d : eff_shape_src1) total_src1 *= d;
ov::Shape flat_shape_src0 = { total_src0 };
ov::Shape flat_shape_src1 = { total_src1 };
// Same as above
// ov::Shape flat_shape_src0 = { ggml_nelements(src0) };
// ov::Shape flat_shape_src1 = { ggml_nelements(src1) };
auto param_flat_src0 = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, flat_shape_src0);
auto param_flat_src1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, flat_shape_src1);
auto param_src0 = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, orig_shape_src0);
auto param_src1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, orig_shape_src1);
auto reshape_src0 = std::make_shared<ov::op::v1::Reshape>(
param_flat_src0,
param_src0,
ov::op::v0::Constant::create(ov::element::i64, { eff_shape_src0.size() }, eff_shape_src0),
false);
auto reshape_src1 = std::make_shared<ov::op::v1::Reshape>(
param_flat_src1,
param_src1,
ov::op::v0::Constant::create(ov::element::i64, { eff_shape_src1.size() }, eff_shape_src1),
false);
// Convert src0: F16 -> F32
auto src0_f32 = std::make_shared<ov::op::v0::Convert>(reshape_src0, ov::element::f32);
// Transpose src0_f32:
// For the 2D case, the shape of reshape_src0 is [3072,9216], and after transposition, it is [9216,3072].
// For the batched case, assuming the shape is [M, K, Batch], batch-wise transposition is required: use order [0, 2, 1].
ov::Output<ov::Node> A_for_mul;
if (rank == 1) {
auto trans_order = ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector<int64_t>{1, 0});
if (rank == 2) {
auto trans_order = ov::op::v0::Constant::create(ov::element::i64, { 2 }, std::vector<int64_t>{1, 0});
A_for_mul = std::make_shared<ov::op::v1::Transpose>(src0_f32, trans_order);
} else if (rank == 2) {
auto trans_order = ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector<int64_t>{1, 0});
A_for_mul = std::make_shared<ov::op::v1::Transpose>(src0_f32, trans_order);
} else { // rank == 3
auto trans_order = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{0, 2, 1});
} else if (rank == 3) {
auto trans_order = ov::op::v0::Constant::create(ov::element::i64, { 3 }, std::vector<int64_t>{0, 2, 1});
A_for_mul = std::make_shared<ov::op::v1::Transpose>(src0_f32, trans_order);
} else {
A_for_mul = src0_f32;
}
auto matmul = std::make_shared<ov::op::v0::MatMul>(reshape_src1, A_for_mul, false, false);
auto matmul_output_shape = matmul->get_output_shape(0);
std::vector<int64_t> final_output_shape;
if (matmul_output_shape.size() == 1) {
final_output_shape = { 1, 1, static_cast<int64_t>(matmul_output_shape[0]) };
} else if (matmul_output_shape.size() == 2) {
final_output_shape = { 1, static_cast<int64_t>(matmul_output_shape[0]), static_cast<int64_t>(matmul_output_shape[1]) };
} else {
final_output_shape = { static_cast<int64_t>(matmul_output_shape[0]), static_cast<int64_t>(matmul_output_shape[1]), static_cast<int64_t>(matmul_output_shape[2]) };
}
auto reshape_output = std::make_shared<ov::op::v1::Reshape>(
matmul,
ov::op::v0::Constant::create(ov::element::i64, {3}, final_output_shape),
false
);
auto model = std::make_shared<ov::Model>(ov::NodeVector{ reshape_output },
ov::ParameterVector{ param_src0, param_src1 });
ov::Tensor tensor_src0{ ov::element::f16, orig_shape_src0, (void *)src0->data };
ov::Tensor tensor_src1{ ov::element::f32, orig_shape_src1, (void *)src1->data };
ov::Shape output_shape = { static_cast<size_t>(dst->ne[2]),
static_cast<size_t>(dst->ne[1]),
static_cast<size_t>(dst->ne[0]) };
ov::Tensor tensor_dst(ov::element::f32, output_shape, dst->data);
ov::Core core;
ov::Tensor tensor_src0{ov::element::f16, flat_shape_src0, (void *)src0->data};
ov::Tensor tensor_src1{ov::element::f32, flat_shape_src1, (void *)src1->data};
ov::Tensor tensor_dst(ov::element::f32, ov::Shape(eff_shape_dst.begin(), eff_shape_dst.end()), dst->data);
std::shared_ptr<ov::op::v0::MatMul> matmul = std::make_shared<ov::op::v0::MatMul>(reshape_src1, A_for_mul, false, false);
auto model = std::make_shared<ov::Model>(ov::NodeVector{matmul}, ov::ParameterVector{param_flat_src0, param_flat_src1});
// ov::save_model(model, "/home/user/zhan/merge_git_commits/llama.cpp-ov/002_backend_mulmat_model.xml");
auto compiled_model = core.compile_model(model, "CPU");
auto infer_request = compiled_model.create_infer_request();
infer_request.set_input_tensor(0, tensor_src0);
infer_request.set_input_tensor(1, tensor_src1);
infer_request.set_output_tensor(0, tensor_dst);
@ -980,22 +1000,22 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe
ggml_backend_openvino_dup_bytes(cgraph->nodes[i]);
} else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) {
ggml_backend_openvino_view(cgraph->nodes[i]);
// } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) {
// ggml_backend_openvino_cpy(cgraph->nodes[i]);
// } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) {
// ggml_backend_openvino_transpose(cgraph->nodes[i]);
} else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) {
ggml_backend_openvino_cpy(cgraph->nodes[i]);
} else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) {
ggml_backend_openvino_transpose(cgraph->nodes[i]);
} else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) {
ggml_backend_openvino_reshape(cgraph->nodes[i]);
// } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) {
// ggml_backend_openvino_mul_mat(cgraph->nodes[i]);
} else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) {
ggml_backend_openvino_mul_mat(cgraph->nodes[i]);
} else {
// Process a range of nodes with openvino_frontend_compute
int start_index = i;
while (i < cgraph->n_nodes
&& std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end()
// && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end()
&& std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end()
&& std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end()
// && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end()
&& std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end()
) {
i++;
}
@ -1228,6 +1248,7 @@ static const std::set<std::string>& openvino_ops = []() -> const std::set<std::s
case GGML_OP_ADD:
return true;
case GGML_OP_MUL:
return false;
case GGML_OP_MUL_MAT:
return false;
case GGML_OP_UNARY: