Change the input and output node shapes of the MUL_MAT operator

This commit is contained in:
zhanmyz 2025-03-06 01:38:01 +08:00 committed by Mustafa Cavus
parent f98d215162
commit f37fa21a5c
1 changed files with 111 additions and 90 deletions

View File

@ -458,68 +458,72 @@ void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) {
const ggml_tensor * src1 = dst->src[1]; // src1 type F32
if(!ggml_is_contiguous(src1) || dst->src[1]->ne[0] * dst->src[1]->nb[0] != dst->src[1]->nb[1]) {
int valid_cols_src0 = dst->src[0]->ne[0];
int num_rows_src0 = dst->src[0]->ne[1];
int batch_src0 = dst->src[0]->ne[2];
int valid_cols_src1 = dst->src[1]->ne[0];
int num_rows_src1 = dst->src[1]->ne[1];
int batch_src1 = dst->src[1]->ne[2];
int row_stride_src0 = dst->src[0]->nb[1] / dst->src[0]->nb[0];
int batch_stride_src0 = dst->src[0]->nb[2] / dst->src[0]->nb[0];
int valid_cols_src0 = src0->ne[0]; // 96
int num_rows_src0 = src0->ne[1]; // 32
int batch_src0 = src0->ne[2]; // 32
int row_stride_src1 = dst->src[1]->nb[1] / dst->src[1]->nb[0];
int batch_stride_src1 = dst->src[1]->nb[2] / dst->src[1]->nb[0];
int valid_cols_src1 = src1->ne[0]; // 96
int num_rows_src1 = src1->ne[1]; // 7
int batch_src1 = src1->ne[2]; // 32
// For src0: row_stride = nb[1] / nb[0]
int row_stride_src0 = src0->nb[1] / src0->nb[0]; // 6144 / 2 = 3072
int batch_stride_src0 = src0->nb[2] / src0->nb[0]; // 192 / 2 = 96
// For src1: row_stride = nb[1] / nb[0]
int row_stride_src1 = src1->nb[1] / src1->nb[0]; // 12288 / 4 = 3072
int batch_stride_src1 = src1->nb[2] / src1->nb[0]; // 384 / 4 = 96
std::vector<int64_t> indices_src0 = build_indices(valid_cols_src0, num_rows_src0, batch_src0, row_stride_src0, batch_stride_src0);
std::vector<int64_t> indices_src1 = build_indices(valid_cols_src1, num_rows_src1, batch_src1, row_stride_src1, batch_stride_src1);
// Total number of elements
size_t total_src0 = indices_src0.size(); // = 96 * 32 * 32
size_t total_src1 = indices_src1.size(); // = 96 * 7 * 32
// Treat src0->data and src1->data as 1D tensors
// Note: The total length of physical data should be enough to cover the last valid element index + 1.
// flat shapes:
ov::Shape orig_shape_src0 = { static_cast<size_t>(src0->ne[0]),
static_cast<size_t>(src0->ne[1]),
static_cast<size_t>(src0->ne[2]),
static_cast<size_t>(src0->ne[3]) };
ov::Shape orig_shape_src1 = { static_cast<size_t>(src1->ne[0]),
static_cast<size_t>(src1->ne[1]),
static_cast<size_t>(src1->ne[2]),
static_cast<size_t>(src1->ne[3]) };
auto param_src0 = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, orig_shape_src0);
auto param_src1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, orig_shape_src1);
ov::Shape flat_shape_src0 = { total_src0 };
ov::Shape flat_shape_src1 = { total_src1 };
// Same as above
// ov::Shape flat_shape_src0 = { ggml_nelements(src0) };
// ov::Shape flat_shape_src1 = { ggml_nelements(src1) };
// Create a Parameter node for collecting non-continuous data
auto param_src0 = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, flat_shape_src0);
auto param_src1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, flat_shape_src1);
auto flatten_src0 = std::make_shared<ov::op::v1::Reshape>(
param_src0,
ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{ static_cast<int64_t>(total_src0) }),
false);
auto flatten_src1 = std::make_shared<ov::op::v1::Reshape>(
param_src1,
ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{ static_cast<int64_t>(total_src1) }),
false);
// Create an index Constant node
auto indices_const_src0 = ov::op::v0::Constant::create(ov::element::i64, flat_shape_src0, indices_src0);
auto indices_const_src1 = ov::op::v0::Constant::create(ov::element::i64, flat_shape_src1, indices_src1);
// Use the Gather operator to collect valid data
// axis = 0
auto axis_const = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
auto gathered_src0 = std::make_shared<ov::op::v8::Gather>(param_src0, indices_const_src0, axis_const);
auto gathered_src1 = std::make_shared<ov::op::v8::Gather>(param_src1, indices_const_src1, axis_const);
// Reshape to batched form:
// For src0: valid matrix size for each batch [num_rows_src0, valid_cols_src0] = [32,96], total batches = 32,
// Therefore, reshape to 3D Tensor: shape = [32, 32, 96] where first dimension is batch.
auto gathered_src0 = std::make_shared<ov::op::v8::Gather>(flatten_src0, indices_const_src0, axis_const);
auto gathered_src1 = std::make_shared<ov::op::v8::Gather>(flatten_src1, indices_const_src1, axis_const);
std::vector<int64_t> shape_src0_cont = { batch_src0, num_rows_src0, valid_cols_src0 };
auto reshape_src0 = std::make_shared<ov::op::v1::Reshape>(
gathered_src0,
ov::op::v0::Constant::create(ov::element::i64, { shape_src0_cont.size() }, shape_src0_cont),
false);
// For src1: valid matrix size for each batch [num_rows_src1, valid_cols_src1] = [7,96], batch = 32,
// Reshape to 3D Tensor: shape = [32, 7, 96].
std::vector<int64_t> shape_src1_cont = { batch_src1, num_rows_src1, valid_cols_src1 };
auto reshape_src1 = std::make_shared<ov::op::v1::Reshape>(
gathered_src1,
ov::op::v0::Constant::create(ov::element::i64, { shape_src1_cont.size() }, shape_src1_cont),
false);
// For src0, first Convert from F16 to F32
auto src0_f32 = std::make_shared<ov::op::v0::Convert>(reshape_src0, ov::element::f32);
// Use Batched Transpose: swap the last two dimensions, dimension order [0, 2, 1]
auto transpose_order = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{0, 2, 1});
auto src0_transposed = std::make_shared<ov::op::v1::Transpose>(src0_f32, transpose_order);
@ -527,89 +531,105 @@ void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) {
auto B = reshape_src1;
auto batched_matmul = std::make_shared<ov::op::v0::MatMul>(B, A, false, false);
// batched_matmul output: shape = [32,7,32]
auto model = std::make_shared<ov::Model>(ov::NodeVector{ batched_matmul },
ov::ParameterVector{ param_src0, param_src1 });
auto model = std::make_shared<ov::Model>(ov::NodeVector{ batched_matmul }, ov::ParameterVector{param_src0, param_src1});
ov::Tensor tensor_src0{ ov::element::f16, orig_shape_src0, src0->data };
ov::Tensor tensor_src1{ ov::element::f32, orig_shape_src1, src1->data };
ov::Shape output_shape = { static_cast<size_t>(dst->ne[0]),
static_cast<size_t>(dst->ne[1]),
static_cast<size_t>(dst->ne[2]) };
ov::Tensor tensor_dst(ov::element::f32, output_shape, dst->data);
ov::Core core;
auto compiled_model = core.compile_model(model, "CPU");
auto infer_request = compiled_model.create_infer_request();
// Construct input Tensors: treat src0->data and src1->data as 1D flat data respectively
ov::Tensor tensor_src0(ov::element::f16, flat_shape_src0, src0->data);
ov::Tensor tensor_src1(ov::element::f32, flat_shape_src1, src1->data);
infer_request.set_input_tensor(0, tensor_src0);
infer_request.set_input_tensor(1, tensor_src1);
ov::Tensor tensor_dst(ov::element::f32, { dst->ne[0], dst->ne[1], dst->ne[2]}, dst->data);
infer_request.set_output_tensor(0, tensor_dst);
infer_request.infer();
return ;
}
// Valid shape
int rank = 0;
if (dst->ne[2] == 1 && dst->ne[3] == 1) {
rank = 2;
} else if (dst->ne[3] == 1) {
rank = 3;
} else {
throw std::runtime_error("Only rank 2 or rank 3 are supported in this implementation.");
}
std::vector<int64_t> eff_shape_src0 = get_effective_shape(src0);
std::vector<int64_t> eff_shape_src1 = get_effective_shape(src1);
std::vector<int64_t> eff_shape_dst = get_effective_shape(dst);
// Determine whether it is batched (effective rank==3) or two-dimensional (rank==2) or one-dimensional (rank==1)
int rank = static_cast<int>(eff_shape_dst.size());
if (rank != 1 && rank != 2 && rank != 3)
throw std::runtime_error("Only rank 1, 2 or 3 supported");
ov::Shape orig_shape_src0 = { static_cast<size_t>(src0->ne[0]),
static_cast<size_t>(src0->ne[1]),
static_cast<size_t>(src0->ne[2]),
static_cast<size_t>(src0->ne[3]) };
ov::Shape orig_shape_src1 = { static_cast<size_t>(src1->ne[0]),
static_cast<size_t>(src1->ne[1]),
static_cast<size_t>(src1->ne[2]),
static_cast<size_t>(src1->ne[3]) };
// Total number of flattened elements
size_t total_src0 = 1; for (auto d : eff_shape_src0) total_src0 *= d;
size_t total_src1 = 1; for (auto d : eff_shape_src1) total_src1 *= d;
ov::Shape flat_shape_src0 = { total_src0 };
ov::Shape flat_shape_src1 = { total_src1 };
// Same as above
// ov::Shape flat_shape_src0 = { ggml_nelements(src0) };
// ov::Shape flat_shape_src1 = { ggml_nelements(src1) };
auto param_flat_src0 = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, flat_shape_src0);
auto param_flat_src1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, flat_shape_src1);
auto param_src0 = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, orig_shape_src0);
auto param_src1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, orig_shape_src1);
auto reshape_src0 = std::make_shared<ov::op::v1::Reshape>(
param_flat_src0,
param_src0,
ov::op::v0::Constant::create(ov::element::i64, { eff_shape_src0.size() }, eff_shape_src0),
false);
auto reshape_src1 = std::make_shared<ov::op::v1::Reshape>(
param_flat_src1,
param_src1,
ov::op::v0::Constant::create(ov::element::i64, { eff_shape_src1.size() }, eff_shape_src1),
false);
// Convert src0: F16 -> F32
auto src0_f32 = std::make_shared<ov::op::v0::Convert>(reshape_src0, ov::element::f32);
// Transpose src0_f32:
// For the 2D case, the shape of reshape_src0 is [3072,9216], and after transposition, it is [9216,3072].
// For the batched case, assuming the shape is [M, K, Batch], batch-wise transposition is required: use order [0, 2, 1].
ov::Output<ov::Node> A_for_mul;
if (rank == 1) {
auto trans_order = ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector<int64_t>{1, 0});
if (rank == 2) {
auto trans_order = ov::op::v0::Constant::create(ov::element::i64, { 2 }, std::vector<int64_t>{1, 0});
A_for_mul = std::make_shared<ov::op::v1::Transpose>(src0_f32, trans_order);
} else if (rank == 2) {
auto trans_order = ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector<int64_t>{1, 0});
A_for_mul = std::make_shared<ov::op::v1::Transpose>(src0_f32, trans_order);
} else { // rank == 3
auto trans_order = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{0, 2, 1});
} else if (rank == 3) {
auto trans_order = ov::op::v0::Constant::create(ov::element::i64, { 3 }, std::vector<int64_t>{0, 2, 1});
A_for_mul = std::make_shared<ov::op::v1::Transpose>(src0_f32, trans_order);
} else {
A_for_mul = src0_f32;
}
auto matmul = std::make_shared<ov::op::v0::MatMul>(reshape_src1, A_for_mul, false, false);
auto matmul_output_shape = matmul->get_output_shape(0);
std::vector<int64_t> final_output_shape;
if (matmul_output_shape.size() == 1) {
final_output_shape = { 1, 1, static_cast<int64_t>(matmul_output_shape[0]) };
} else if (matmul_output_shape.size() == 2) {
final_output_shape = { 1, static_cast<int64_t>(matmul_output_shape[0]), static_cast<int64_t>(matmul_output_shape[1]) };
} else {
final_output_shape = { static_cast<int64_t>(matmul_output_shape[0]), static_cast<int64_t>(matmul_output_shape[1]), static_cast<int64_t>(matmul_output_shape[2]) };
}
auto reshape_output = std::make_shared<ov::op::v1::Reshape>(
matmul,
ov::op::v0::Constant::create(ov::element::i64, {3}, final_output_shape),
false
);
auto model = std::make_shared<ov::Model>(ov::NodeVector{ reshape_output },
ov::ParameterVector{ param_src0, param_src1 });
ov::Tensor tensor_src0{ ov::element::f16, orig_shape_src0, (void *)src0->data };
ov::Tensor tensor_src1{ ov::element::f32, orig_shape_src1, (void *)src1->data };
ov::Shape output_shape = { static_cast<size_t>(dst->ne[2]),
static_cast<size_t>(dst->ne[1]),
static_cast<size_t>(dst->ne[0]) };
ov::Tensor tensor_dst(ov::element::f32, output_shape, dst->data);
ov::Core core;
ov::Tensor tensor_src0{ov::element::f16, flat_shape_src0, (void *)src0->data};
ov::Tensor tensor_src1{ov::element::f32, flat_shape_src1, (void *)src1->data};
ov::Tensor tensor_dst(ov::element::f32, ov::Shape(eff_shape_dst.begin(), eff_shape_dst.end()), dst->data);
std::shared_ptr<ov::op::v0::MatMul> matmul = std::make_shared<ov::op::v0::MatMul>(reshape_src1, A_for_mul, false, false);
auto model = std::make_shared<ov::Model>(ov::NodeVector{matmul}, ov::ParameterVector{param_flat_src0, param_flat_src1});
// ov::save_model(model, "/home/user/zhan/merge_git_commits/llama.cpp-ov/002_backend_mulmat_model.xml");
auto compiled_model = core.compile_model(model, "CPU");
auto infer_request = compiled_model.create_infer_request();
infer_request.set_input_tensor(0, tensor_src0);
infer_request.set_input_tensor(1, tensor_src1);
infer_request.set_output_tensor(0, tensor_dst);
@ -980,22 +1000,22 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe
ggml_backend_openvino_dup_bytes(cgraph->nodes[i]);
} else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) {
ggml_backend_openvino_view(cgraph->nodes[i]);
// } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) {
// ggml_backend_openvino_cpy(cgraph->nodes[i]);
// } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) {
// ggml_backend_openvino_transpose(cgraph->nodes[i]);
} else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) {
ggml_backend_openvino_cpy(cgraph->nodes[i]);
} else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) {
ggml_backend_openvino_transpose(cgraph->nodes[i]);
} else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) {
ggml_backend_openvino_reshape(cgraph->nodes[i]);
// } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) {
// ggml_backend_openvino_mul_mat(cgraph->nodes[i]);
} else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) {
ggml_backend_openvino_mul_mat(cgraph->nodes[i]);
} else {
// Process a range of nodes with openvino_frontend_compute
int start_index = i;
while (i < cgraph->n_nodes
&& std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end()
// && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end()
&& std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end()
&& std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end()
// && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end()
&& std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end()
) {
i++;
}
@ -1228,6 +1248,7 @@ static const std::set<std::string>& openvino_ops = []() -> const std::set<std::s
case GGML_OP_ADD:
return true;
case GGML_OP_MUL:
return false;
case GGML_OP_MUL_MAT:
return false;
case GGML_OP_UNARY: