From 8c5a609f8da7ca6267796c8ef38f01fe4960e198 Mon Sep 17 00:00:00 2001
From: zhanmyz
Date: Mon, 16 Dec 2024 11:13:45 +0800
Subject: [PATCH] add the rms_norm operator implemented using OpenVINO to the GGML backend of llama.cpp

---
 ggml/src/ggml-openvino.cpp | 91 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 90 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp
index efbff646e3..b6f01fdb45 100644
--- a/ggml/src/ggml-openvino.cpp
+++ b/ggml/src/ggml-openvino.cpp
@@ -324,6 +324,95 @@ void ggml_compute_forward_get_rows(struct ggml_tensor *dst) {
 
 }
 
+// Computes RMS normalization of dst->src[0] into dst with an OpenVINO graph on the "CPU" device.
+//
+// For each row of ne[0] elements: y = x / sqrt(sum(x^2) / ne[0] + eps), where eps is read from
+// dst->op_params.
+//
+// Preconditions (GGML_ASSERT rather than assert(), so they are still enforced under NDEBUG):
+//   - src0 and dst have the same shape, and both have a data buffer;
+//   - src0 and dst are contiguous: both buffers are accessed as dense float arrays.
+//
+// On success dst->data holds the normalized values in the same dense layout as src0->data.
+void ggml_backend_openvino_rms_norm_f32(ggml_tensor *dst) {
+    const struct ggml_tensor *src0 = dst->src[0];
+    GGML_ASSERT(src0 != nullptr);
+    GGML_ASSERT(src0->data != nullptr && dst->data != nullptr);
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+    // nb[0] alone does not rule out strided views, which the dense wrapping below would misread.
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_is_contiguous(dst));
+
+    const int64_t ne0 = src0->ne[0];
+    const int64_t ne1 = src0->ne[1];
+    const int64_t ne2 = src0->ne[2];
+    const int64_t ne3 = src0->ne[3];
+    const size_t n_elements = static_cast<size_t>(ne0 * ne1 * ne2 * ne3);
+
+    const float *src_data = static_cast<const float *>(src0->data);
+    float *dst_data = static_cast<float *>(dst->data);
+
+    float eps;
+    std::memcpy(&eps, dst->op_params, sizeof(float));
+
+    // The OpenVINO shape lists ne[0] last, so the row being normalized is axis 3.
+    const ov::Shape shape = {static_cast<size_t>(ne3), static_cast<size_t>(ne2),
+                             static_cast<size_t>(ne1), static_cast<size_t>(ne0)};
+    // ov::Tensor takes a mutable pointer; the input buffer is only read by the graph below.
+    ov::Tensor input_tensor(ov::element::f32, shape, const_cast<float *>(src_data));
+
+    auto x = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, shape);
+    auto axis = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {3});
+    auto row_len = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{},
+                                                {static_cast<float>(ne0)});
+    auto eps_node = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{}, {eps});
+    auto one = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{}, {1.0f});
+
+    // mean(x^2) over axis 3; keep_dims = true keeps a size-1 axis so it broadcasts against x.
+    auto squared = std::make_shared<ov::op::v1::Multiply>(x, x);
+    auto sum_sq = std::make_shared<ov::op::v1::ReduceSum>(squared, axis, true);
+    auto mean_sq = std::make_shared<ov::op::v1::Divide>(sum_sq, row_len);
+
+    // y = x * (1 / sqrt(mean(x^2) + eps))
+    auto shifted = std::make_shared<ov::op::v1::Add>(mean_sq, eps_node);
+    auto rms = std::make_shared<ov::op::v0::Sqrt>(shifted);
+    auto inv_rms = std::make_shared<ov::op::v1::Divide>(one, rms);
+    auto normalized = std::make_shared<ov::op::v1::Multiply>(x, inv_rms);
+
+    auto model = std::make_shared<ov::Model>(ov::NodeVector{normalized}, ov::ParameterVector{x});
+
+    // NOTE(review): an ov::Core and a compiled model are created on every call; consider caching.
+    ov::Core core;
+    auto compiled_model = core.compile_model(model, "CPU");
+    auto infer_request = compiled_model.create_infer_request();
+    infer_request.set_input_tensor(0, input_tensor);
+    infer_request.infer();
+
+    // Guard the copy below: the output must hold exactly as many elements as dst.
+    auto output_tensor = infer_request.get_output_tensor();
+    GGML_ASSERT(output_tensor.get_size() == n_elements);
+    std::memcpy(dst_data, output_tensor.data(), n_elements * sizeof(float));
+}
+
+// Dispatches RMS normalization on the element type of dst->src[0].
+// Only GGML_TYPE_F32 is implemented; any other type aborts with "fatal error".
+void ggml_backend_openvino_rms_norm(ggml_tensor * dst) {
+    const struct ggml_tensor * src0 = dst->src[0];
+    // Checked here because src0->type is read before the F32 kernel's own null check runs.
+    GGML_ASSERT(src0 != nullptr);
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_backend_openvino_rms_norm_f32(dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
 static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     openvino_frontend_compute(backend, cgraph);
 
@@ -598,7 +687,7 @@ static const std::set& openvino_ops = []() -> const std::set