From ffabe95e2a9ca4c73a718e31ff2c085eea967554 Mon Sep 17 00:00:00 2001 From: Viraj Wadhwa Date: Fri, 9 May 2025 11:37:10 -0700 Subject: [PATCH] Rebase - Bring up to date and fix build process --- docs/build.md | 61 ++ ggml/CMakeLists.txt | 5 + ggml/src/CMakeLists.txt | 1 + ggml/src/ggml-backend-reg.cpp | 7 + ggml/src/ggml-openvino.cpp | 1074 +---------------------- ggml/src/ggml-openvino/CMakeLists.txt | 42 + ggml/src/ggml-openvino/decoder.h | 13 +- ggml/src/ggml-openvino/ggml-decoder.cpp | 38 +- ggml/src/ggml-openvino/ggml-decoder.h | 14 +- ggml/src/ggml-openvino/utils.cpp | 9 +- ggml/src/ggml-openvino/utils.h | 4 +- 11 files changed, 152 insertions(+), 1116 deletions(-) create mode 100644 ggml/src/ggml-openvino/CMakeLists.txt diff --git a/docs/build.md b/docs/build.md index fce9361b2d..3079a91211 100644 --- a/docs/build.md +++ b/docs/build.md @@ -681,6 +681,67 @@ Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/m To read documentation for how to build on IBM Z & LinuxONE, [click here](./build-s390x.md) +## OPENVINO + +### Build openvino-llama + + ```bash + git lfs install --skip-smudge + git clone https://github.com/intel-sandbox/openvino-llama.git -b dev_ggml_frontend + cd openvino-llama + git submodule update --init --recursive + + export OPENVINO_LLAMA_PATH=$(pwd) + + cmake --preset Release + cmake --build build/Release + ``` + +### Build llama.cpp-ov + + ```bash + git clone https://github.com/intel-sandbox/llama.cpp-ov.git -b dev_backend_openvino + cd llama.cpp-ov + + cmake --preset ReleaseOV + cmake --build build/ReleaseOV + ``` + +Download the test model file [Phi-3-mini-4k-instruct-fp16.gguf](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf) from hugging face website. 
 + ```bash + wget https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-fp16.gguf?download=true -O ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf + ``` + +Execute the following command to test. + ```bash + export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache + # Currently GGML_OPENVINO_WEIGHT_AS_INPUT has better performance + export GGML_OPENVINO_WEIGHT_AS_INPUT=1 + ./build/ReleaseOV/bin/llama-simple -m ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf -n 10 "Hello, my name is " + ``` + +Environment variables: +- GGML_OPENVINO_WEIGHT_AS_INPUT: + Pass the weights as input to the OpenVINO model instead of creating Constant nodes for them. +- GGML_OPENVINO_CACHE_DIR: + If set, model caching in OpenVINO will be used. +- GGML_OPENVINO_DUMP_CGRAPH: + Dump the compute graph to "cgraph.txt". Note that the compute graph is different for every token, so the later cgraph will overwrite the previous one. +- GGML_OPENVINO_PROFILING: + Print the time taken for each phase in the OpenVINO backend. +- GGML_OPENVINO_DUMP_IR: + Dump the converted OpenVINO IR. The filenames are timestamps. +- GGML_OPENVINO_DEBUG_INPUT +- GGML_OPENVINO_DEBUG_OUTPUT + +To use Llama.cpp's builtin CPU backend: +```bash +cmake --preset ReleaseCPU +cmake --build build/ReleaseCPU + +./build/ReleaseCPU/bin/llama-simple -m ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf -n 10 "Hello, my name is " +``` + ## Notes about GPU-accelerated backends The GPU may still be used to accelerate some parts of the computation even when using the `-ngl 0` option. You can fully disable GPU acceleration by using `--device none`. 
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 0176ca1ce9..2fa05ab90c 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -246,6 +246,10 @@ set (GGML_SYCL_TARGET "INTEL" CACHE STRING set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING "ggml: sycl device architecture") +option(GGML_OPENVINO "ggml: use OPENVINO" OFF) +option(GGML_OPENVINO_DEBUG "ggml: enable OPENVINO debugging" OFF) +option(GGML_OV_FRONTEND "ggml: OPENVINO frontend path" ON) + option(GGML_OPENCL "ggml: use OpenCL" OFF) option(GGML_OPENCL_PROFILING "ggml: use OpenCL profiling (increases overhead)" OFF) option(GGML_OPENCL_EMBED_KERNELS "ggml: embed kernels" ON) @@ -324,6 +328,7 @@ set(GGML_PUBLIC_HEADERS include/ggml-vulkan.h include/ggml-webgpu.h include/ggml-zendnn.h + include/ggml-openvino.h include/gguf.h) set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}") diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 6192a87046..1758050bae 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -458,6 +458,7 @@ ggml_add_backend(zDNN) ggml_add_backend(OpenCL) ggml_add_backend(Hexagon) ggml_add_backend(ZenDNN) +ggml_add_backend(OPENVINO) foreach (target ggml-base ggml) target_include_directories(${target} PUBLIC $ $) diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 4181a714ad..1f8ae17363 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -77,6 +77,10 @@ #include "ggml-zendnn.h" #endif +#ifdef GGML_USE_OPENVINO +#include "ggml-openvino.h" +#endif + // disable C++17 deprecation warning for std::codecvt_utf8 #if defined(__clang__) # pragma clang diagnostic push @@ -222,6 +226,9 @@ struct ggml_backend_registry { #ifdef GGML_USE_RPC register_backend(ggml_backend_rpc_reg()); #endif +#ifdef GGML_USE_OPENVINO + register_backend(ggml_backend_openvino_reg()); +#endif #ifdef GGML_USE_CPU register_backend(ggml_backend_cpu_reg()); #endif diff --git a/ggml/src/ggml-openvino.cpp 
b/ggml/src/ggml-openvino.cpp index 5221a1ff8b..f5d5c7ed67 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -55,1023 +55,8 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backend, struct ggml_cgraph *cgraph) { openvino_frontend_compute(backend, cgraph); - ov::Core core; - - // set the shape and stride of dst - dst->ne[0] = src0->ne[0]; - dst->ne[1] = src0->ne[1]; - dst->nb[0] = src0->nb[0]; - dst->nb[1] = src0->nb[1]; - - if (src0 == nullptr || src1 == nullptr) { - std::cerr << "Error: src0 or src1 is null." << std::endl; - return; - } - - // Step 2: Check that the input tensor types and shapes match - if (src0->type != GGML_TYPE_F32 || src1->type != GGML_TYPE_F32) { - std::cerr << "Error: Unsupported tensor type. Only GGML_TYPE_F32 is supported for OpenVINO." << std::endl; - return; - } - if (src0->ne[0] != src1->ne[0] || src0->ne[1] != src1->ne[1]) { - std::cerr << "Error: src0 and src1 shapes do not match." << std::endl; - return; - } - - ov::Tensor input0 = ov::Tensor(ov::element::f32, {static_cast(src0->ne[0]), static_cast(src0->ne[1])}, src0->data); - ov::Tensor input1 = ov::Tensor(ov::element::f32, {static_cast(src1->ne[0]), static_cast(src1->ne[1])}, src1->data); - - auto input0_param = std::make_shared(ov::element::f32, ov::Shape{static_cast(src0->ne[0]), static_cast(src0->ne[1])}); - auto input1_param = std::make_shared(ov::element::f32, ov::Shape{static_cast(src0->ne[0]), static_cast(src0->ne[1])}); - auto add = std::make_shared(input0_param, input1_param); - auto model = std::make_shared(add, ov::ParameterVector{input0_param, input1_param}); - - // compile model and store in context -#ifdef GGML_OPENVINO_GPU - auto compiled_model = core.compile_model(model, "GPU"); -#elif GGML_OPENVINO_NPU - auto compiled_model = core.compile_model(model, "NPU"); -#else - auto compiled_model = core.compile_model(model, "CPU"); -#endif - // initialize infer request - auto infer_request = 
compiled_model.create_infer_request(); - - // Step 4: set input data, copy src0 and src1 data to OpenVINO input tensors - infer_request.set_tensor(input0_param, input0); - infer_request.set_tensor(input1_param, input1); - - // Step 5: execute inference - infer_request.infer(); - - // Step 6: get output data - ov::Tensor output = infer_request.get_tensor(compiled_model.output()); - - // // Allocate memory for dst->data if not already allocated - // if (dst->data == nullptr) { - // dst->data = malloc(dst->nb[0] * dst->ne[0]); - // if (dst->data == nullptr) { - // std::cerr << "Error: Failed to allocate memory for dst->data." << std::endl; - // return; - // } - // } - - std::memcpy(dst->data, output.data(), output.get_byte_size()); - - if (dst->ne[0] != src0->ne[0] || dst->ne[1] != src0->ne[1]) { - std::cerr << "Error: dst tensor shape does not match input tensor shape." << std::endl; - return; - } - - // float* dst_data1 = (float*)(dst->data); - // printf("Output data:");; - // for (int i = 0; i < (10 < (int)(dst->ne[0]) ? 
10 : (int)(dst->ne[0])); ++i) { - // printf("%f ", dst_data1[i]); - // } - // printf("\n"); - // fflush(stdout); -} - -static void ggml_backend_openvino_mul_forward(ggml_tensor * dst) { - struct ggml_tensor *src0 = dst->src[0]; - struct ggml_tensor *src1 = dst->src[1]; - - ov::Core core; - - // define shape - ov::Shape shape0 = {static_cast(src0->ne[1]), static_cast(src0->ne[0])}; // For Example: [7, 3072] - ov::Shape shape1 = {static_cast(src1->ne[1]), static_cast(src1->ne[0])}; // For Example: [1, 3072] -> broadcast to [7, 3072] - - // create OpenVINO tensor (src0 and src1) - ov::Tensor tensor0(ov::element::f32, shape0, src0->data); - ov::Tensor tensor1(ov::element::f32, shape1, src1->data); - - // define input parameters - auto input0 = std::make_shared(ov::element::f32, shape0); - auto input1 = std::make_shared(ov::element::f32, shape1); - - // create a multiply operation using broadcasting - auto multiply = std::make_shared(input0, input1); - - // create model - auto model = std::make_shared(multiply, ov::ParameterVector{input0, input1}); - // compile model and store in context -#ifdef GGML_OPENVINO_GPU - ov::CompiledModel compiled_model = core.compile_model(model, "GPU"); -#elif GGML_OPENVINO_NPU - ov::CompiledModel compiled_model = core.compile_model(model, "NPU"); -#else - ov::CompiledModel compiled_model = core.compile_model(model, "CPU"); -#endif - - ov::InferRequest infer_request = compiled_model.create_infer_request(); - infer_request.set_tensor(input0, tensor0); - infer_request.set_tensor(input1, tensor1); - - infer_request.infer(); - - // get output tensor and copy it back to dst->data - ov::Tensor output_tensor = infer_request.get_output_tensor(); - std::memcpy(dst->data, output_tensor.data(), src0->ne[0] * src0->ne[1] * sizeof(float)); -} - -static void ggml_backend_openvino_add(ggml_tensor * dst) { - // Placeholder for OpenVINO add operation - // GGML_ASSERT(ctx.device != 0); - GGML_ASSERT(dst->data != nullptr); - - const struct ggml_tensor * src0 
= dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - switch (src0->type) { - case GGML_TYPE_F16: - { - if (src1->type == GGML_TYPE_F16) { - // ggml_backend_openvino_add_forward(ctx, dst, src0, src1); - } else if (src1->type == GGML_TYPE_F32) { - // ggml_compute_forward_add_f16_f32(params, dst); - } else { - GGML_ABORT("fatal error"); - } - } break; - case GGML_TYPE_F32: - { - if (src1->type == GGML_TYPE_F32) { - { - ggml_backend_openvino_add_forward(dst); - } - } - else { - GGML_ABORT("fatal error"); - } - } break; - default: - GGML_ABORT("%s: unsupported type %d\n", __func__, src1->type); - } - -} - -static void ggml_backend_openvino_mul(ggml_tensor * dst) { - GGML_ASSERT(dst->data != nullptr); - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_ASSERT(src1->type == GGML_TYPE_F32 && "only f32 src1 supported for now"); - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_backend_openvino_mul_forward(dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -void ggml_compute_forward_get_rows_f16(struct ggml_tensor *dst) { - const struct ggml_tensor *src0 = dst->src[0]; - const struct ggml_tensor *src1 = dst->src[1]; - - ov::Core core; - - ov::Shape shape0 = {static_cast(src0->ne[1]), static_cast(src0->ne[0])}; // [3072, 7] - ov::Shape shape1 = {static_cast(src1->ne[0])}; // [7] - - ov::Tensor tensor0(ov::element::f16, shape0, src0->data); - ov::Tensor tensor1(ov::element::i32, shape1, src1->data); - - auto input0 = std::make_shared(ov::element::f16, shape0); - auto input1 = std::make_shared(ov::element::i32, shape1); - - auto gather = std::make_shared(input0, input1, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {0})); - - auto model = std::make_shared(gather, ov::ParameterVector{input0, input1}); - ov::CompiledModel compiled_model = core.compile_model(model, "CPU"); - - ov::InferRequest infer_request = compiled_model.create_infer_request(); - 
infer_request.set_tensor(input0, tensor0); - infer_request.set_tensor(input1, tensor1); - - infer_request.infer(); - - ov::Tensor output_tensor = infer_request.get_output_tensor(); - // Convert output tensor data type from f16 to f32 - ov::Tensor output_tensor_f32 = ov::Tensor(ov::element::f32, output_tensor.get_shape()); - for (size_t i = 0; i < output_tensor.get_size(); ++i) { - output_tensor_f32.data()[i] = static_cast(output_tensor.data()[i]); - } - - // Copy the converted data to dst->data - std::memcpy(dst->data, output_tensor_f32.data(), output_tensor_f32.get_byte_size()); -} - -void ggml_compute_forward_get_rows_f32(struct ggml_tensor *dst) { - const struct ggml_tensor *src0 = dst->src[0]; - const struct ggml_tensor *src1 = dst->src[1]; - - ov::Core core; - - ov::Shape shape0 = {static_cast(src0->ne[1]), static_cast(src0->ne[0])}; // [3072, 7] - ov::Shape shape1 = {static_cast(src1->ne[0])}; // [7] - - ov::Tensor tensor0(ov::element::f32, shape0, src0->data); - ov::Tensor tensor1(ov::element::i32, shape1, src1->data); - - auto input0 = std::make_shared(ov::element::f32, shape0); - auto input1 = std::make_shared(ov::element::i32, shape1); - - auto gather = std::make_shared(input0, input1, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {0})); - - auto model = std::make_shared(gather, ov::ParameterVector{input0, input1}); - ov::CompiledModel compiled_model = core.compile_model(model, "CPU"); - - ov::InferRequest infer_request = compiled_model.create_infer_request(); - infer_request.set_tensor(input0, tensor0); - infer_request.set_tensor(input1, tensor1); - - infer_request.infer(); - - ov::Tensor output_tensor = infer_request.get_output_tensor(); - - // Copy the converted data to dst->data - std::memcpy(dst->data, output_tensor.data(), output_tensor.get_byte_size()); -} - -void ggml_compute_forward_get_rows(struct ggml_tensor *dst) { - const struct ggml_tensor *src0 = dst->src[0]; - const struct ggml_tensor *src1 = dst->src[1]; - - switch 
(src0->type) { - case GGML_TYPE_F16: - { - ggml_compute_forward_get_rows_f16(dst); - } break; - case GGML_TYPE_F32: - { - ggml_compute_forward_get_rows_f32(dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } - -} - -void ggml_backend_openvino_rms_norm_f32(ggml_tensor *dst) { - const struct ggml_tensor *src0 = dst->src[0]; - assert(src0 != nullptr); - - GGML_ASSERT(ggml_are_same_shape(src0, dst)); - GGML_ASSERT(src0->nb[0] == sizeof(float)); - - const int64_t ne0 = src0->ne[0]; - const int64_t ne1 = src0->ne[1]; - const int64_t ne2 = src0->ne[2]; - - const size_t input_size = ne0 * ne1 * ne2; - - const float *src_data = static_cast(src0->data); - float *dst_data = static_cast(dst->data); - assert(dst_data != nullptr); - - ov::Core core; - - ov::Shape input_shape = {static_cast(ne2), static_cast(ne1), static_cast(ne0)}; - ov::Tensor input_tensor(ov::element::f32, input_shape, const_cast(src_data)); - - auto input_param = std::make_shared( - input_tensor.get_element_type(), - input_tensor.get_shape() - ); - assert(input_param != nullptr && "Input parameter creation failed!"); - - auto square = std::make_shared(input_param, input_param); - auto reduce_sum = std::make_shared( - square, - ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {2}), - true - ); - - auto mean = std::make_shared( - reduce_sum, - ov::op::v0::Constant::create(ov::element::f32, ov::Shape{}, {static_cast(ne0)}) - ); - - float eps; - memcpy(&eps, dst->op_params, sizeof(float)); - auto rms = std::make_shared( - std::make_shared( - mean, - ov::op::v0::Constant::create(ov::element::f32, ov::Shape{}, {eps}) - ) - ); - - auto scale = std::make_shared( - ov::op::v0::Constant::create(ov::element::f32, ov::Shape{}, {1.0f}), - rms - ); - - auto normalized_input = std::make_shared(input_param, scale); - - ov::ParameterVector parameters = {input_param}; - auto model = std::make_shared(ov::NodeVector{normalized_input}, parameters); - - // static bool model_saved = false; - // if 
(!model_saved) { - // std::cout << "\n rms model saved" << std::endl; - // ov::save_model(model, "//rms_norm_model.xml"); - // model_saved = true; - // } - - auto compiled_model = core.compile_model(model, "CPU"); - - auto infer_request = compiled_model.create_infer_request(); - - infer_request.set_input_tensor(0, input_tensor); - - infer_request.infer(); - - auto output_tensor = infer_request.get_output_tensor(); - assert(output_tensor.get_size() == input_size); - - std::memcpy(dst_data, output_tensor.data(), input_size * sizeof(float)); -} - -void ggml_backend_openvino_rms_norm(ggml_tensor * dst) { - const struct ggml_tensor * src0 = dst->src[0]; - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_backend_openvino_rms_norm_f32(dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -static void ggml_backend_openvino_permute(const struct ggml_tensor * dst) { - // NOP - GGML_UNUSED(dst); -} - -// Extracting valid shapes -std::vector get_effective_shape(const ggml_tensor * t) { - std::vector shape; - for (int i = 2; i >= 0; i--) { - if (t->ne[i] != 1 || t->ne[2] != 1) - shape.push_back(t->ne[i]); - } - return shape; -} - -/* -* Construct an index vector for Gather to extract non-contiguous data. 
-* Parameters: -* - valid_cols: number of valid columns per row (e.g., for src0, valid columns = 96) -* - num_rows: number of rows in each batch (e.g., src0: 32 rows per batch) -* - batch: number of batches (e.g., 32) -* - row_stride: physical row length (in elements), e.g., src0: nb[1]/(element_size) = 6144/2 = 3072 -* - batch_stride: physical batch stride (in elements), e.g., src0: nb[2]/(element_size) = 192/2 = 96 -*/ -std::vector build_indices(int valid_cols, int num_rows, int batch, int row_stride, int batch_stride) { - std::vector indices; - indices.reserve(valid_cols * num_rows * batch); - for (int b = 0; b < batch; b++) { - for (int r = 0; r < num_rows; r++) { - for (int c = 0; c < valid_cols; c++) { - // 计算物理索引 = b * batch_stride + r * row_stride + c - indices.push_back(b * batch_stride + r * row_stride + c); - } - } - } - return indices; -} - -void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) { - assert(dst && dst->src[0] && dst->src[1]); - const ggml_tensor * src0 = dst->src[0]; // src0 type F16 - const ggml_tensor * src1 = dst->src[1]; // src1 type F32 - - if(!ggml_is_contiguous(src1) || dst->src[1]->ne[0] * dst->src[1]->nb[0] != dst->src[1]->nb[1]) { - int valid_cols_src0 = src0->ne[0]; // 96 - int num_rows_src0 = src0->ne[1]; // 32 - int batch_src0 = src0->ne[2]; // 32 - - int valid_cols_src1 = src1->ne[0]; // 96 - int num_rows_src1 = src1->ne[1]; // 7 - int batch_src1 = src1->ne[2]; // 32 - - // 对 src0:row_stride = nb[1] / nb[0] - int row_stride_src0 = src0->nb[1] / src0->nb[0]; // 6144 / 2 = 3072 - int batch_stride_src0 = src0->nb[2] / src0->nb[0]; // 192 / 2 = 96 - - // 对 src1:row_stride = nb[1] / nb[0] - int row_stride_src1 = src1->nb[1] / src1->nb[0]; // 12288 / 4 = 3072 - int batch_stride_src1 = src1->nb[2] / src1->nb[0]; // 384 / 4 = 96 - - std::vector indices_src0 = build_indices(valid_cols_src0, num_rows_src0, batch_src0, row_stride_src0, batch_stride_src0); - std::vector indices_src1 = build_indices(valid_cols_src1, num_rows_src1, 
batch_src1, row_stride_src1, batch_stride_src1); - - size_t total_src0 = indices_src0.size(); // = 96 * 32 * 32 - size_t total_src1 = indices_src1.size(); // = 96 * 7 * 32 - - ov::Shape orig_shape_src0 = { static_cast(src0->ne[2]), - static_cast(src0->ne[1]), - static_cast(src0->ne[0])}; - ov::Shape orig_shape_src1 = { static_cast(src1->ne[2]), - static_cast(src1->ne[1]), - static_cast(src1->ne[0])}; - - auto param_src0 = std::make_shared(ov::element::f16, orig_shape_src0); - auto param_src1 = std::make_shared(ov::element::f32, orig_shape_src1); - - ov::Shape flat_shape_src0 = { total_src0 }; - ov::Shape flat_shape_src1 = { total_src1 }; - - auto flatten_src0 = std::make_shared( - param_src0, - ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{ static_cast(total_src0) }), - false); - auto flatten_src1 = std::make_shared( - param_src1, - ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{ static_cast(total_src1) }), - false); - - auto indices_const_src0 = ov::op::v0::Constant::create(ov::element::i64, flat_shape_src0, indices_src0); - auto indices_const_src1 = ov::op::v0::Constant::create(ov::element::i64, flat_shape_src1, indices_src1); - auto axis_const = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); - - auto gathered_src0 = std::make_shared(flatten_src0, indices_const_src0, axis_const); - auto gathered_src1 = std::make_shared(flatten_src1, indices_const_src1, axis_const); - - std::vector shape_src0_cont = { batch_src0, num_rows_src0, valid_cols_src0 }; - auto reshape_src0 = std::make_shared( - gathered_src0, - ov::op::v0::Constant::create(ov::element::i64, { shape_src0_cont.size() }, shape_src0_cont), - false); - - std::vector shape_src1_cont = { batch_src1, num_rows_src1, valid_cols_src1 }; - auto reshape_src1 = std::make_shared( - gathered_src1, - ov::op::v0::Constant::create(ov::element::i64, { shape_src1_cont.size() }, shape_src1_cont), - false); - - auto src0_f32 = std::make_shared(reshape_src0, ov::element::f32); - 
auto transpose_order = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{0, 2, 1}); - auto src0_transposed = std::make_shared(src0_f32, transpose_order); - - auto A = src0_transposed; - auto B = reshape_src1; - - auto batched_matmul = std::make_shared(B, A, false, false); - - std::vector final_output_shape = {static_cast(dst->ne[2]), - static_cast(dst->ne[1]), - static_cast(dst->ne[0])}; - - auto reshape_output = std::make_shared( - batched_matmul, - ov::op::v0::Constant::create(ov::element::i64, {3}, final_output_shape), - false); - - auto model = std::make_shared(ov::NodeVector{ reshape_output }, - ov::ParameterVector{ param_src0, param_src1 }); - - ov::Tensor tensor_src0{ ov::element::f16, orig_shape_src0, src0->data }; - ov::Tensor tensor_src1{ ov::element::f32, orig_shape_src1, src1->data }; - ov::Shape output_shape = { static_cast(dst->ne[2]), - static_cast(dst->ne[1]), - static_cast(dst->ne[0]) }; - ov::Tensor tensor_dst(ov::element::f32, output_shape, dst->data); - - ov::Core core; - auto compiled_model = core.compile_model(model, "CPU"); - auto infer_request = compiled_model.create_infer_request(); - infer_request.set_input_tensor(0, tensor_src0); - infer_request.set_input_tensor(1, tensor_src1); - infer_request.set_output_tensor(0, tensor_dst); - infer_request.infer(); - return ; - } - - int rank = 0; - if (dst->ne[2] == 1 && dst->ne[3] == 1) { - rank = 2; - } else if (dst->ne[3] == 1) { - rank = 3; - } else { - throw std::runtime_error("Only rank 2 or rank 3 are supported in this implementation."); - } - - std::vector eff_shape_src0 = get_effective_shape(src0); - std::vector eff_shape_src1 = get_effective_shape(src1); - std::vector eff_shape_dst = get_effective_shape(dst); - - ov::Shape orig_shape_src0 = { static_cast(src0->ne[2]), - static_cast(src0->ne[1]), - static_cast(src0->ne[0])}; - ov::Shape orig_shape_src1 = { static_cast(src1->ne[2]), - static_cast(src1->ne[1]), - static_cast(src1->ne[0])}; - auto param_src0 = 
std::make_shared(ov::element::f16, orig_shape_src0); - auto param_src1 = std::make_shared(ov::element::f32, orig_shape_src1); - - auto reshape_src0 = std::make_shared( - param_src0, - ov::op::v0::Constant::create(ov::element::i64, { eff_shape_src0.size() }, eff_shape_src0), - false); - auto reshape_src1 = std::make_shared( - param_src1, - ov::op::v0::Constant::create(ov::element::i64, { eff_shape_src1.size() }, eff_shape_src1), - false); - - auto src0_f32 = std::make_shared(reshape_src0, ov::element::f32); - - ov::Output A_for_mul; - if (rank == 2) { - auto trans_order = ov::op::v0::Constant::create(ov::element::i64, { 2 }, std::vector{1, 0}); - A_for_mul = std::make_shared(src0_f32, trans_order); - } else if (rank == 3) { - auto trans_order = ov::op::v0::Constant::create(ov::element::i64, { 3 }, std::vector{0, 2, 1}); - A_for_mul = std::make_shared(src0_f32, trans_order); - } else { - A_for_mul = src0_f32; - } - - auto matmul = std::make_shared(reshape_src1, A_for_mul, false, false); - - auto matmul_output_shape = matmul->get_output_shape(0); - std::vector final_output_shape; - if (matmul_output_shape.size() == 1) { - final_output_shape = { 1, 1, static_cast(matmul_output_shape[0]) }; - } else if (matmul_output_shape.size() == 2) { - final_output_shape = { 1, static_cast(matmul_output_shape[0]), static_cast(matmul_output_shape[1]) }; - } else { - final_output_shape = { static_cast(matmul_output_shape[0]), static_cast(matmul_output_shape[1]), static_cast(matmul_output_shape[2]) }; - } - - auto reshape_output = std::make_shared( - matmul, - ov::op::v0::Constant::create(ov::element::i64, {3}, final_output_shape), - false - ); - - auto model = std::make_shared(ov::NodeVector{ reshape_output }, - ov::ParameterVector{ param_src0, param_src1 }); - - ov::Tensor tensor_src0{ ov::element::f16, orig_shape_src0, (void *)src0->data }; - ov::Tensor tensor_src1{ ov::element::f32, orig_shape_src1, (void *)src1->data }; - - ov::Shape output_shape = { static_cast(dst->ne[2]), - 
static_cast(dst->ne[1]), - static_cast(dst->ne[0]) }; - ov::Tensor tensor_dst(ov::element::f32, output_shape, dst->data); - - ov::Core core; - auto compiled_model = core.compile_model(model, "CPU"); - auto infer_request = compiled_model.create_infer_request(); - infer_request.set_input_tensor(0, tensor_src0); - infer_request.set_input_tensor(1, tensor_src1); - infer_request.set_output_tensor(0, tensor_dst); - infer_request.infer(); -} - -void ggml_backend_openvino_reshape(ggml_tensor *dst) { - - GGML_UNUSED(dst); -} - -void ggml_backend_openvino_view(ggml_tensor *dst) { - - GGML_UNUSED(dst); -} - -void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) { - const struct ggml_tensor *src0 = dst->src[0]; - - // Validate tensor properties - GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); - GGML_ASSERT(src0->type == dst->type); - - // Determine tensor properties - const size_t element_size = ggml_type_size(src0->type); - - // Case 1: Both tensors are contiguous - if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && (src0->ne[0] * element_size == src0->nb[1])) { - ov::Shape input_shape = { - static_cast(src0->ne[2]), - static_cast(src0->ne[1]), - static_cast(src0->ne[0]) - }; - size_t num_elements = 1; - for (auto d : input_shape) { - num_elements *= d; - } - ov::Shape flat_shape = { num_elements }; - - ov::Shape dst_shape = { - static_cast(dst->ne[2]), - static_cast(dst->ne[1]), - static_cast(dst->ne[0]) - }; - - auto input_param = std::make_shared(ov::element::f32, input_shape); - - std::vector flat_shape_vec(flat_shape.begin(), flat_shape.end()); - auto flat_reshape_const = ov::op::v0::Constant::create(ov::element::i64, { flat_shape_vec.size() }, flat_shape_vec); - auto flat_reshape = std::make_shared(input_param, flat_reshape_const, false); - - std::vector dst_shape_vec(dst_shape.begin(), dst_shape.end()); - auto dst_reshape_const = ov::op::v0::Constant::create(ov::element::i64, { dst_shape_vec.size() }, dst_shape_vec); - auto final_reshape = 
std::make_shared(flat_reshape, dst_reshape_const, false); - - auto model = std::make_shared(ov::OutputVector{ final_reshape }, ov::ParameterVector{ input_param }); - - ov::Core core; - auto compiled_model = core.compile_model(model, "CPU"); - auto infer_request = compiled_model.create_infer_request(); - - ov::Tensor input_tensor(ov::element::f32, input_shape, src0->data); - infer_request.set_input_tensor(0, input_tensor); - - ov::Tensor output_tensor(ov::element::f32, dst_shape, dst->data); - infer_request.set_output_tensor(0, output_tensor); - - infer_request.infer(); - return; - } - - // Case 2: Compatible types, dimensions, and strides - const size_t ne00 = src0->ne[0]; - const size_t ne01 = src0->ne[1]; - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb0 = dst->nb[0]; - - if (src0->type == dst->type && ne00 == dst->ne[0] && nb00 == element_size && nb0 == element_size) { - const size_t valid_elems = static_cast(src0->ne[0]); // 3072 - const size_t num_rows = static_cast(src0->ne[1]); // 7 - const size_t dim2 = static_cast(src0->ne[2]); // 1 - - size_t phys_stride = static_cast(src0->nb[1]) / element_size; // 9216 - // size_t phys_stride = static_cast(src0->ne[0]); // 3072 - - ov::Shape input_shape = { dim2, num_rows, phys_stride }; // 如 {1, 7, 9216 } - ov::Shape logical_shape = { dim2, num_rows, valid_elems }; // {1, 7, 3072} - - // std::cout << "CONT input shape: " << input_shape << std::endl; - auto input_param = std::make_shared(ov::element::f32, input_shape); - - // int64_t split_addr = dst->src[0]->view_offs / dst->src[0]->nb[0]; - // std::vector begin = { 0, 0, split_addr }; - // std::vector end = { static_cast(dim2), - // static_cast(num_rows), - // split_addr + static_cast(valid_elems) }; - - std::vector begin = { 0, 0, 0 }; - std::vector end = { static_cast(dim2), - static_cast(num_rows), - static_cast(valid_elems) }; - std::vector strides = { 1, 1, 1 }; - - auto begin_const = 
ov::op::v0::Constant::create(ov::element::i64, { begin.size() }, begin); - auto end_const = ov::op::v0::Constant::create(ov::element::i64, { end.size() }, end); - auto strides_const = ov::op::v0::Constant::create(ov::element::i64, { strides.size() }, strides); - - std::vector begin_mask = {0, 0, 0}; - std::vector end_mask = {0, 0, 0}; - auto slice = std::make_shared( - input_param, - begin_const, - end_const, - strides_const, - begin_mask, - end_mask - ); - - auto model = std::make_shared(ov::OutputVector{ slice }, - ov::ParameterVector{ input_param }); - - ov::Core core; - auto compiled_model = core.compile_model(model, "CPU"); - auto infer_request = compiled_model.create_infer_request(); - - //[NOTE]: input_shape should be {1, 7, 9216} not the original shap of src0. - ov::Tensor input_tensor(ov::element::f32, input_shape, src0->data); - infer_request.set_input_tensor(0, input_tensor); - - ov::Tensor output_tensor(ov::element::f32, logical_shape, dst->data); - infer_request.set_output_tensor(0, output_tensor); - - infer_request.infer(); - return; - } - - // Case 3: Non-contiguous source, contiguous destination - // dst->ne =[3072,7,1,1], dst->nb =[4,12288,86016,86016], dst->type=GGML_TYPE_F32 - // dst->src[0]->ne=[96,32,7,1], dst->src[0]->nb=[4,2688,384,86016], dst->src[0]->type=GGML_TYPE_F32 - if (ggml_is_contiguous(dst)) { - size_t valid_i = static_cast(src0->ne[0]); // 96 - size_t valid_j = static_cast(src0->ne[1]); // 32 - size_t valid_k = static_cast(src0->ne[2]); // 7 - - ov::Shape src_shape = { valid_k, valid_j, valid_i }; // {7, 32, 96}; - auto src_param = std::make_shared(ov::element::f32, src_shape); - - ov::Shape input_shape = { valid_j, valid_k, valid_i }; // {32, 7, 96} - auto tmp_param = ov::op::v0::Constant::create(ov::element::i64, { input_shape.size() }, input_shape); - auto input_param = std::make_shared(src_param, tmp_param, false); - - // 添加 Transpose 节点,将 {32,7,96} 变换为 {7,32,96},恢复逻辑顺序 - // 这里交换第 0 与第 1 维,即 permutation = {1, 0, 2} - 
std::vector order = {1, 0, 2}; - auto order_const = ov::op::v0::Constant::create(ov::element::i64, {order.size()}, order); - auto transpose = std::make_shared(input_param, order_const); - - ov::Shape target_shape = { static_cast(dst->ne[2]), static_cast(dst->ne[1]), static_cast(dst->ne[0]) }; // {1, 7, 3072} - std::vector target_shape_vec = { static_cast(dst->ne[2]), - static_cast(dst->ne[1]), - static_cast(dst->ne[0]) }; - auto reshape_const = ov::op::v0::Constant::create(ov::element::i64, { target_shape_vec.size() }, target_shape_vec); - auto reshaped = std::make_shared(transpose, reshape_const, false); - - auto model = std::make_shared(ov::OutputVector{ reshaped }, - ov::ParameterVector{ src_param }); - ov::Core core; - auto compiled_model = core.compile_model(model, "CPU"); - auto infer_request = compiled_model.create_infer_request(); - - ov::Tensor input_tensor(ov::element::f32, src_shape, src0->data); - infer_request.set_input_tensor(0, input_tensor); - - ov::Tensor output_tensor(ov::element::f32, target_shape, dst->data); - infer_request.set_output_tensor(0, output_tensor); - - infer_request.infer(); - return; - } -} - -static void ggml_backend_openvino_transpose(ggml_tensor *dst) { - // ov::Core core; - // ov::Shape input_shape{static_cast(dst->src[0]->ne[2]), static_cast(dst->src[0]->ne[1]), static_cast(dst->src[0]->ne[0])}; - // ov::Shape output_shape{static_cast(dst->ne[2]), static_cast(dst->ne[1]), static_cast(dst->ne[0])}; - // auto input_param = std::make_shared(ov::element::f32, input_shape); - - // //auto res = std::make_shared(input_param, ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 2, 1})); - - - - // auto new_shape_node = ov::op::v0::Constant::create(ov::element::i64, - // ov::Shape{output_shape.size()}, - // std::vector(output_shape.begin(), output_shape.end())); - // auto res = std::make_shared(input_param, new_shape_node, false); - - - - - // std::shared_ptr model = std::make_shared(ov::OutputVector{res}, - // 
ov::ParameterVector{input_param}); - // auto compiled_model = core.compile_model(model, "CPU"); - // ov::InferRequest infer_request = compiled_model.create_infer_request(); - - // ov::Tensor input_tensor(ov::element::f32, input_shape, dst->src[0]->data); - // ov::Tensor output_tensor(ov::element::f32, output_shape, dst->data); - // infer_request.set_input_tensor(0, input_tensor); - // infer_request.set_output_tensor(0, output_tensor); - - // infer_request.infer(); - - // NOP - GGML_UNUSED(dst); -} - -void ggml_backend_openvino_cpy(struct ggml_tensor *dst) { - const struct ggml_tensor *src0 = dst->src[0]; - const struct ggml_tensor *src1 = dst->src[1]; - assert(src0 != nullptr); - assert(ggml_nelements(dst) == ggml_nelements(src0)); - - // Extract shapes - ov::Shape src_shape(src0->ne, src0->ne + 4); - ov::Shape dst_shape(dst->ne, dst->ne + 4); - - // Initialize OpenVINO core - ov::Core core; - - // Create OpenVINO parameter for the source tensor - auto src_input = std::make_shared(ov::element::f32, src_shape); - - std::shared_ptr model; - if (ggml_is_contiguous(dst)) { - // Contiguous Case: Flatten src and reshape to dst shape - ov::Shape flattened_shape = {static_cast(ggml_nelements(src0))}; - auto flatten = std::make_shared( - src_input, ov::op::v0::Constant::create(ov::element::i64, {1}, flattened_shape), false); - - auto reshape_to_dst = std::make_shared( - flatten, ov::op::v0::Constant::create(ov::element::i64, {4}, dst_shape), false); - - auto dst_output = std::make_shared(reshape_to_dst, ov::element::f16); - - model = std::make_shared( - ov::ResultVector{std::make_shared(dst_output)}, - ov::ParameterVector{src_input}, - "ContiguousCopy"); - // Compile and execute the model - auto compiled_model = core.compile_model(model, "CPU"); - - ov::Tensor src_tensor(ov::element::f32, src_shape, src0->data); - ov::Tensor dst_tensor(ov::element::f16, dst_shape, dst->data); - - auto infer_request = compiled_model.create_infer_request(); - infer_request.set_input_tensor(0, 
src_tensor); - infer_request.set_output_tensor(0, dst_tensor); - infer_request.infer(); - } else { - int src0_elem_size = ggml_type_size(src0->type); - int src1_elem_size = ggml_type_size(src1->type); - - int src0_logical_cols = src0->ne[0]; - int src0_logical_rows = src0->ne[1]; - int src1_logical_cols = src1->ne[0]; - int src1_logical_rows = src1->ne[1]; - - int src0_phys_cols = src0->nb[0] / src0_elem_size; - int src0_phys_rows = src0_logical_rows; - - int src1_phys_cols = src1->nb[1] / src1_elem_size; - int src1_phys_rows = src1_logical_rows; - - ov::Shape src0_phys_shape = {1, static_cast(src0_phys_rows), static_cast(src0_phys_cols) }; - ov::Shape src1_phys_shape = {1, static_cast(src1_phys_rows), static_cast(src1_phys_cols) }; - - size_t logical_elems = static_cast(src0_logical_cols * src0_logical_rows); - size_t src_flat_size = 1 * src0_phys_cols * src0_phys_rows; - size_t dst_flat_size = 1 * src1_phys_rows * src1_phys_cols; - - ov::Core core; - - std::vector gather_idx; - gather_idx.reserve(logical_elems); - for (int row = 0; row < src0_logical_rows; row++) { - for (int col = 0; col < src0_logical_cols; col++) { - gather_idx.push_back(static_cast(row + col * src0_phys_rows)); - } - } - ov::Shape gather_idx_shape = { logical_elems }; - - std::vector scatter_idx; - scatter_idx.reserve(logical_elems); - for (int row = 0; row < src1_logical_rows; row++) { - for (int col = 0; col < src1_logical_cols; col++) { - scatter_idx.push_back(static_cast(row * src1_phys_cols + col)); - } - } - ov::Shape scatter_idx_shape = { logical_elems, 1 }; - - auto param_src0 = std::make_shared(ov::element::f32, src0_phys_shape); - auto param_src1 = std::make_shared(ov::element::f16, src1_phys_shape); - - auto src_flat_shape_const = ov::op::v0::Constant::create(ov::element::i64, {1}, - { static_cast(src_flat_size) }); - auto reshape_src = std::make_shared(param_src0, src_flat_shape_const, false); - auto dst_flat_shape_const = ov::op::v0::Constant::create(ov::element::i64, {1}, - { 
static_cast(dst_flat_size) }); - auto reshape_dst = std::make_shared(param_src1, dst_flat_shape_const, false); - - auto gather_indices_const = ov::op::v0::Constant::create(ov::element::i64, gather_idx_shape, gather_idx); - auto axis_const = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); - auto gathered = std::make_shared(reshape_src, gather_indices_const, axis_const); - auto converted = std::make_shared(gathered, ov::element::f16); - - auto scatter_indices_const = ov::op::v0::Constant::create(ov::element::i64, scatter_idx_shape, scatter_idx); - auto scatter = std::make_shared(reshape_dst, scatter_indices_const, converted); - - std::vector dst_phys_shape_vec = {1, static_cast(src1_phys_rows), - static_cast(src1_phys_cols) }; - auto dst_phys_shape_const = ov::op::v0::Constant::create(ov::element::i64, {3}, dst_phys_shape_vec); - auto final_output = std::make_shared(scatter, dst_phys_shape_const, false); - - ov::ParameterVector params = { param_src0, param_src1 }; - auto model = std::make_shared(ov::OutputVector{ final_output }, params); - auto compiled_model = core.compile_model(model, "CPU"); - auto infer_request = compiled_model.create_infer_request(); - - ov::Tensor tensor_src(ov::element::f32, src0_phys_shape, src0->data); - ov::Tensor tensor_dst(ov::element::f16, src1_phys_shape, src1->data); - infer_request.set_input_tensor(0, tensor_src); - infer_request.set_input_tensor(1, tensor_dst); - - ov::Tensor out_tensor(ov::element::f16, src1_phys_shape, dst->data); - infer_request.set_output_tensor(0, out_tensor); - - infer_request.infer(); - } -} - -static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - // Find the indices of GGML_OP_CONT, GGML_OP_CPY nodes, GGML_OP_MUL_MAT and so on. 
- std::vector cont_indices; - std::vector reshape_indices; - std::vector view_indices; - std::vector view_indices_prompt; - std::vector view_split; - - std::vector cpy_indices; - std::vector cpy_split_16; - std::vector cpy_split_19; - std::vector transpose_indices; - std::vector permute_indices; - - std::vector mul_mat_indices; - std::vector add_indices; - - for (int i = 0; i < cgraph->n_nodes; i++) { - if (cgraph->nodes[i]->op == GGML_OP_CONT) { - cont_indices.push_back(i); - } else if (cgraph->nodes[i]->op == GGML_OP_RESHAPE) { - reshape_indices.push_back(i); - // } else if (cgraph->nodes[i]->op == GGML_OP_VIEW) { - } else if (cgraph->nodes[i]->op == GGML_OP_VIEW) { - // if (cgraph->nodes[i]->src[0]->ne[0] == 98304 && (cgraph->nodes[i]->ne[0] == 3072 || cgraph->nodes[i]->ne[0] == 1)) - // continue; - view_indices.push_back(i); - if (cgraph->nodes[i]->ne[0] == 32) { - view_indices_prompt.push_back(i); - } - if (i == 18) { - view_split.push_back(i); - } - } else if (cgraph->nodes[i]->op == GGML_OP_CPY) { - cpy_indices.push_back(i); - if (i == 16) { - cpy_split_16.push_back(i); - } - if (i == 19) { - cpy_split_19.push_back(i); - } - } else if (cgraph->nodes[i]->op == GGML_OP_TRANSPOSE) { - transpose_indices.push_back(i); - } else if (cgraph->nodes[i]->op == GGML_OP_PERMUTE) { - permute_indices.push_back(i); - } else if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT) { - mul_mat_indices.push_back(i); - } else if (cgraph->nodes[i]->op == GGML_OP_ADD) { - add_indices.push_back(i); - } - } - - - // Process nodes in order - - if (cgraph->nodes[0]->ne[1] == 1) { - for (int i = 0; i < cgraph->n_nodes; i++) { - if (std::find(add_indices.begin(), add_indices.end(), i) != add_indices.end()) { - ggml_backend_openvino_add_forward(cgraph->nodes[i]); - } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) { - ggml_backend_openvino_transpose(cgraph->nodes[i]); - } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != 
cpy_indices.end()) { - ggml_backend_openvino_cpy(cgraph->nodes[i]); - } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { - ggml_backend_openvino_view(cgraph->nodes[i]); - } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { - ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); - } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { - ggml_backend_openvino_reshape(cgraph->nodes[i]); - } else { - // Process a range of nodes with openvino_frontend_compute - int start_index = i; - while (i < cgraph->n_nodes - && std::find(add_indices.begin(), add_indices.end(), i) == add_indices.end() - && std::find(transpose_indices.begin(), transpose_indices.end(), i) == transpose_indices.end() - && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() - && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() - && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() - && std::find(reshape_indices.begin(), reshape_indices.end(), i) == reshape_indices.end() - ) { - i++; - } - if (start_index < i) { - openvino_frontend_compute(backend, cgraph, start_index, --i); - } - } - } - } else { - int end_node = cgraph->n_nodes - 1; - openvino_frontend_compute(backend, cgraph, 0, end_node); - } - return GGML_STATUS_SUCCESS; - GGML_UNUSED(backend); - GGML_UNUSED(ctx); } static const ggml_backend_i ggml_backend_openvino_interface = { @@ -1265,53 +250,15 @@ static ggml_backend_buffer_t ggml_backend_openvino_device_buffer_from_host_ptr(g static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { GGML_ASSERT(dev->reg != nullptr); -#ifdef OPENVINO_OP_DEBUG -static const std::set& openvino_ops = []() -> const std::set& { - static const std::set ops = get_openvino_available_opsets(); - return ops; - }(); - switch (op->op) { - case GGML_OP_NONE: - case GGML_OP_PERMUTE: - case 
GGML_OP_RESHAPE: - case GGML_OP_TRANSPOSE: - case GGML_OP_VIEW: - return true; - case GGML_OP_ADD: - return true; - case GGML_OP_MUL: - case GGML_OP_MUL_MAT: - return false; - case GGML_OP_UNARY: - switch (ggml_get_unary_op(op)) - { - case GGML_UNARY_OP_SILU: - return true; - case GGML_UNARY_OP_ABS: - case GGML_UNARY_OP_SGN: - case GGML_UNARY_OP_NEG: - case GGML_UNARY_OP_STEP: - case GGML_UNARY_OP_TANH: - case GGML_UNARY_OP_ELU: - case GGML_UNARY_OP_RELU: - case GGML_UNARY_OP_SIGMOID: - case GGML_UNARY_OP_GELU: - case GGML_UNARY_OP_GELU_QUICK: - case GGML_UNARY_OP_HARDSWISH: - case GGML_UNARY_OP_HARDSIGMOID: - case GGML_UNARY_OP_EXP: - case GGML_UNARY_OP_COUNT: - return false; - } - return false; - default: - return false; - } -#else - static const std::set& openvino_ops = []() -> const std::set& { - static const std::set ops = get_openvino_available_opsets(); - return ops; - }(); + static const std::set supported_ops{ + GGML_OP_ADD, GGML_OP_MUL, GGML_OP_MUL_MAT, GGML_OP_VIEW, + GGML_OP_CONT, GGML_OP_CPY, GGML_OP_RESHAPE, GGML_OP_PERMUTE, + GGML_OP_TRANSPOSE, GGML_OP_GET_ROWS, GGML_OP_ROPE, GGML_OP_RMS_NORM, + GGML_OP_SCALE, GGML_OP_SOFT_MAX, + }; + static const std::set supported_unary_ops{ + GGML_UNARY_OP_SILU, + }; if (op->op == GGML_OP_UNARY) { return supported_unary_ops.find(ggml_get_unary_op(op)) != @@ -1457,5 +404,4 @@ GGML_API ggml_backend_reg_t ggml_backend_openvino_reg(void) { } return ® -} - +} \ No newline at end of file diff --git a/ggml/src/ggml-openvino/CMakeLists.txt b/ggml/src/ggml-openvino/CMakeLists.txt new file mode 100644 index 0000000000..75b1144843 --- /dev/null +++ b/ggml/src/ggml-openvino/CMakeLists.txt @@ -0,0 +1,42 @@ +find_package(OpenVINO REQUIRED) +list(APPEND GGML_EXTRA_LIBS_PRIVATE openvino::runtime) + +# Set header and libs +file(GLOB GGML_HEADERS_OPENVINO "ggml-openvino/*.h") +list(APPEND GGML_HEADERS_OPENVINO "../include/ggml-openvino.h") +file(GLOB GGML_SOURCES_OPENVINO "ggml-openvino/*.cpp") +list(APPEND GGML_SOURCES_OPENVINO 
"ggml-openvino.cpp") + +list(APPEND GGML_CDEF_PUBLIC GGML_USE_OPENVINO) + +if (OPENVINO_DEVICE) + if (OPENVINO_DEVICE STREQUAL "GPU") + add_compile_definitions(GGML_OPENVINO_GPU) + elseif (OPENVINO_DEVICE STREQUAL "NPU") + add_compile_definitions(GGML_OPENVINO_NPU) + endif() +endif() + +if(NOT DEFINED GGML_OV_FRONTEND) + set(GGML_OV_FRONTEND OpenVINO_DIR) +endif() +add_definitions(-DGGML_OV_FRONTEND="${GGML_OV_FRONTEND}") + +if (OpenVINO_DIR) + if (GGML_OPENVINO) + if (NOT UNIX) + set(GGML_OPENVINO OFF) + message(WARNING "OpenVINO: OpenVINO toolkit supports unix but not ${CMAKE_SYSTEM_NAME}. Turning off GGML_OPENVINO") + endif() + endif() + + if (GGML_OPENVINO) + if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64") + elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "amd64") + else() + set(GGML_OPENVINO OFF) + message(WARNING "OpenVINO: OpenVINO toolkit supports x86-64 and arm64 but not ${CMAKE_SYSTEM_PROCESSOR}. Turning off GGML_OPENVINO") + endif() + endif() + +endif() diff --git a/ggml/src/ggml-openvino/decoder.h b/ggml/src/ggml-openvino/decoder.h index 790ed2e88d..3404e7c211 100644 --- a/ggml/src/ggml-openvino/decoder.h +++ b/ggml/src/ggml-openvino/decoder.h @@ -1,9 +1,8 @@ #pragma once #include - -#include "openvino/core/node.hpp" -#include "openvino/frontend/decoder.hpp" +#include +#include namespace ov { namespace frontend { @@ -43,11 +42,7 @@ public: virtual std::string& get_output_name(size_t index) const = 0; - virtual size_t get_output_size() const = 0; - - virtual bool is_graph_output(size_t index) const = 0; - - virtual std::string& get_output_name(size_t index) const = 0; + virtual std::vector get_output_names() const = 0; virtual const std::string& get_op_type() const = 0; @@ -65,4 +60,4 @@ public: } // namespace ggml } // namespace frontend -} // namespace ov +} // namespace ov \ No newline at end of file diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 
372f880b1d..28409186f8 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -354,7 +354,7 @@ std::vector GgmlOvDecoder::get_shape(const ggml_tensor* tensor) { std::vector GgmlOvDecoder::get_stride(const ggml_tensor* tensor) { std::vector stride; - for (int i = GGML_MAX_DIMS - 2; i >= 0 ; --i) { + for (int i = GGML_MAX_DIMS - 2; i >= 0; --i) { stride.push_back(static_cast(tensor->nb[i])); } return stride; @@ -448,27 +448,16 @@ void GgmlOvDecoder::visit_subgraph(std::function opTypeMap = { - {GGML_OP_ACC, "GGML_OP_ACC"}, - {GGML_OP_ADD, "GGML_OP_ADD"}, - {GGML_OP_ADD1, "GGML_OP_ADD1"}, - {GGML_OP_CONT, "GGML_OP_CONT"}, - {GGML_OP_CPY, "GGML_OP_CPY"}, - {GGML_OP_DIV, "GGML_OP_DIV"}, - {GGML_OP_DUP, "GGML_OP_DUP"}, - {GGML_OP_GET_ROWS, "GGML_OP_GET_ROWS"}, - {GGML_OP_MUL, "GGML_OP_MUL"}, - {GGML_OP_MUL_MAT, "GGML_OP_MUL_MAT"}, - {GGML_OP_PERMUTE, "GGML_OP_PERMUTE"}, - {GGML_OP_RESHAPE, "GGML_OP_RESHAPE"}, - {GGML_OP_RMS_NORM, "GGML_OP_RMS_NORM"}, - {GGML_OP_ROPE, "GGML_OP_ROPE"}, - {GGML_OP_SCALE, "GGML_OP_SCALE"}, - {GGML_OP_SOFT_MAX, "GGML_OP_SOFT_MAX"}, - {GGML_OP_SUB, "GGML_OP_SUB"}, - {GGML_OP_TRANSPOSE, "GGML_OP_TRANSPOSE"}, - {GGML_OP_UNARY, "GGML_OP_UNARY"}, - {GGML_OP_VIEW, "GGML_OP_VIEW"} - }; + {GGML_OP_ACC, "GGML_OP_ACC"}, {GGML_OP_ADD, "GGML_OP_ADD"}, + {GGML_OP_ADD1, "GGML_OP_ADD1"}, {GGML_OP_CONT, "GGML_OP_CONT"}, + {GGML_OP_CPY, "GGML_OP_CPY"}, {GGML_OP_DIV, "GGML_OP_DIV"}, + {GGML_OP_DUP, "GGML_OP_DUP"}, {GGML_OP_GET_ROWS, "GGML_OP_GET_ROWS"}, + {GGML_OP_MUL, "GGML_OP_MUL"}, {GGML_OP_MUL_MAT, "GGML_OP_MUL_MAT"}, + {GGML_OP_PERMUTE, "GGML_OP_PERMUTE"}, {GGML_OP_RESHAPE, "GGML_OP_RESHAPE"}, + {GGML_OP_RMS_NORM, "GGML_OP_RMS_NORM"}, {GGML_OP_ROPE, "GGML_OP_ROPE"}, + {GGML_OP_SCALE, "GGML_OP_SCALE"}, {GGML_OP_SOFT_MAX, "GGML_OP_SOFT_MAX"}, + {GGML_OP_SUB, "GGML_OP_SUB"}, {GGML_OP_TRANSPOSE, "GGML_OP_TRANSPOSE"}, + {GGML_OP_UNARY, "GGML_OP_UNARY"}, {GGML_OP_VIEW, "GGML_OP_VIEW"}}; static const std::map 
unaryOpTypeMap = { {GGML_UNARY_OP_ABS, "GGML_UNARY_OP_ABS"}, {GGML_UNARY_OP_SGN, "GGML_UNARY_OP_SGN"}, @@ -484,8 +473,7 @@ const std::string& GgmlOvDecoder::get_op_type() const { {GGML_UNARY_OP_HARDSWISH, "GGML_UNARY_OP_HARDSWISH"}, {GGML_UNARY_OP_HARDSIGMOID, "GGML_UNARY_OP_HARDSIGMOID"}, {GGML_UNARY_OP_EXP, "GGML_UNARY_OP_EXP"}, - {GGML_UNARY_OP_COUNT, "GGML_UNARY_OP_COUNT"} - }; + {GGML_UNARY_OP_COUNT, "GGML_UNARY_OP_COUNT"}}; auto it = opTypeMap.find(m_node->op); if (it != opTypeMap.end()) { if (it->first == GGML_OP_UNARY) { @@ -498,4 +486,4 @@ const std::string& GgmlOvDecoder::get_op_type() const { } static const std::string unknown_op = "UNKNOWN_OP"; return unknown_op; -} +} \ No newline at end of file diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 22ff9d85f7..a0f6cbea30 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -53,11 +53,7 @@ public: virtual std::string& get_output_name(size_t index) const override; - virtual size_t get_output_size() const override; - - virtual bool is_graph_output(size_t index) const override; - - virtual std::string& get_output_name(size_t index) const override; + virtual std::vector get_output_names() const override; virtual const std::string& get_op_type() const override; @@ -105,10 +101,10 @@ private: void set_max_token_len(); int64_t m_max_token_len; - struct ggml_cgraph * m_cgraph; - std::map m_inputs; + struct ggml_cgraph* m_cgraph; + std::map m_inputs; std::vector m_input_names; - std::map m_outputs; + std::map m_outputs; std::vector m_output_names; ggml_tensor* m_node; std::vector m_nodes; @@ -123,4 +119,4 @@ private: std::vector m_model_output_names; }; -void print_tensor_address_map(const struct ggml_cgraph* cgraph); +void print_tensor_address_map(const struct ggml_cgraph* cgraph); \ No newline at end of file diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 6166161c41..f36700d5ec 100644 
--- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -42,12 +42,7 @@ std::map get_ggml_graph_output_dst(std::shared_ptr& output_dst); + std::map& output_dst); \ No newline at end of file