Rebase - Bring up to date and fix build process

Authored by Viraj Wadhwa on 2025-05-09 11:37:10 -07:00, committed by Mustafa Cavus
parent a8e5efa44e
commit ffabe95e2a
11 changed files with 152 additions and 1116 deletions

View File

@ -681,6 +681,67 @@ Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/m
To read documentation for how to build on IBM Z & LinuxONE, [click here](./build-s390x.md)
## OPENVINO
### Build openvino-llama
```bash
git lfs install --skip-smudge
git clone https://github.com/intel-sandbox/openvino-llama.git -b dev_ggml_frontend
cd openvino-llama
git submodule update --init --recursive
export OPENVINO_LLAMA_PATH=$(pwd)
cmake --preset Release
cmake --build build/Release
```
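Note: the `OPENVINO_LLAMA_PATH` export above is presumably consumed when configuring llama.cpp-ov, so run the next section in the same shell (an assumption; the preset definitions are not shown here).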
### Build llama.cpp-ov
```bash
git clone https://github.com/intel-sandbox/llama.cpp-ov.git -b dev_backend_openvino
cd llama.cpp-ov
cmake --preset ReleaseOV
cmake --build build/ReleaseOV
```
Download the test model [Phi-3-mini-4k-instruct-fp16.gguf](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf) from the Hugging Face website.
```bash
mkdir -p ~/models/Phi-3-mini-4k-instruct-gguf
wget "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-fp16.gguf?download=true" -O ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf
```
Run the following command to test:
```bash
export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache
# Currently, setting GGML_OPENVINO_WEIGHT_AS_INPUT gives better performance
export GGML_OPENVINO_WEIGHT_AS_INPUT=1
./build/ReleaseOV/bin/llama-simple -m ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf -n 10 "Hello, my name is "
```
Environment variables (see the example after this list):
- `GGML_OPENVINO_WEIGHT_AS_INPUT`:
  Pass the weights as inputs to the OpenVINO model instead of creating Constant nodes for them.
- `GGML_OPENVINO_CACHE_DIR`:
  If set, OpenVINO model caching is enabled and the cache is stored in this directory.
- `GGML_OPENVINO_DUMP_CGRAPH`:
  Dump the compute graph to `cgraph.txt`. Note that the compute graph differs for every token, so each dump overwrites the previous one.
- `GGML_OPENVINO_PROFILING`:
  Print the time taken by each phase in the OpenVINO backend.
- `GGML_OPENVINO_DUMP_IR`:
  Dump the converted OpenVINO IR; the filenames are timestamps.
- `GGML_OPENVINO_DEBUG_INPUT`:
  Print input tensor values for debugging.
- `GGML_OPENVINO_DEBUG_OUTPUT`:
  Print output tensor values for debugging.
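For example, a debugging run that enables profiling and dumps both the compute graph and the converted IR (a minimal sketch; it reuses the model path from the download step above):
```bash
export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache
export GGML_OPENVINO_PROFILING=1
export GGML_OPENVINO_DUMP_CGRAPH=1
export GGML_OPENVINO_DUMP_IR=1
./build/ReleaseOV/bin/llama-simple -m ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf -n 10 "Hello, my name is "
```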
To use llama.cpp's built-in CPU backend:
```bash
cmake --preset ReleaseCPU
cmake --build build/ReleaseCPU
./build/ReleaseCPU/bin/llama-simple -m ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf -n 10 "Hello, my name is "
```
## Notes about GPU-accelerated backends
The GPU may still be used to accelerate some parts of the computation even when using the `-ngl 0` option. You can fully disable GPU acceleration by using `--device none`.
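For example (a sketch assuming `llama-cli` was built by the same preset; `--device none` is the documented switch above):
```bash
./build/ReleaseOV/bin/llama-cli -m ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf \
  -ngl 0 --device none -p "Hello, my name is "
```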

View File

@ -246,6 +246,10 @@ set (GGML_SYCL_TARGET "INTEL" CACHE STRING
set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
"ggml: sycl device architecture")
option(GGML_OPENVINO "ggml: use OPENVINO" OFF)
option(GGML_OPENVINO_DEBUG "ggml: enable OPENVINO debugging" OFF)
set (GGML_OV_FRONTEND "" CACHE STRING "ggml: OPENVINO frontend path") # a path, so a cache STRING rather than a boolean option
option(GGML_OPENCL "ggml: use OpenCL" OFF)
option(GGML_OPENCL_PROFILING "ggml: use OpenCL profiling (increases overhead)" OFF)
option(GGML_OPENCL_EMBED_KERNELS "ggml: embed kernels" ON)
@ -324,6 +328,7 @@ set(GGML_PUBLIC_HEADERS
include/ggml-vulkan.h
include/ggml-webgpu.h
include/ggml-zendnn.h
include/ggml-openvino.h
include/gguf.h)
set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
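For reference, a minimal sketch of enabling the new backend in a plain out-of-tree CMake configure (the ReleaseOV preset used earlier is assumed to set these flags for you):
```bash
cmake -B build -DGGML_OPENVINO=ON -DGGML_OPENVINO_DEBUG=ON
cmake --build build
```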

View File

@ -458,6 +458,7 @@ ggml_add_backend(zDNN)
ggml_add_backend(OpenCL)
ggml_add_backend(Hexagon)
ggml_add_backend(ZenDNN)
ggml_add_backend(OPENVINO)
foreach (target ggml-base ggml)
target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)

View File

@ -77,6 +77,10 @@
#include "ggml-zendnn.h"
#endif
#ifdef GGML_USE_OPENVINO
#include "ggml-openvino.h"
#endif
// disable C++17 deprecation warning for std::codecvt_utf8
#if defined(__clang__)
# pragma clang diagnostic push
@ -222,6 +226,9 @@ struct ggml_backend_registry {
#ifdef GGML_USE_RPC
register_backend(ggml_backend_rpc_reg());
#endif
#ifdef GGML_USE_OPENVINO
register_backend(ggml_backend_openvino_reg());
#endif
#ifdef GGML_USE_CPU
register_backend(ggml_backend_cpu_reg());
#endif

File diff suppressed because it is too large

View File

@ -0,0 +1,42 @@
find_package(OpenVINO REQUIRED)
list(APPEND GGML_EXTRA_LIBS_PRIVATE openvino::runtime)
# Set header and libs
file(GLOB GGML_HEADERS_OPENVINO "ggml-openvino/*.h")
list(APPEND GGML_HEADERS_OPENVINO "../include/ggml-openvino.h")
file(GLOB GGML_SOURCES_OPENVINO "ggml-openvino/*.cpp")
list(APPEND GGML_SOURCES_OPENVINO "ggml-openvino.cpp")
list(APPEND GGML_CDEF_PUBLIC GGML_USE_OPENVINO)
if (OPENVINO_DEVICE)
    if (OPENVINO_DEVICE STREQUAL "GPU")
        add_compile_definitions(GGML_OPENVINO_GPU)
    elseif (OPENVINO_DEVICE STREQUAL "NPU")
        add_compile_definitions(GGML_OPENVINO_NPU)
    endif()
endif()
if(NOT DEFINED GGML_OV_FRONTEND)
    # Default the frontend path to the OpenVINO install location
    set(GGML_OV_FRONTEND ${OpenVINO_DIR})
endif()
add_definitions(-DGGML_OV_FRONTEND="${GGML_OV_FRONTEND}")
if (OpenVINO_DIR AND GGML_OPENVINO)
    if (NOT UNIX)
        set(GGML_OPENVINO OFF)
        message(WARNING "OpenVINO: the OpenVINO toolkit supports Unix-like systems but not ${CMAKE_SYSTEM_NAME}. Turning off GGML_OPENVINO")
    elseif (NOT CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|x86_64|amd64)$")
        set(GGML_OPENVINO OFF)
        message(WARNING "OpenVINO: the OpenVINO toolkit supports x86-64 and arm64 but not ${CMAKE_SYSTEM_PROCESSOR}. Turning off GGML_OPENVINO")
    endif()
endif()
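A hypothetical configure line exercising the device selection above (`OPENVINO_DEVICE` maps to the GGML_OPENVINO_GPU/GGML_OPENVINO_NPU compile definitions; pairing it with the ReleaseOV preset is an assumption):
```bash
cmake --preset ReleaseOV -DOPENVINO_DEVICE=GPU
cmake --build build/ReleaseOV
```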

View File

@ -1,9 +1,8 @@
#pragma once
#include <map>
#include "openvino/core/node.hpp"
#include "openvino/frontend/decoder.hpp"
#include <openvino/core/node.hpp>
#include <openvino/frontend/decoder.hpp>
namespace ov {
namespace frontend {
@ -43,11 +42,7 @@ public:
virtual std::string& get_output_name(size_t index) const = 0;
virtual size_t get_output_size() const = 0;
virtual bool is_graph_output(size_t index) const = 0;
virtual std::string& get_output_name(size_t index) const = 0;
virtual std::vector<std::string> get_output_names() const = 0;
virtual const std::string& get_op_type() const = 0;
@ -65,4 +60,4 @@ public:
} // namespace ggml
} // namespace frontend
} // namespace ov
} // namespace ov

View File

@ -354,7 +354,7 @@ std::vector<size_t> GgmlOvDecoder::get_shape(const ggml_tensor* tensor) {
std::vector<size_t> GgmlOvDecoder::get_stride(const ggml_tensor* tensor) {
std::vector<size_t> stride;
for (int i = GGML_MAX_DIMS - 2; i >= 0 ; --i) {
for (int i = GGML_MAX_DIMS - 2; i >= 0; --i) {
stride.push_back(static_cast<size_t>(tensor->nb[i]));
}
return stride;
@ -448,27 +448,16 @@ void GgmlOvDecoder::visit_subgraph(std::function<void(std::shared_ptr<GgmlDecode
const std::string& GgmlOvDecoder::get_op_type() const {
static const std::map<ggml_op, std::string> opTypeMap = {
{GGML_OP_ACC, "GGML_OP_ACC"},
{GGML_OP_ADD, "GGML_OP_ADD"},
{GGML_OP_ADD1, "GGML_OP_ADD1"},
{GGML_OP_CONT, "GGML_OP_CONT"},
{GGML_OP_CPY, "GGML_OP_CPY"},
{GGML_OP_DIV, "GGML_OP_DIV"},
{GGML_OP_DUP, "GGML_OP_DUP"},
{GGML_OP_GET_ROWS, "GGML_OP_GET_ROWS"},
{GGML_OP_MUL, "GGML_OP_MUL"},
{GGML_OP_MUL_MAT, "GGML_OP_MUL_MAT"},
{GGML_OP_PERMUTE, "GGML_OP_PERMUTE"},
{GGML_OP_RESHAPE, "GGML_OP_RESHAPE"},
{GGML_OP_RMS_NORM, "GGML_OP_RMS_NORM"},
{GGML_OP_ROPE, "GGML_OP_ROPE"},
{GGML_OP_SCALE, "GGML_OP_SCALE"},
{GGML_OP_SOFT_MAX, "GGML_OP_SOFT_MAX"},
{GGML_OP_SUB, "GGML_OP_SUB"},
{GGML_OP_TRANSPOSE, "GGML_OP_TRANSPOSE"},
{GGML_OP_UNARY, "GGML_OP_UNARY"},
{GGML_OP_VIEW, "GGML_OP_VIEW"}
};
{GGML_OP_ACC, "GGML_OP_ACC"}, {GGML_OP_ADD, "GGML_OP_ADD"},
{GGML_OP_ADD1, "GGML_OP_ADD1"}, {GGML_OP_CONT, "GGML_OP_CONT"},
{GGML_OP_CPY, "GGML_OP_CPY"}, {GGML_OP_DIV, "GGML_OP_DIV"},
{GGML_OP_DUP, "GGML_OP_DUP"}, {GGML_OP_GET_ROWS, "GGML_OP_GET_ROWS"},
{GGML_OP_MUL, "GGML_OP_MUL"}, {GGML_OP_MUL_MAT, "GGML_OP_MUL_MAT"},
{GGML_OP_PERMUTE, "GGML_OP_PERMUTE"}, {GGML_OP_RESHAPE, "GGML_OP_RESHAPE"},
{GGML_OP_RMS_NORM, "GGML_OP_RMS_NORM"}, {GGML_OP_ROPE, "GGML_OP_ROPE"},
{GGML_OP_SCALE, "GGML_OP_SCALE"}, {GGML_OP_SOFT_MAX, "GGML_OP_SOFT_MAX"},
{GGML_OP_SUB, "GGML_OP_SUB"}, {GGML_OP_TRANSPOSE, "GGML_OP_TRANSPOSE"},
{GGML_OP_UNARY, "GGML_OP_UNARY"}, {GGML_OP_VIEW, "GGML_OP_VIEW"}};
static const std::map<ggml_unary_op, std::string> unaryOpTypeMap = {
{GGML_UNARY_OP_ABS, "GGML_UNARY_OP_ABS"},
{GGML_UNARY_OP_SGN, "GGML_UNARY_OP_SGN"},
@ -484,8 +473,7 @@ const std::string& GgmlOvDecoder::get_op_type() const {
{GGML_UNARY_OP_HARDSWISH, "GGML_UNARY_OP_HARDSWISH"},
{GGML_UNARY_OP_HARDSIGMOID, "GGML_UNARY_OP_HARDSIGMOID"},
{GGML_UNARY_OP_EXP, "GGML_UNARY_OP_EXP"},
{GGML_UNARY_OP_COUNT, "GGML_UNARY_OP_COUNT"}
};
{GGML_UNARY_OP_COUNT, "GGML_UNARY_OP_COUNT"}};
auto it = opTypeMap.find(m_node->op);
if (it != opTypeMap.end()) {
if (it->first == GGML_OP_UNARY) {
@ -498,4 +486,4 @@ const std::string& GgmlOvDecoder::get_op_type() const {
}
static const std::string unknown_op = "UNKNOWN_OP";
return unknown_op;
}
}

View File

@ -53,11 +53,7 @@ public:
virtual std::string& get_output_name(size_t index) const override;
virtual size_t get_output_size() const override;
virtual bool is_graph_output(size_t index) const override;
virtual std::string& get_output_name(size_t index) const override;
virtual std::vector<std::string> get_output_names() const override;
virtual const std::string& get_op_type() const override;
@ -105,10 +101,10 @@ private:
void set_max_token_len();
int64_t m_max_token_len;
struct ggml_cgraph * m_cgraph;
std::map<std::string, ggml_tensor *> m_inputs;
struct ggml_cgraph* m_cgraph;
std::map<std::string, ggml_tensor*> m_inputs;
std::vector<std::string> m_input_names;
std::map<std::string, ggml_tensor *> m_outputs;
std::map<std::string, ggml_tensor*> m_outputs;
std::vector<std::string> m_output_names;
ggml_tensor* m_node;
std::vector<ggml_tensor*> m_nodes;
@ -123,4 +119,4 @@ private:
std::vector<std::string> m_model_output_names;
};
void print_tensor_address_map(const struct ggml_cgraph* cgraph);
void print_tensor_address_map(const struct ggml_cgraph* cgraph);

View File

@ -42,12 +42,7 @@ std::map<std::string, void*> get_ggml_graph_output_dst(std::shared_ptr<GgmlOvDec
static ov::frontend::FrontEnd::Ptr get_ggml_frontend() {
auto fem = ov::frontend::FrontEndManager();
std::string fe_so_path;
#ifdef GGML_OV_FRONTEND
fe_so_path = GGML_OV_FRONTEND;
#endif
fem.register_front_end("ggml", fe_so_path);
front_end = fem.load_by_framework("ggml");
auto front_end = fem.load_by_framework("ggml");
return front_end;
}
@ -204,4 +199,4 @@ void print_output_tensor_info(const std::string& name,
default:
break;
}
}
}

View File

@ -1,5 +1,5 @@
#include "ggml-decoder.h"
#include "ggml-backend-impl.h"
#include "ggml-decoder.h"
enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph);
@ -9,4 +9,4 @@ void print_input_tensor_info(const std::string& name, const ov::Tensor& tensor);
void print_output_tensor_info(const std::string& name,
const ov::Tensor& tensor,
std::map<std::string, void*>& output_dst);
std::map<std::string, void*>& output_dst);