Rebase - Bring up to date and fix build process

Authored by Viraj Wadhwa on 2025-05-09 11:37:10 -07:00, committed by Mustafa Cavus
parent a8e5efa44e
commit ffabe95e2a
11 changed files with 152 additions and 1116 deletions

View File

@ -681,6 +681,67 @@ Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/m
To read documentation for how to build on IBM Z & LinuxONE, [click here](./build-s390x.md)
## OPENVINO
### Build openvino-llama
```bash
git lfs install --skip-smudge
git clone https://github.com/intel-sandbox/openvino-llama.git -b dev_ggml_frontend
cd openvino-llama
git submodule update --init --recursive
export OPENVINO_LLAMA_PATH=$(pwd)
cmake --preset Release
cmake --build build/Release
```
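Note: the `OPENVINO_LLAMA_PATH` export above is presumably consumed when configuring llama.cpp-ov, so run the next section in the same shell (an assumption; the preset definitions are not shown here).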
### Build llama.cpp-ov
```bash
git clone https://github.com/intel-sandbox/llama.cpp-ov.git -b dev_backend_openvino
cd llama.cpp-ov
cmake --preset ReleaseOV
cmake --build build/ReleaseOV
```
Download the test model [Phi-3-mini-4k-instruct-fp16.gguf](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf) from the Hugging Face website.
```bash
mkdir -p ~/models/Phi-3-mini-4k-instruct-gguf
wget "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-fp16.gguf?download=true" -O ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf
```
Run the following command to test:
```bash
export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache
# Currently, setting GGML_OPENVINO_WEIGHT_AS_INPUT gives better performance
export GGML_OPENVINO_WEIGHT_AS_INPUT=1
./build/ReleaseOV/bin/llama-simple -m ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf -n 10 "Hello, my name is "
```
Environment variables (see the example after this list):
- `GGML_OPENVINO_WEIGHT_AS_INPUT`:
  Pass the weights as inputs to the OpenVINO model instead of creating Constant nodes for them.
- `GGML_OPENVINO_CACHE_DIR`:
  If set, OpenVINO model caching is enabled and the cache is stored in this directory.
- `GGML_OPENVINO_DUMP_CGRAPH`:
  Dump the compute graph to `cgraph.txt`. Note that the compute graph differs for every token, so each dump overwrites the previous one.
- `GGML_OPENVINO_PROFILING`:
  Print the time taken by each phase in the OpenVINO backend.
- `GGML_OPENVINO_DUMP_IR`:
  Dump the converted OpenVINO IR; the filenames are timestamps.
- `GGML_OPENVINO_DEBUG_INPUT`:
  Print input tensor values for debugging.
- `GGML_OPENVINO_DEBUG_OUTPUT`:
  Print output tensor values for debugging.
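For example, a debugging run that enables profiling and dumps both the compute graph and the converted IR (a minimal sketch; it reuses the model path from the download step above):
```bash
export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache
export GGML_OPENVINO_PROFILING=1
export GGML_OPENVINO_DUMP_CGRAPH=1
export GGML_OPENVINO_DUMP_IR=1
./build/ReleaseOV/bin/llama-simple -m ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf -n 10 "Hello, my name is "
```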
To use llama.cpp's built-in CPU backend:
```bash
cmake --preset ReleaseCPU
cmake --build build/ReleaseCPU
./build/ReleaseCPU/bin/llama-simple -m ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf -n 10 "Hello, my name is "
```
## Notes about GPU-accelerated backends
The GPU may still be used to accelerate some parts of the computation even when using the `-ngl 0` option. You can fully disable GPU acceleration by using `--device none`.
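For example (a sketch assuming `llama-cli` was built by the same preset; `--device none` is the documented switch above):
```bash
./build/ReleaseOV/bin/llama-cli -m ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf \
  -ngl 0 --device none -p "Hello, my name is "
```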

View File

@ -246,6 +246,10 @@ set (GGML_SYCL_TARGET "INTEL" CACHE STRING
set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
"ggml: sycl device architecture")
option(GGML_OPENVINO "ggml: use OPENVINO" OFF)
option(GGML_OPENVINO_DEBUG "ggml: enable OPENVINO debugging" OFF)
set (GGML_OV_FRONTEND "" CACHE STRING "ggml: OPENVINO frontend path") # a path, so a cache STRING rather than a boolean option
option(GGML_OPENCL "ggml: use OpenCL" OFF)
option(GGML_OPENCL_PROFILING "ggml: use OpenCL profiling (increases overhead)" OFF)
option(GGML_OPENCL_EMBED_KERNELS "ggml: embed kernels" ON)
@ -324,6 +328,7 @@ set(GGML_PUBLIC_HEADERS
include/ggml-vulkan.h
include/ggml-webgpu.h
include/ggml-zendnn.h
include/ggml-openvino.h
include/gguf.h)
set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
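For reference, a minimal sketch of enabling the new backend in a plain out-of-tree CMake configure (the ReleaseOV preset used earlier is assumed to set these flags for you):
```bash
cmake -B build -DGGML_OPENVINO=ON -DGGML_OPENVINO_DEBUG=ON
cmake --build build
```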

View File

@ -458,6 +458,7 @@ ggml_add_backend(zDNN)
ggml_add_backend(OpenCL)
ggml_add_backend(Hexagon)
ggml_add_backend(ZenDNN)
ggml_add_backend(OPENVINO)
foreach (target ggml-base ggml)
target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)

View File

@ -77,6 +77,10 @@
#include "ggml-zendnn.h"
#endif
#ifdef GGML_USE_OPENVINO
#include "ggml-openvino.h"
#endif
// disable C++17 deprecation warning for std::codecvt_utf8
#if defined(__clang__)
# pragma clang diagnostic push
@ -222,6 +226,9 @@ struct ggml_backend_registry {
#ifdef GGML_USE_RPC
register_backend(ggml_backend_rpc_reg());
#endif
#ifdef GGML_USE_OPENVINO
register_backend(ggml_backend_openvino_reg());
#endif
#ifdef GGML_USE_CPU
register_backend(ggml_backend_cpu_reg());
#endif

File diff suppressed because it is too large

View File

@ -0,0 +1,42 @@
find_package(OpenVINO REQUIRED)
list(APPEND GGML_EXTRA_LIBS_PRIVATE openvino::runtime)
# Set header and libs
file(GLOB GGML_HEADERS_OPENVINO "ggml-openvino/*.h")
list(APPEND GGML_HEADERS_OPENVINO "../include/ggml-openvino.h")
file(GLOB GGML_SOURCES_OPENVINO "ggml-openvino/*.cpp")
list(APPEND GGML_SOURCES_OPENVINO "ggml-openvino.cpp")
list(APPEND GGML_CDEF_PUBLIC GGML_USE_OPENVINO)
if (OPENVINO_DEVICE)
    if (OPENVINO_DEVICE STREQUAL "GPU")
        add_compile_definitions(GGML_OPENVINO_GPU)
    elseif (OPENVINO_DEVICE STREQUAL "NPU")
        add_compile_definitions(GGML_OPENVINO_NPU)
    endif()
endif()
if(NOT DEFINED GGML_OV_FRONTEND)
    # Default the frontend path to the OpenVINO install location
    set(GGML_OV_FRONTEND ${OpenVINO_DIR})
endif()
add_definitions(-DGGML_OV_FRONTEND="${GGML_OV_FRONTEND}")
if (OpenVINO_DIR AND GGML_OPENVINO)
    if (NOT UNIX)
        set(GGML_OPENVINO OFF)
        message(WARNING "OpenVINO: the OpenVINO toolkit supports Unix-like systems but not ${CMAKE_SYSTEM_NAME}. Turning off GGML_OPENVINO")
    elseif (NOT CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|x86_64|amd64)$")
        set(GGML_OPENVINO OFF)
        message(WARNING "OpenVINO: the OpenVINO toolkit supports x86-64 and arm64 but not ${CMAKE_SYSTEM_PROCESSOR}. Turning off GGML_OPENVINO")
    endif()
endif()
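A hypothetical configure line exercising the device selection above (`OPENVINO_DEVICE` maps to the GGML_OPENVINO_GPU/GGML_OPENVINO_NPU compile definitions; pairing it with the ReleaseOV preset is an assumption):
```bash
cmake --preset ReleaseOV -DOPENVINO_DEVICE=GPU
cmake --build build/ReleaseOV
```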

View File

@ -1,9 +1,8 @@
#pragma once
#include <map>
#include "openvino/core/node.hpp"
#include "openvino/frontend/decoder.hpp"
#include <openvino/core/node.hpp>
#include <openvino/frontend/decoder.hpp>
namespace ov {
namespace frontend {
@ -43,11 +42,7 @@ public:
virtual std::string& get_output_name(size_t index) const = 0;
virtual size_t get_output_size() const = 0;
virtual bool is_graph_output(size_t index) const = 0;
virtual std::string& get_output_name(size_t index) const = 0;
virtual std::vector<std::string> get_output_names() const = 0;
virtual const std::string& get_op_type() const = 0;
@ -65,4 +60,4 @@ public:
} // namespace ggml
} // namespace frontend
} // namespace ov
} // namespace ov

View File

@ -354,7 +354,7 @@ std::vector<size_t> GgmlOvDecoder::get_shape(const ggml_tensor* tensor) {
std::vector<size_t> GgmlOvDecoder::get_stride(const ggml_tensor* tensor) {
std::vector<size_t> stride;
for (int i = GGML_MAX_DIMS - 2; i >= 0 ; --i) {
for (int i = GGML_MAX_DIMS - 2; i >= 0; --i) {
stride.push_back(static_cast<size_t>(tensor->nb[i]));
}
return stride;
@ -448,27 +448,16 @@ void GgmlOvDecoder::visit_subgraph(std::function<void(std::shared_ptr<GgmlDecode
const std::string& GgmlOvDecoder::get_op_type() const {
static const std::map<ggml_op, std::string> opTypeMap = {
{GGML_OP_ACC, "GGML_OP_ACC"},
{GGML_OP_ADD, "GGML_OP_ADD"},
{GGML_OP_ADD1, "GGML_OP_ADD1"},
{GGML_OP_CONT, "GGML_OP_CONT"},
{GGML_OP_CPY, "GGML_OP_CPY"},
{GGML_OP_DIV, "GGML_OP_DIV"},
{GGML_OP_DUP, "GGML_OP_DUP"},
{GGML_OP_GET_ROWS, "GGML_OP_GET_ROWS"},
{GGML_OP_MUL, "GGML_OP_MUL"},
{GGML_OP_MUL_MAT, "GGML_OP_MUL_MAT"},
{GGML_OP_PERMUTE, "GGML_OP_PERMUTE"},
{GGML_OP_RESHAPE, "GGML_OP_RESHAPE"},
{GGML_OP_RMS_NORM, "GGML_OP_RMS_NORM"},
{GGML_OP_ROPE, "GGML_OP_ROPE"},
{GGML_OP_SCALE, "GGML_OP_SCALE"},
{GGML_OP_SOFT_MAX, "GGML_OP_SOFT_MAX"},
{GGML_OP_SUB, "GGML_OP_SUB"},
{GGML_OP_TRANSPOSE, "GGML_OP_TRANSPOSE"},
{GGML_OP_UNARY, "GGML_OP_UNARY"},
{GGML_OP_VIEW, "GGML_OP_VIEW"}
};
{GGML_OP_ACC, "GGML_OP_ACC"}, {GGML_OP_ADD, "GGML_OP_ADD"},
{GGML_OP_ADD1, "GGML_OP_ADD1"}, {GGML_OP_CONT, "GGML_OP_CONT"},
{GGML_OP_CPY, "GGML_OP_CPY"}, {GGML_OP_DIV, "GGML_OP_DIV"},
{GGML_OP_DUP, "GGML_OP_DUP"}, {GGML_OP_GET_ROWS, "GGML_OP_GET_ROWS"},
{GGML_OP_MUL, "GGML_OP_MUL"}, {GGML_OP_MUL_MAT, "GGML_OP_MUL_MAT"},
{GGML_OP_PERMUTE, "GGML_OP_PERMUTE"}, {GGML_OP_RESHAPE, "GGML_OP_RESHAPE"},
{GGML_OP_RMS_NORM, "GGML_OP_RMS_NORM"}, {GGML_OP_ROPE, "GGML_OP_ROPE"},
{GGML_OP_SCALE, "GGML_OP_SCALE"}, {GGML_OP_SOFT_MAX, "GGML_OP_SOFT_MAX"},
{GGML_OP_SUB, "GGML_OP_SUB"}, {GGML_OP_TRANSPOSE, "GGML_OP_TRANSPOSE"},
{GGML_OP_UNARY, "GGML_OP_UNARY"}, {GGML_OP_VIEW, "GGML_OP_VIEW"}};
static const std::map<ggml_unary_op, std::string> unaryOpTypeMap = {
{GGML_UNARY_OP_ABS, "GGML_UNARY_OP_ABS"},
{GGML_UNARY_OP_SGN, "GGML_UNARY_OP_SGN"},
@ -484,8 +473,7 @@ const std::string& GgmlOvDecoder::get_op_type() const {
{GGML_UNARY_OP_HARDSWISH, "GGML_UNARY_OP_HARDSWISH"},
{GGML_UNARY_OP_HARDSIGMOID, "GGML_UNARY_OP_HARDSIGMOID"},
{GGML_UNARY_OP_EXP, "GGML_UNARY_OP_EXP"},
{GGML_UNARY_OP_COUNT, "GGML_UNARY_OP_COUNT"}
};
{GGML_UNARY_OP_COUNT, "GGML_UNARY_OP_COUNT"}};
auto it = opTypeMap.find(m_node->op);
if (it != opTypeMap.end()) {
if (it->first == GGML_OP_UNARY) {
@ -498,4 +486,4 @@ const std::string& GgmlOvDecoder::get_op_type() const {
}
static const std::string unknown_op = "UNKNOWN_OP";
return unknown_op;
}
}

View File

@ -53,11 +53,7 @@ public:
virtual std::string& get_output_name(size_t index) const override;
virtual size_t get_output_size() const override;
virtual bool is_graph_output(size_t index) const override;
virtual std::string& get_output_name(size_t index) const override;
virtual std::vector<std::string> get_output_names() const override;
virtual const std::string& get_op_type() const override;
@ -105,10 +101,10 @@ private:
void set_max_token_len();
int64_t m_max_token_len;
struct ggml_cgraph * m_cgraph;
std::map<std::string, ggml_tensor *> m_inputs;
struct ggml_cgraph* m_cgraph;
std::map<std::string, ggml_tensor*> m_inputs;
std::vector<std::string> m_input_names;
std::map<std::string, ggml_tensor *> m_outputs;
std::map<std::string, ggml_tensor*> m_outputs;
std::vector<std::string> m_output_names;
ggml_tensor* m_node;
std::vector<ggml_tensor*> m_nodes;
@ -123,4 +119,4 @@ private:
std::vector<std::string> m_model_output_names;
};
void print_tensor_address_map(const struct ggml_cgraph* cgraph);
void print_tensor_address_map(const struct ggml_cgraph* cgraph);

View File

@ -42,12 +42,7 @@ std::map<std::string, void*> get_ggml_graph_output_dst(std::shared_ptr<GgmlOvDec
static ov::frontend::FrontEnd::Ptr get_ggml_frontend() {
auto fem = ov::frontend::FrontEndManager();
std::string fe_so_path;
#ifdef GGML_OV_FRONTEND
fe_so_path = GGML_OV_FRONTEND;
#endif
fem.register_front_end("ggml", fe_so_path);
front_end = fem.load_by_framework("ggml");
auto front_end = fem.load_by_framework("ggml");
return front_end;
}
@ -204,4 +199,4 @@ void print_output_tensor_info(const std::string& name,
default:
break;
}
}
}

View File

@ -1,5 +1,5 @@
#include "ggml-decoder.h"
#include "ggml-backend-impl.h"
#include "ggml-decoder.h"
enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph);
@ -9,4 +9,4 @@ void print_input_tensor_info(const std::string& name, const ov::Tensor& tensor);
void print_output_tensor_info(const std::string& name,
const ov::Tensor& tensor,
std::map<std::string, void*>& output_dst);
std::map<std::string, void*>& output_dst);