Rebase - Bring up to date and fix build process
This commit is contained in:
parent
a8e5efa44e
commit
ffabe95e2a
|
|
@ -681,6 +681,67 @@ Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/m
|
|||
|
||||
To read documentation for how to build on IBM Z & LinuxONE, [click here](./build-s390x.md)
|
||||
|
||||
## OPENVINO
|
||||
|
||||
### Build openvino-llama
|
||||
|
||||
```bash
|
||||
git lfs install --skip-smudge
|
||||
git clone https://github.com/intel-sandbox/openvino-llama.git -b dev_ggml_frontend
|
||||
cd openvino-llama
|
||||
git submodule update --init --recursive
|
||||
|
||||
export OPENVINO_LLAMA_PATH=$(pwd)
|
||||
|
||||
cmake --preset Release
|
||||
cmake --build build/Release
|
||||
```
|
||||
|
||||
### Build llama.cpp-ov
|
||||
|
||||
```bash
|
||||
git clone https://github.com/intel-sandbox/llama.cpp-ov.git -b dev_backend_openvino
|
||||
cd llama.cpp-ov
|
||||
|
||||
cmake --preset ReleaseOV
|
||||
cmake --build build/ReleaseOV
|
||||
```
|
||||
|
||||
Download the test model file [Phi-3-mini-4k-instruct-fp16.gguf](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf) from the Hugging Face website.
|
||||
```bash
|
||||
wget https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-fp16.gguf?download=true -O ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf
|
||||
```
|
||||
|
||||
Execute the following command to test.
|
||||
```bash
|
||||
export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache
|
||||
# Currently GGML_OPENVINO_WEIGHT_AS_INPUT has better performance
|
||||
export GGML_OPENVINO_WEIGHT_AS_INPUT=1
|
||||
./build/ReleaseOV/bin/llama-simple -m ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf -n 10 "Hello, my name is "
|
||||
```
|
||||
|
||||
Environment variables:
|
||||
- GGML_OPENVINO_WEIGHT_AS_INPUT:
|
||||
Pass the weights as input to the OpenVINO model instead of creating Constant nodes for them.
|
||||
- GGML_OPENVINO_CACHE_DIR:
|
||||
If set, model caching in OpenVINO will be used.
|
||||
- GGML_OPENVINO_DUMP_CGRAPH:
|
||||
Dumps the compute graph to "cgraph.txt". Note that the compute graph is different for every token, so each later cgraph will overwrite the previous one.
|
||||
- GGML_OPENVINO_PROFILING:
|
||||
Print the time taken for each phase in the OpenVINO backend.
|
||||
- GGML_OPENVINO_DUMP_IR:
|
||||
Dump the converted OpenVINO IR. The filenames are timestamps.
|
||||
- GGML_OPENVINO_DEBUG_INPUT
|
||||
- GGML_OPENVINO_DEBUG_OUTPUT
|
||||
|
||||
To use Llama.cpp's built-in CPU backend:
|
||||
```bash
|
||||
cmake --preset ReleaseCPU
|
||||
cmake --build build/ReleaseCPU
|
||||
|
||||
./build/ReleaseCPU/bin/llama-simple -m ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf -n 10 "Hello, my name is "
|
||||
```
|
||||
|
||||
## Notes about GPU-accelerated backends
|
||||
|
||||
The GPU may still be used to accelerate some parts of the computation even when using the `-ngl 0` option. You can fully disable GPU acceleration by using `--device none`.
|
||||
|
|
|
|||
|
|
@ -246,6 +246,10 @@ set (GGML_SYCL_TARGET "INTEL" CACHE STRING
|
|||
set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
|
||||
"ggml: sycl device architecture")
|
||||
|
||||
option(GGML_OPENVINO "ggml: use OPENVINO" OFF)
|
||||
option(GGML_OPENVINO_DEBUG "ggml: enable OPENVINO debugging" OFF)
|
||||
option(GGML_OV_FRONTEND "ggml: OPENVINO frontend path" ON)
|
||||
|
||||
option(GGML_OPENCL "ggml: use OpenCL" OFF)
|
||||
option(GGML_OPENCL_PROFILING "ggml: use OpenCL profiling (increases overhead)" OFF)
|
||||
option(GGML_OPENCL_EMBED_KERNELS "ggml: embed kernels" ON)
|
||||
|
|
@ -324,6 +328,7 @@ set(GGML_PUBLIC_HEADERS
|
|||
include/ggml-vulkan.h
|
||||
include/ggml-webgpu.h
|
||||
include/ggml-zendnn.h
|
||||
include/ggml-openvino.h
|
||||
include/gguf.h)
|
||||
|
||||
set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
|
||||
|
|
|
|||
|
|
@ -458,6 +458,7 @@ ggml_add_backend(zDNN)
|
|||
ggml_add_backend(OpenCL)
|
||||
ggml_add_backend(Hexagon)
|
||||
ggml_add_backend(ZenDNN)
|
||||
ggml_add_backend(OPENVINO)
|
||||
|
||||
foreach (target ggml-base ggml)
|
||||
target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
|
||||
|
|
|
|||
|
|
@ -77,6 +77,10 @@
|
|||
#include "ggml-zendnn.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_OPENVINO
|
||||
#include "ggml-openvino.h"
|
||||
#endif
|
||||
|
||||
// disable C++17 deprecation warning for std::codecvt_utf8
|
||||
#if defined(__clang__)
|
||||
# pragma clang diagnostic push
|
||||
|
|
@ -222,6 +226,9 @@ struct ggml_backend_registry {
|
|||
#ifdef GGML_USE_RPC
|
||||
register_backend(ggml_backend_rpc_reg());
|
||||
#endif
|
||||
#ifdef GGML_USE_OPENVINO
|
||||
register_backend(ggml_backend_openvino_reg());
|
||||
#endif
|
||||
#ifdef GGML_USE_CPU
|
||||
register_backend(ggml_backend_cpu_reg());
|
||||
#endif
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,42 @@
|
|||
find_package(OpenVINO REQUIRED)
|
||||
list(APPEND GGML_EXTRA_LIBS_PRIVATE openvino::runtime)
|
||||
|
||||
# Set header and libs
|
||||
file(GLOB GGML_HEADERS_OPENVINO "ggml-openvino/*.h")
|
||||
list(APPEND GGML_HEADERS_OPENVINO "../include/ggml-openvino.h")
|
||||
file(GLOB GGML_SOURCES_OPENVINO "ggml-openvino/*.cpp")
|
||||
list(APPEND GGML_SOURCES_OPENVINO "ggml-openvino.cpp")
|
||||
|
||||
list(APPEND GGML_CDEF_PUBLIC GGML_USE_OPENVINO)
|
||||
|
||||
if (OPENVINO_DEVICE)
|
||||
if (OPENVINO_DEVICE STREQUAL "GPU")
|
||||
add_compile_definitions(GGML_OPENVINO_GPU)
|
||||
elseif (OPENVINO_DEVICE STREQUAL "NPU")
|
||||
add_compile_definitions(GGML_OPENVINO_NPU)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(NOT DEFINED GGML_OV_FRONTEND)
|
||||
set(GGML_OV_FRONTEND OpenVINO_DIR)
|
||||
endif()
|
||||
add_definitions(-DGGML_OV_FRONTEND="${GGML_OV_FRONTEND}")
|
||||
|
||||
if (OpenVINO_DIR)
|
||||
if (GGML_OPENVINO)
|
||||
if (NOT UNIX)
|
||||
set(GGML_OPENVINO OFF)
|
||||
message(WARNING "OpenVINO: OpenVINO toolkit supports unix but not ${CMAKE_SYSTEM_NAME}. Turning off GGML_OPENVINO")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (GGML_OPENVINO)
|
||||
if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
|
||||
elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "amd64")
|
||||
else()
|
||||
set(GGML_OPENVINO OFF)
|
||||
message(WARNING "OpenVINO: OpenVINO toolkit supports x86-64 and arm64 but not ${CMAKE_SYSTEM_PROCESSOR}. Turning off GGML_OPENVINO")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
endif()
|
||||
|
|
@ -1,9 +1,8 @@
|
|||
#pragma once
|
||||
|
||||
#include <map>
|
||||
|
||||
#include "openvino/core/node.hpp"
|
||||
#include "openvino/frontend/decoder.hpp"
|
||||
#include <openvino/core/node.hpp>
|
||||
#include <openvino/frontend/decoder.hpp>
|
||||
|
||||
namespace ov {
|
||||
namespace frontend {
|
||||
|
|
@ -43,11 +42,7 @@ public:
|
|||
|
||||
virtual std::string& get_output_name(size_t index) const = 0;
|
||||
|
||||
virtual size_t get_output_size() const = 0;
|
||||
|
||||
virtual bool is_graph_output(size_t index) const = 0;
|
||||
|
||||
virtual std::string& get_output_name(size_t index) const = 0;
|
||||
virtual std::vector<std::string> get_output_names() const = 0;
|
||||
|
||||
virtual const std::string& get_op_type() const = 0;
|
||||
|
||||
|
|
@ -65,4 +60,4 @@ public:
|
|||
|
||||
} // namespace ggml
|
||||
} // namespace frontend
|
||||
} // namespace ov
|
||||
} // namespace ov
|
||||
|
|
@ -354,7 +354,7 @@ std::vector<size_t> GgmlOvDecoder::get_shape(const ggml_tensor* tensor) {
|
|||
|
||||
std::vector<size_t> GgmlOvDecoder::get_stride(const ggml_tensor* tensor) {
|
||||
std::vector<size_t> stride;
|
||||
for (int i = GGML_MAX_DIMS - 2; i >= 0 ; --i) {
|
||||
for (int i = GGML_MAX_DIMS - 2; i >= 0; --i) {
|
||||
stride.push_back(static_cast<size_t>(tensor->nb[i]));
|
||||
}
|
||||
return stride;
|
||||
|
|
@ -448,27 +448,16 @@ void GgmlOvDecoder::visit_subgraph(std::function<void(std::shared_ptr<GgmlDecode
|
|||
|
||||
const std::string& GgmlOvDecoder::get_op_type() const {
|
||||
static const std::map<ggml_op, std::string> opTypeMap = {
|
||||
{GGML_OP_ACC, "GGML_OP_ACC"},
|
||||
{GGML_OP_ADD, "GGML_OP_ADD"},
|
||||
{GGML_OP_ADD1, "GGML_OP_ADD1"},
|
||||
{GGML_OP_CONT, "GGML_OP_CONT"},
|
||||
{GGML_OP_CPY, "GGML_OP_CPY"},
|
||||
{GGML_OP_DIV, "GGML_OP_DIV"},
|
||||
{GGML_OP_DUP, "GGML_OP_DUP"},
|
||||
{GGML_OP_GET_ROWS, "GGML_OP_GET_ROWS"},
|
||||
{GGML_OP_MUL, "GGML_OP_MUL"},
|
||||
{GGML_OP_MUL_MAT, "GGML_OP_MUL_MAT"},
|
||||
{GGML_OP_PERMUTE, "GGML_OP_PERMUTE"},
|
||||
{GGML_OP_RESHAPE, "GGML_OP_RESHAPE"},
|
||||
{GGML_OP_RMS_NORM, "GGML_OP_RMS_NORM"},
|
||||
{GGML_OP_ROPE, "GGML_OP_ROPE"},
|
||||
{GGML_OP_SCALE, "GGML_OP_SCALE"},
|
||||
{GGML_OP_SOFT_MAX, "GGML_OP_SOFT_MAX"},
|
||||
{GGML_OP_SUB, "GGML_OP_SUB"},
|
||||
{GGML_OP_TRANSPOSE, "GGML_OP_TRANSPOSE"},
|
||||
{GGML_OP_UNARY, "GGML_OP_UNARY"},
|
||||
{GGML_OP_VIEW, "GGML_OP_VIEW"}
|
||||
};
|
||||
{GGML_OP_ACC, "GGML_OP_ACC"}, {GGML_OP_ADD, "GGML_OP_ADD"},
|
||||
{GGML_OP_ADD1, "GGML_OP_ADD1"}, {GGML_OP_CONT, "GGML_OP_CONT"},
|
||||
{GGML_OP_CPY, "GGML_OP_CPY"}, {GGML_OP_DIV, "GGML_OP_DIV"},
|
||||
{GGML_OP_DUP, "GGML_OP_DUP"}, {GGML_OP_GET_ROWS, "GGML_OP_GET_ROWS"},
|
||||
{GGML_OP_MUL, "GGML_OP_MUL"}, {GGML_OP_MUL_MAT, "GGML_OP_MUL_MAT"},
|
||||
{GGML_OP_PERMUTE, "GGML_OP_PERMUTE"}, {GGML_OP_RESHAPE, "GGML_OP_RESHAPE"},
|
||||
{GGML_OP_RMS_NORM, "GGML_OP_RMS_NORM"}, {GGML_OP_ROPE, "GGML_OP_ROPE"},
|
||||
{GGML_OP_SCALE, "GGML_OP_SCALE"}, {GGML_OP_SOFT_MAX, "GGML_OP_SOFT_MAX"},
|
||||
{GGML_OP_SUB, "GGML_OP_SUB"}, {GGML_OP_TRANSPOSE, "GGML_OP_TRANSPOSE"},
|
||||
{GGML_OP_UNARY, "GGML_OP_UNARY"}, {GGML_OP_VIEW, "GGML_OP_VIEW"}};
|
||||
static const std::map<ggml_unary_op, std::string> unaryOpTypeMap = {
|
||||
{GGML_UNARY_OP_ABS, "GGML_UNARY_OP_ABS"},
|
||||
{GGML_UNARY_OP_SGN, "GGML_UNARY_OP_SGN"},
|
||||
|
|
@ -484,8 +473,7 @@ const std::string& GgmlOvDecoder::get_op_type() const {
|
|||
{GGML_UNARY_OP_HARDSWISH, "GGML_UNARY_OP_HARDSWISH"},
|
||||
{GGML_UNARY_OP_HARDSIGMOID, "GGML_UNARY_OP_HARDSIGMOID"},
|
||||
{GGML_UNARY_OP_EXP, "GGML_UNARY_OP_EXP"},
|
||||
{GGML_UNARY_OP_COUNT, "GGML_UNARY_OP_COUNT"}
|
||||
};
|
||||
{GGML_UNARY_OP_COUNT, "GGML_UNARY_OP_COUNT"}};
|
||||
auto it = opTypeMap.find(m_node->op);
|
||||
if (it != opTypeMap.end()) {
|
||||
if (it->first == GGML_OP_UNARY) {
|
||||
|
|
@ -498,4 +486,4 @@ const std::string& GgmlOvDecoder::get_op_type() const {
|
|||
}
|
||||
static const std::string unknown_op = "UNKNOWN_OP";
|
||||
return unknown_op;
|
||||
}
|
||||
}
|
||||
|
|
@ -53,11 +53,7 @@ public:
|
|||
|
||||
virtual std::string& get_output_name(size_t index) const override;
|
||||
|
||||
virtual size_t get_output_size() const override;
|
||||
|
||||
virtual bool is_graph_output(size_t index) const override;
|
||||
|
||||
virtual std::string& get_output_name(size_t index) const override;
|
||||
virtual std::vector<std::string> get_output_names() const override;
|
||||
|
||||
virtual const std::string& get_op_type() const override;
|
||||
|
||||
|
|
@ -105,10 +101,10 @@ private:
|
|||
void set_max_token_len();
|
||||
int64_t m_max_token_len;
|
||||
|
||||
struct ggml_cgraph * m_cgraph;
|
||||
std::map<std::string, ggml_tensor *> m_inputs;
|
||||
struct ggml_cgraph* m_cgraph;
|
||||
std::map<std::string, ggml_tensor*> m_inputs;
|
||||
std::vector<std::string> m_input_names;
|
||||
std::map<std::string, ggml_tensor *> m_outputs;
|
||||
std::map<std::string, ggml_tensor*> m_outputs;
|
||||
std::vector<std::string> m_output_names;
|
||||
ggml_tensor* m_node;
|
||||
std::vector<ggml_tensor*> m_nodes;
|
||||
|
|
@ -123,4 +119,4 @@ private:
|
|||
std::vector<std::string> m_model_output_names;
|
||||
};
|
||||
|
||||
void print_tensor_address_map(const struct ggml_cgraph* cgraph);
|
||||
void print_tensor_address_map(const struct ggml_cgraph* cgraph);
|
||||
|
|
@ -42,12 +42,7 @@ std::map<std::string, void*> get_ggml_graph_output_dst(std::shared_ptr<GgmlOvDec
|
|||
|
||||
static ov::frontend::FrontEnd::Ptr get_ggml_frontend() {
|
||||
auto fem = ov::frontend::FrontEndManager();
|
||||
std::string fe_so_path;
|
||||
#ifdef GGML_OV_FRONTEND
|
||||
fe_so_path = GGML_OV_FRONTEND;
|
||||
#endif
|
||||
fem.register_front_end("ggml", fe_so_path);
|
||||
front_end = fem.load_by_framework("ggml");
|
||||
auto front_end = fem.load_by_framework("ggml");
|
||||
return front_end;
|
||||
}
|
||||
|
||||
|
|
@ -204,4 +199,4 @@ void print_output_tensor_info(const std::string& name,
|
|||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
#include "ggml-decoder.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-decoder.h"
|
||||
|
||||
enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph);
|
||||
|
||||
|
|
@ -9,4 +9,4 @@ void print_input_tensor_info(const std::string& name, const ov::Tensor& tensor);
|
|||
|
||||
void print_output_tensor_info(const std::string& name,
|
||||
const ov::Tensor& tensor,
|
||||
std::map<std::string, void*>& output_dst);
|
||||
std::map<std::string, void*>& output_dst);
|
||||
Loading…
Reference in New Issue